summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.mailmap2
-rw-r--r--Documentation/ABI/testing/sysfs-fs-f2fs5
-rw-r--r--Documentation/ABI/testing/sysfs-platform-dell-laptop8
-rw-r--r--Documentation/admin-guide/binderfs.rst6
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt29
-rw-r--r--Documentation/admin-guide/mm/transhuge.rst14
-rw-r--r--Documentation/admin-guide/mm/userfaultfd.rst51
-rw-r--r--Documentation/admin-guide/sysctl/kernel.rst21
-rw-r--r--Documentation/admin-guide/sysrq.rst20
-rw-r--r--Documentation/devicetree/bindings/arm/freescale/fsl,scu.txt16
-rw-r--r--Documentation/devicetree/bindings/display/panel/panel-dpi.yaml10
-rw-r--r--Documentation/devicetree/bindings/display/ti/ti,am65x-dss.yaml4
-rw-r--r--Documentation/devicetree/bindings/display/ti/ti,j721e-dss.yaml4
-rw-r--r--Documentation/devicetree/bindings/display/ti/ti,k2g-dss.yaml4
-rw-r--r--Documentation/devicetree/bindings/fpga/fpga-region.txt5
-rw-r--r--Documentation/devicetree/bindings/input/iqs62x-keys.yaml132
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/edt-ft5x06.txt77
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/edt-ft5x06.yaml125
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/goodix.yaml2
-rw-r--r--Documentation/devicetree/bindings/ipmi/aspeed-kcs-bmc.txt20
-rw-r--r--Documentation/devicetree/bindings/mfd/iqs62x.yaml179
-rw-r--r--Documentation/devicetree/bindings/mfd/rn5t618.txt4
-rw-r--r--Documentation/devicetree/bindings/mfd/rohm,bd71837-pmic.txt90
-rw-r--r--Documentation/devicetree/bindings/mfd/rohm,bd71837-pmic.yaml236
-rw-r--r--Documentation/devicetree/bindings/mfd/rohm,bd71847-pmic.yaml222
-rw-r--r--Documentation/devicetree/bindings/mfd/st,stm32-lptimer.yaml16
-rw-r--r--Documentation/devicetree/bindings/net/marvell,mvusb.yaml29
-rw-r--r--Documentation/devicetree/bindings/pwm/google,cros-ec-pwm.txt23
-rw-r--r--Documentation/devicetree/bindings/pwm/google,cros-ec-pwm.yaml40
-rw-r--r--Documentation/devicetree/bindings/pwm/iqs620a-pwm.yaml32
-rw-r--r--Documentation/devicetree/bindings/pwm/nvidia,tegra20-pwm.txt1
-rw-r--r--Documentation/devicetree/bindings/regulator/rohm,bd71837-regulator.txt162
-rw-r--r--Documentation/devicetree/bindings/regulator/rohm,bd71837-regulator.yaml103
-rw-r--r--Documentation/devicetree/bindings/regulator/rohm,bd71847-regulator.yaml97
-rw-r--r--Documentation/devicetree/bindings/sound/cirrus,cs42l51.yaml2
-rw-r--r--Documentation/devicetree/bindings/thermal/imx8mm-thermal.txt15
-rw-r--r--Documentation/devicetree/bindings/thermal/qcom-tsens.yaml2
-rw-r--r--Documentation/devicetree/bindings/thermal/rcar-gen3-thermal.txt1
-rw-r--r--Documentation/devicetree/bindings/thermal/sprd-thermal.yaml107
-rw-r--r--Documentation/devicetree/bindings/thermal/thermal.txt2
-rw-r--r--Documentation/devicetree/bindings/watchdog/ti,rti-wdt.yaml65
-rw-r--r--Documentation/driver-api/thermal/cpu-idle-cooling.rst4
-rw-r--r--Documentation/driver-api/usb/writing_usb_driver.rst7
-rw-r--r--Documentation/driver-api/w1.rst3
-rw-r--r--Documentation/filesystems/9p.rst10
-rw-r--r--Documentation/filesystems/ceph.rst6
-rw-r--r--Documentation/filesystems/f2fs.rst4
-rw-r--r--Documentation/filesystems/orangefs.rst34
-rw-r--r--Documentation/filesystems/overlayfs.rst82
-rw-r--r--Documentation/filesystems/qnx6.rst2
-rw-r--r--Documentation/firmware-guide/acpi/namespace.rst6
-rw-r--r--Documentation/kbuild/kbuild.rst5
-rw-r--r--Documentation/kbuild/llvm.rst15
-rw-r--r--Documentation/openrisc/openrisc_port.rst4
-rw-r--r--Documentation/process/changes.rst4
-rw-r--r--Documentation/vm/free_page_reporting.rst40
-rw-r--r--Documentation/vm/zswap.rst20
-rw-r--r--MAINTAINERS60
-rw-r--r--Makefile40
-rw-r--r--arch/alpha/include/asm/mmzone.h2
-rw-r--r--arch/alpha/include/asm/page.h3
-rw-r--r--arch/alpha/include/asm/pgtable.h2
-rw-r--r--arch/alpha/kernel/syscalls/syscallhdr.sh2
-rw-r--r--arch/arc/include/asm/page.h2
-rw-r--r--arch/arm/include/asm/page.h4
-rw-r--r--arch/arm/include/asm/pgtable-2level.h2
-rw-r--r--arch/arm/include/asm/pgtable.h15
-rw-r--r--arch/arm/mach-omap2/omap-secure.c2
-rw-r--r--arch/arm/mach-omap2/omap-secure.h2
-rw-r--r--arch/arm/mach-omap2/omap-smc.S2
-rw-r--r--arch/arm/mach-pxa/cm-x300.c1
-rw-r--r--arch/arm/mach-pxa/colibri-pxa270-income.c1
-rw-r--r--arch/arm/mach-pxa/corgi.c12
-rw-r--r--arch/arm/mach-pxa/ezx.c1
-rw-r--r--arch/arm/mach-pxa/hx4700.c1
-rw-r--r--arch/arm/mach-pxa/lpd270.c1
-rw-r--r--arch/arm/mach-pxa/magician.c1
-rw-r--r--arch/arm/mach-pxa/mainstone.c1
-rw-r--r--arch/arm/mach-pxa/mioa701.c1
-rw-r--r--arch/arm/mach-pxa/palm27x.c1
-rw-r--r--arch/arm/mach-pxa/palmtc.c11
-rw-r--r--arch/arm/mach-pxa/palmte2.c1
-rw-r--r--arch/arm/mach-pxa/pcm990-baseboard.c1
-rw-r--r--arch/arm/mach-pxa/spitz.c34
-rw-r--r--arch/arm/mach-pxa/tavorevb.c2
-rw-r--r--arch/arm/mach-pxa/viper.c1
-rw-r--r--arch/arm/mach-pxa/z2.c2
-rw-r--r--arch/arm/mach-pxa/zylonite.c1
-rw-r--r--arch/arm/mach-s3c24xx/mach-h1940.c1
-rw-r--r--arch/arm/mach-s3c24xx/mach-rx1950.c1
-rw-r--r--arch/arm/mach-s3c64xx/dev-backlight.c3
-rw-r--r--arch/arm/mach-s3c64xx/mach-crag6410.c1
-rw-r--r--arch/arm/mach-s3c64xx/mach-hmt.c1
-rw-r--r--arch/arm/mach-s3c64xx/mach-smartq.c1
-rw-r--r--arch/arm/mach-s3c64xx/mach-smdk6410.c2
-rw-r--r--arch/arm/mm/fault.c2
-rw-r--r--arch/arm/mm/mmu.c14
-rw-r--r--arch/arm64/Kconfig5
-rw-r--r--arch/arm64/Kconfig.debug13
-rw-r--r--arch/arm64/Makefile7
-rw-r--r--arch/arm64/include/asm/memory.h12
-rw-r--r--arch/arm64/include/asm/page.h4
-rw-r--r--arch/arm64/kernel/armv8_deprecated.c2
-rw-r--r--arch/arm64/kvm/Kconfig2
-rw-r--r--arch/arm64/mm/fault.c2
-rw-r--r--arch/arm64/mm/init.c6
-rw-r--r--arch/arm64/mm/mmu.c7
-rw-r--r--arch/c6x/include/asm/page.h5
-rw-r--r--arch/csky/include/asm/page.h3
-rw-r--r--arch/csky/include/asm/pgtable.h3
-rw-r--r--arch/csky/mm/fault.c2
-rw-r--r--arch/h8300/include/asm/page.h2
-rw-r--r--arch/h8300/include/uapi/asm/bitsperlong.h15
-rw-r--r--arch/h8300/include/uapi/asm/posix_types.h13
-rw-r--r--arch/hexagon/include/asm/page.h3
-rw-r--r--arch/hexagon/include/asm/pgtable.h2
-rw-r--r--arch/ia64/include/asm/page.h5
-rw-r--r--arch/ia64/include/asm/pgtable.h2
-rw-r--r--arch/ia64/kernel/syscalls/syscallhdr.sh2
-rw-r--r--arch/ia64/kernel/vmlinux.lds.S2
-rw-r--r--arch/ia64/mm/init.c7
-rw-r--r--arch/m68k/68000/timers.c16
-rw-r--r--arch/m68k/coldfire/pit.c16
-rw-r--r--arch/m68k/coldfire/sltimers.c29
-rw-r--r--arch/m68k/coldfire/timers.c31
-rw-r--r--arch/m68k/include/asm/mcf_pgtable.h10
-rw-r--r--arch/m68k/include/asm/motorola_pgtable.h2
-rw-r--r--arch/m68k/include/asm/page.h3
-rw-r--r--arch/m68k/include/asm/sun3_pgtable.h2
-rw-r--r--arch/m68k/mm/fault.c2
-rw-r--r--arch/microblaze/include/asm/page.h2
-rw-r--r--arch/microblaze/include/asm/pgtable.h4
-rw-r--r--arch/microblaze/kernel/syscalls/syscallhdr.sh2
-rw-r--r--arch/mips/fw/arc/memory.c4
-rw-r--r--arch/mips/include/asm/page.h5
-rw-r--r--arch/mips/include/asm/pgtable.h44
-rw-r--r--arch/mips/kernel/syscalls/syscallhdr.sh3
-rw-r--r--arch/mips/kvm/Kconfig2
-rw-r--r--arch/mips/mm/fault.c2
-rw-r--r--arch/nds32/include/asm/page.h3
-rw-r--r--arch/nds32/include/asm/pgtable.h9
-rw-r--r--arch/nds32/kernel/vmlinux.lds.S1
-rw-r--r--arch/nds32/mm/fault.c2
-rw-r--r--arch/nios2/Kconfig1
-rw-r--r--arch/nios2/boot/dts/10m50_devboard.dts6
-rw-r--r--arch/nios2/include/asm/page.h3
-rw-r--r--arch/nios2/include/asm/pgtable.h3
-rw-r--r--arch/nios2/platform/platform.c8
-rw-r--r--arch/openrisc/Kconfig1
-rw-r--r--arch/openrisc/configs/or1ksim_defconfig1
-rw-r--r--arch/openrisc/configs/simple_smp_defconfig1
-rw-r--r--arch/openrisc/include/asm/page.h5
-rw-r--r--arch/openrisc/include/asm/pgtable.h2
-rw-r--r--arch/openrisc/include/uapi/asm/unistd.h1
-rw-r--r--arch/openrisc/kernel/process.c18
-rw-r--r--arch/openrisc/kernel/smp.c3
-rw-r--r--arch/openrisc/kernel/traps.c7
-rw-r--r--arch/parisc/include/asm/page.h3
-rw-r--r--arch/parisc/include/asm/pgtable.h2
-rw-r--r--arch/parisc/include/asm/spinlock.h160
-rw-r--r--arch/parisc/include/asm/spinlock_types.h14
-rw-r--r--arch/parisc/kernel/alternative.c37
-rw-r--r--arch/parisc/kernel/irq.c22
-rw-r--r--arch/parisc/kernel/syscall.S2
-rw-r--r--arch/parisc/kernel/syscalls/syscallhdr.sh2
-rw-r--r--arch/parisc/kernel/syscalls/syscalltbl.sh4
-rw-r--r--arch/powerpc/Kconfig6
-rw-r--r--arch/powerpc/configs/ps3_defconfig2
-rw-r--r--arch/powerpc/include/asm/book3s/64/hash.h3
-rw-r--r--arch/powerpc/include/asm/book3s/64/radix.h3
-rw-r--r--arch/powerpc/include/asm/page.h9
-rw-r--r--arch/powerpc/include/asm/page_64.h7
-rw-r--r--arch/powerpc/include/asm/sparsemem.h3
-rw-r--r--arch/powerpc/include/asm/thread_info.h4
-rw-r--r--arch/powerpc/include/asm/unistd.h1
-rw-r--r--arch/powerpc/kernel/Makefile5
-rw-r--r--arch/powerpc/kernel/entry_64.S2
-rw-r--r--arch/powerpc/kernel/exceptions-64s.S19
-rw-r--r--arch/powerpc/kernel/irq.c13
-rw-r--r--arch/powerpc/kernel/ppc_save_regs.S6
-rw-r--r--arch/powerpc/kernel/ptrace/Makefile2
-rw-r--r--arch/powerpc/kernel/signal.c144
-rw-r--r--arch/powerpc/kernel/signal_32.c140
-rw-r--r--arch/powerpc/kernel/syscall_64.c6
-rw-r--r--arch/powerpc/kernel/syscalls/syscallhdr.sh3
-rw-r--r--arch/powerpc/kernel/time.c48
-rw-r--r--arch/powerpc/kernel/vdso.c3
-rw-r--r--arch/powerpc/kvm/Kconfig2
-rw-r--r--arch/powerpc/kvm/e500_mmu_host.c2
-rw-r--r--arch/powerpc/mm/book3s64/hash_utils.c5
-rw-r--r--arch/powerpc/mm/book3s64/pgtable.c7
-rw-r--r--arch/powerpc/mm/book3s64/pkeys.c2
-rw-r--r--arch/powerpc/mm/book3s64/radix_pgtable.c18
-rw-r--r--arch/powerpc/mm/fault.c2
-rw-r--r--arch/powerpc/mm/ioremap.c21
-rw-r--r--arch/powerpc/mm/mem.c10
-rw-r--r--arch/powerpc/perf/Makefile5
-rw-r--r--arch/powerpc/perf/callchain.c356
-rw-r--r--arch/powerpc/perf/callchain.h19
-rw-r--r--arch/powerpc/perf/callchain_32.c196
-rw-r--r--arch/powerpc/perf/callchain_64.c174
-rw-r--r--arch/powerpc/perf/imc-pmu.c173
-rw-r--r--arch/powerpc/platforms/powernv/memtrace.c14
-rw-r--r--arch/powerpc/platforms/powernv/opal-imc.c9
-rw-r--r--arch/powerpc/platforms/ps3/os-area.c4
-rw-r--r--arch/powerpc/platforms/pseries/iommu.c9
-rw-r--r--arch/powerpc/platforms/pseries/papr_scm.c27
-rw-r--r--arch/powerpc/platforms/pseries/ras.c11
-rw-r--r--arch/riscv/Kconfig26
-rw-r--r--arch/riscv/Kconfig.socs10
-rw-r--r--arch/riscv/Makefile6
-rw-r--r--arch/riscv/boot/Makefile3
-rw-r--r--arch/riscv/boot/dts/Makefile1
-rw-r--r--arch/riscv/boot/dts/kendryte/Makefile2
-rw-r--r--arch/riscv/boot/dts/kendryte/k210.dts23
-rw-r--r--arch/riscv/boot/dts/kendryte/k210.dtsi123
-rw-r--r--arch/riscv/configs/defconfig1
-rw-r--r--arch/riscv/configs/nommu_k210_defconfig68
-rw-r--r--arch/riscv/configs/rv32_defconfig1
-rw-r--r--arch/riscv/include/asm/bug.h8
-rw-r--r--arch/riscv/include/asm/cacheflush.h2
-rw-r--r--arch/riscv/include/asm/cpu_ops.h46
-rw-r--r--arch/riscv/include/asm/current.h5
-rw-r--r--arch/riscv/include/asm/fixmap.h2
-rw-r--r--arch/riscv/include/asm/kasan.h2
-rw-r--r--arch/riscv/include/asm/page.h3
-rw-r--r--arch/riscv/include/asm/patch.h12
-rw-r--r--arch/riscv/include/asm/pgtable.h10
-rw-r--r--arch/riscv/include/asm/ptdump.h11
-rw-r--r--arch/riscv/include/asm/sbi.h195
-rw-r--r--arch/riscv/include/asm/set_memory.h48
-rw-r--r--arch/riscv/include/asm/smp.h24
-rw-r--r--arch/riscv/include/asm/soc.h23
-rw-r--r--arch/riscv/kernel/Makefile13
-rw-r--r--arch/riscv/kernel/cpu-hotplug.c87
-rw-r--r--arch/riscv/kernel/cpu_ops.c46
-rw-r--r--arch/riscv/kernel/cpu_ops_sbi.c115
-rw-r--r--arch/riscv/kernel/cpu_ops_spinwait.c43
-rw-r--r--arch/riscv/kernel/entry.S143
-rw-r--r--arch/riscv/kernel/ftrace.c13
-rw-r--r--arch/riscv/kernel/head.S180
-rw-r--r--arch/riscv/kernel/patch.c120
-rw-r--r--arch/riscv/kernel/process.c5
-rw-r--r--arch/riscv/kernel/sbi.c575
-rw-r--r--arch/riscv/kernel/setup.c32
-rw-r--r--arch/riscv/kernel/smpboot.c53
-rw-r--r--arch/riscv/kernel/soc.c28
-rw-r--r--arch/riscv/kernel/stacktrace.c7
-rw-r--r--arch/riscv/kernel/traps.c32
-rw-r--r--arch/riscv/kernel/traps_misaligned.c370
-rw-r--r--arch/riscv/kernel/vmlinux.lds.S23
-rw-r--r--arch/riscv/lib/uaccess.S6
-rw-r--r--arch/riscv/mm/Makefile3
-rw-r--r--arch/riscv/mm/hugetlbpage.c6
-rw-r--r--arch/riscv/mm/init.c44
-rw-r--r--arch/riscv/mm/pageattr.c187
-rw-r--r--arch/riscv/mm/ptdump.c317
-rw-r--r--arch/s390/include/asm/page.h3
-rw-r--r--arch/s390/include/asm/qdio.h16
-rw-r--r--arch/s390/kvm/Kconfig4
-rw-r--r--arch/s390/kvm/vsie.c1
-rw-r--r--arch/s390/mm/fault.c4
-rw-r--r--arch/s390/mm/gmap.c7
-rw-r--r--arch/s390/mm/init.c9
-rw-r--r--arch/sh/include/asm/bitops-op32.h8
-rw-r--r--arch/sh/include/asm/page.h3
-rw-r--r--arch/sh/include/uapi/asm/setup.h2
-rw-r--r--arch/sh/include/uapi/asm/types.h2
-rw-r--r--arch/sh/kernel/syscalls/syscallhdr.sh2
-rw-r--r--arch/sh/mm/fault.c2
-rw-r--r--arch/sh/mm/init.c7
-rw-r--r--arch/sparc/include/asm/dma-mapping.h15
-rw-r--r--arch/sparc/include/asm/page_32.h3
-rw-r--r--arch/sparc/include/asm/page_64.h3
-rw-r--r--arch/sparc/include/asm/pgtable_32.h7
-rw-r--r--arch/sparc/include/asm/pgtable_64.h10
-rw-r--r--arch/sparc/kernel/ioport.c3
-rw-r--r--arch/sparc/kernel/of_device_common.c1
-rw-r--r--arch/sparc/kernel/syscalls/syscallhdr.sh2
-rw-r--r--arch/sparc/mm/io-unit.c9
-rw-r--r--arch/sparc/mm/iommu.c15
-rw-r--r--arch/sparc/mm/mm_32.h3
-rw-r--r--arch/sparc/mm/srmmu.c4
-rw-r--r--arch/sparc/vdso/vdso32/vclock_gettime.c4
-rw-r--r--arch/um/Kconfig9
-rw-r--r--arch/um/configs/i386_defconfig2
-rw-r--r--arch/um/configs/x86_64_defconfig2
-rw-r--r--arch/um/drivers/Kconfig3
-rw-r--r--arch/um/drivers/net_kern.c13
-rw-r--r--arch/um/drivers/ubd_kern.c12
-rw-r--r--arch/um/drivers/vector_kern.c5
-rw-r--r--arch/um/drivers/vector_user.c15
-rw-r--r--arch/um/drivers/vhost_user.h12
-rw-r--r--arch/um/drivers/virtio_uml.c153
-rw-r--r--arch/um/include/asm/Kbuild1
-rw-r--r--arch/um/include/asm/delay.h30
-rw-r--r--arch/um/include/asm/pgtable.h10
-rw-r--r--arch/um/include/linux/time-internal.h84
-rw-r--r--arch/um/include/shared/os.h1
-rw-r--r--arch/um/include/shared/timer-internal.h76
-rw-r--r--arch/um/kernel/kmsg_dump.c9
-rw-r--r--arch/um/kernel/process.c39
-rw-r--r--arch/um/kernel/skas/syscall.c5
-rw-r--r--arch/um/kernel/time.c538
-rw-r--r--arch/um/kernel/uml.lds.S2
-rw-r--r--arch/um/os-Linux/file.c31
-rw-r--r--arch/um/os-Linux/time.c1
-rw-r--r--arch/um/os-Linux/umid.c5
-rw-r--r--arch/unicore32/include/asm/page.h3
-rw-r--r--arch/unicore32/include/asm/pgtable.h3
-rw-r--r--arch/unicore32/kernel/puv3-nb0916.c1
-rw-r--r--arch/unicore32/mm/fault.c2
-rw-r--r--arch/x86/Kconfig4
-rw-r--r--arch/x86/Kconfig.assembler17
-rw-r--r--arch/x86/Makefile22
-rw-r--r--arch/x86/configs/i386_defconfig1
-rw-r--r--arch/x86/configs/x86_64_defconfig1
-rw-r--r--arch/x86/crypto/Makefile162
-rw-r--r--arch/x86/crypto/aesni-intel_avx-x86_64.S6
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c21
-rw-r--r--arch/x86/crypto/blake2s-core.S2
-rw-r--r--arch/x86/crypto/chacha_glue.c6
-rw-r--r--arch/x86/crypto/poly1305-x86_64-cryptogams.pl16
-rw-r--r--arch/x86/crypto/poly1305_glue.c11
-rw-r--r--arch/x86/crypto/sha1_ssse3_asm.S4
-rw-r--r--arch/x86/crypto/sha1_ssse3_glue.c13
-rw-r--r--arch/x86/crypto/sha256-avx-asm.S3
-rw-r--r--arch/x86/crypto/sha256-avx2-asm.S3
-rw-r--r--arch/x86/crypto/sha256_ssse3_glue.c12
-rw-r--r--arch/x86/crypto/sha512-avx-asm.S2
-rw-r--r--arch/x86/crypto/sha512-avx2-asm.S3
-rw-r--r--arch/x86/crypto/sha512_ssse3_glue.c10
-rw-r--r--arch/x86/entry/vdso/vdso32/vclock_gettime.c4
-rw-r--r--arch/x86/events/intel/uncore.c8
-rw-r--r--arch/x86/events/intel/uncore.h3
-rw-r--r--arch/x86/events/intel/uncore_snbep.c511
-rw-r--r--arch/x86/include/asm/dwarf2.h44
-rw-r--r--arch/x86/include/asm/page_types.h7
-rw-r--r--arch/x86/include/asm/pgtable.h73
-rw-r--r--arch/x86/include/asm/pgtable_64.h8
-rw-r--r--arch/x86/include/asm/pgtable_types.h12
-rw-r--r--arch/x86/include/asm/set_memory.h1
-rw-r--r--arch/x86/include/asm/xor_avx.h9
-rw-r--r--arch/x86/kernel/acpi/boot.c2
-rw-r--r--arch/x86/kernel/amd_gart_64.c3
-rw-r--r--arch/x86/kernel/setup.c4
-rw-r--r--arch/x86/kvm/Kconfig4
-rw-r--r--arch/x86/kvm/Makefile2
-rw-r--r--arch/x86/kvm/lapic.c3
-rw-r--r--arch/x86/kvm/lapic.h3
-rw-r--r--arch/x86/kvm/svm/avic.c1027
-rw-r--r--arch/x86/kvm/svm/nested.c823
-rw-r--r--arch/x86/kvm/svm/pmu.c (renamed from arch/x86/kvm/pmu_amd.c)0
-rw-r--r--arch/x86/kvm/svm/sev.c1187
-rw-r--r--arch/x86/kvm/svm/svm.c (renamed from arch/x86/kvm/svm.c)3526
-rw-r--r--arch/x86/kvm/svm/svm.h491
-rw-r--r--arch/x86/kvm/svm/vmenter.S162
-rw-r--r--arch/x86/kvm/vmx/nested.c3
-rw-r--r--arch/x86/kvm/vmx/vmenter.S8
-rw-r--r--arch/x86/kvm/vmx/vmx.c12
-rw-r--r--arch/x86/kvm/x86.c3
-rw-r--r--arch/x86/mm/fault.c2
-rw-r--r--arch/x86/mm/init.c9
-rw-r--r--arch/x86/mm/init_32.c19
-rw-r--r--arch/x86/mm/init_64.c40
-rw-r--r--arch/x86/mm/mm_internal.h3
-rw-r--r--arch/x86/mm/numa.c67
-rw-r--r--arch/x86/mm/pat/set_memory.c13
-rw-r--r--arch/x86/mm/pkeys.c2
-rw-r--r--arch/x86/platform/uv/bios_uv.c3
-rw-r--r--arch/x86/um/asm/processor.h12
-rw-r--r--arch/x86/um/asm/vm-flags.h10
-rw-r--r--arch/x86/xen/setup.c2
-rw-r--r--arch/x86/xen/xen-head.S2
-rw-r--r--arch/xtensa/Kconfig2
-rw-r--r--arch/xtensa/boot/Makefile1
-rw-r--r--arch/xtensa/include/asm/page.h3
-rw-r--r--arch/xtensa/include/asm/pgtable.h3
-rw-r--r--arch/xtensa/kernel/smp.c8
-rw-r--r--arch/xtensa/kernel/syscalls/syscallhdr.sh2
-rw-r--r--arch/xtensa/kernel/time.c12
-rw-r--r--block/blk-cgroup.c22
-rw-r--r--block/blk-mq.c9
-rw-r--r--block/partitions/core.c2
-rw-r--r--drivers/Kconfig4
-rw-r--r--drivers/Makefile1
-rw-r--r--drivers/acpi/arm64/iort.c6
-rw-r--r--drivers/acpi/ec.c4
-rw-r--r--drivers/acpi/nfit/core.c14
-rw-r--r--drivers/acpi/nfit/nfit.h13
-rw-r--r--drivers/acpi/numa/srat.c41
-rw-r--r--drivers/ata/ahci.c29
-rw-r--r--drivers/ata/ahci.h1
-rw-r--r--drivers/ata/ahci_imx.c2
-rw-r--r--drivers/ata/libata-pmp.c1
-rw-r--r--drivers/base/memory.c130
-rw-r--r--drivers/block/loop.c49
-rw-r--r--drivers/block/rbd.c215
-rw-r--r--drivers/block/xen-blkfront.c17
-rw-r--r--drivers/char/hw_random/omap3-rom-rng.c4
-rw-r--r--drivers/char/ipmi/ipmi_msghandler.c18
-rw-r--r--drivers/char/ipmi/ipmi_ssif.c2
-rw-r--r--drivers/char/ipmi/kcs_bmc_aspeed.c151
-rw-r--r--drivers/cpuidle/cpuidle-haltpoll.c4
-rw-r--r--drivers/crypto/chelsio/chcr_ktls.c1
-rw-r--r--drivers/crypto/hisilicon/Kconfig2
-rw-r--r--drivers/crypto/marvell/octeontx/otx_cptvf_algs.c8
-rw-r--r--drivers/dax/bus.c4
-rw-r--r--drivers/dax/super.c28
-rw-r--r--drivers/dma-buf/Kconfig11
-rw-r--r--drivers/dma/tegra20-apb-dma.c1
-rw-r--r--drivers/firmware/edd.c6
-rw-r--r--drivers/firmware/efi/libstub/arm64-stub.c8
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c5
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c3
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c6
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c20
-rw-r--r--drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c27
-rw-r--r--drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c4
-rw-r--r--drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c6
-rw-r--r--drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c13
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device.c4
-rw-r--r--drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c29
-rw-r--r--drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.c14
-rw-r--r--drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c8
-rw-r--r--drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c2
-rw-r--r--drivers/gpu/drm/amd/display/dc/core/dc.c53
-rw-r--r--drivers/gpu/drm/amd/display/dc/core/dc_resource.c1
-rw-r--r--drivers/gpu/drm/amd/display/dc/dc.h3
-rw-r--r--drivers/gpu/drm/amd/display/dc/dc_hw_types.h2
-rw-r--r--drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c17
-rw-r--r--drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c46
-rw-r--r--drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.c18
-rw-r--r--drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.h3
-rw-r--r--drivers/gpu/drm/amd/display/dc/dcn10/dcn10_resource.c5
-rw-r--r--drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c1
-rw-r--r--drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c5
-rw-r--r--drivers/gpu/drm/amd/display/dc/dcn21/dcn21_resource.c40
-rw-r--r--drivers/gpu/drm/amd/display/dc/dml/dc_features.h2
-rw-r--r--drivers/gpu/drm/amd/display/dc/dml/display_mode_structs.h7
-rw-r--r--drivers/gpu/drm/amd/display/include/dal_asic_id.h6
-rw-r--r--drivers/gpu/drm/amd/display/modules/freesync/freesync.c34
-rw-r--r--drivers/gpu/drm/amd/display/modules/hdcp/hdcp.c5
-rw-r--r--drivers/gpu/drm/amd/display/modules/hdcp/hdcp.h28
-rw-r--r--drivers/gpu/drm/amd/display/modules/hdcp/hdcp1_execution.c2
-rw-r--r--drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c2
-rw-r--r--drivers/gpu/drm/amd/display/modules/hdcp/hdcp_psp.c39
-rw-r--r--drivers/gpu/drm/amd/display/modules/inc/mod_hdcp.h1
-rw-r--r--drivers/gpu/drm/amd/powerplay/amd_powerplay.c3
-rw-r--r--drivers/gpu/drm/amd/powerplay/arcturus_ppt.c62
-rw-r--r--drivers/gpu/drm/amd/powerplay/navi10_ppt.c18
-rw-r--r--drivers/gpu/drm/amd/powerplay/renoir_ppt.c18
-rw-r--r--drivers/gpu/drm/amd/powerplay/renoir_ppt.h2
-rw-r--r--drivers/gpu/drm/amd/powerplay/smu_v11_0.c24
-rw-r--r--drivers/gpu/drm/amd/powerplay/vega20_ppt.c14
-rw-r--r--drivers/gpu/drm/bridge/analogix/analogix_dp_core.c33
-rw-r--r--drivers/gpu/drm/drm_mm.c8
-rw-r--r--drivers/gpu/drm/drm_prime.c37
-rw-r--r--drivers/gpu/drm/exynos/exynos_dp.c29
-rw-r--r--drivers/gpu/drm/i915/Makefile3
-rw-r--r--drivers/gpu/drm/i915/display/intel_ddi.c11
-rw-r--r--drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c14
-rw-r--r--drivers/gpu/drm/i915/gt/intel_ggtt.c37
-rw-r--r--drivers/gpu/drm/i915/gvt/cmd_parser.c16
-rw-r--r--drivers/gpu/drm/i915/gvt/display.c6
-rw-r--r--drivers/gpu/drm/i915/gvt/handlers.c8
-rw-r--r--drivers/gpu/drm/i915/gvt/scheduler.c4
-rw-r--r--drivers/gpu/drm/i915/i915_memcpy.c5
-rw-r--r--drivers/gpu/drm/msm/disp/mdp5/mdp5_kms.c2
-rw-r--r--drivers/gpu/drm/nouveau/dispnv04/dac.c3
-rw-r--r--drivers/gpu/drm/nouveau/dispnv04/hw.c1
-rw-r--r--drivers/gpu/drm/nouveau/dispnv50/base507c.c1
-rw-r--r--drivers/gpu/drm/nouveau/dispnv50/core507d.c1
-rw-r--r--drivers/gpu/drm/nouveau/dispnv50/corec37d.c2
-rw-r--r--drivers/gpu/drm/nouveau/dispnv50/curs507a.c21
-rw-r--r--drivers/gpu/drm/nouveau/dispnv50/cursc37a.c9
-rw-r--r--drivers/gpu/drm/nouveau/dispnv50/disp.c1
-rw-r--r--drivers/gpu/drm/nouveau/dispnv50/ovly827e.c2
-rw-r--r--drivers/gpu/drm/nouveau/dispnv50/wndw.h1
-rw-r--r--drivers/gpu/drm/nouveau/include/nvif/device.h21
-rw-r--r--drivers/gpu/drm/nouveau/include/nvif/timer.h35
-rw-r--r--drivers/gpu/drm/nouveau/include/nvif/user.h1
-rw-r--r--drivers/gpu/drm/nouveau/nouveau_bo.c9
-rw-r--r--drivers/gpu/drm/nouveau/nouveau_debugfs.c20
-rw-r--r--drivers/gpu/drm/nouveau/nouveau_drm.c63
-rw-r--r--drivers/gpu/drm/nouveau/nouveau_drv.h2
-rw-r--r--drivers/gpu/drm/nouveau/nouveau_svm.c9
-rw-r--r--drivers/gpu/drm/nouveau/nvif/Kbuild1
-rw-r--r--drivers/gpu/drm/nouveau/nvif/device.c14
-rw-r--r--drivers/gpu/drm/nouveau/nvif/timer.c56
-rw-r--r--drivers/gpu/drm/nouveau/nvif/userc361.c14
-rw-r--r--drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c26
-rw-r--r--drivers/gpu/drm/panel/panel-simple.c11
-rw-r--r--drivers/gpu/drm/rockchip/analogix_dp-rockchip.c36
-rw-r--r--drivers/gpu/drm/ttm/ttm_bo_vm.c63
-rw-r--r--drivers/gpu/drm/vboxvideo/vbox_drv.c4
-rw-r--r--drivers/gpu/drm/vc4/vc4_hdmi.c20
-rw-r--r--drivers/gpu/drm/virtio/virtgpu_object.c14
-rw-r--r--drivers/gpu/drm/xen/xen_drm_front.c2
-rw-r--r--drivers/hv/hv_balloon.c25
-rw-r--r--drivers/hwmon/dell-smm-hwmon.c4
-rw-r--r--drivers/ide/ide-scan-pci.c8
-rw-r--r--drivers/iio/Kconfig1
-rw-r--r--drivers/iio/Makefile1
-rw-r--r--drivers/iio/accel/cros_ec_accel_legacy.c8
-rw-r--r--drivers/iio/adc/Kconfig10
-rw-r--r--drivers/iio/adc/Makefile1
-rw-r--r--drivers/iio/adc/rn5t618-adc.c256
-rw-r--r--drivers/iio/common/cros_ec_sensors/cros_ec_lid_angle.c3
-rw-r--r--drivers/iio/common/cros_ec_sensors/cros_ec_sensors.c13
-rw-r--r--drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c375
-rw-r--r--drivers/iio/industrialio-core.c8
-rw-r--r--drivers/iio/light/Kconfig10
-rw-r--r--drivers/iio/light/Makefile1
-rw-r--r--drivers/iio/light/cros_ec_light_prox.c15
-rw-r--r--drivers/iio/light/iqs621-als.c617
-rw-r--r--drivers/iio/position/Kconfig19
-rw-r--r--drivers/iio/position/Makefile7
-rw-r--r--drivers/iio/position/iqs624-pos.c284
-rw-r--r--drivers/iio/pressure/cros_ec_baro.c14
-rw-r--r--drivers/iio/temperature/Kconfig10
-rw-r--r--drivers/iio/temperature/Makefile1
-rw-r--r--drivers/iio/temperature/iqs620at-temp.c97
-rw-r--r--drivers/input/keyboard/Kconfig10
-rw-r--r--drivers/input/keyboard/Makefile1
-rw-r--r--drivers/input/keyboard/iqs62x-keys.c335
-rw-r--r--drivers/input/serio/i8042-x86ia64io.h11
-rw-r--r--drivers/input/touchscreen/elants_i2c.c1
-rw-r--r--drivers/input/touchscreen/goodix.c608
-rw-r--r--drivers/input/touchscreen/of_touchscreen.c35
-rw-r--r--drivers/iommu/Kconfig21
-rw-r--r--drivers/iommu/amd_iommu_types.h2
-rw-r--r--drivers/iommu/arm-smmu-v3.c214
-rw-r--r--drivers/iommu/arm-smmu.c55
-rw-r--r--drivers/iommu/intel-iommu.c3
-rw-r--r--drivers/iommu/intel-svm.c9
-rw-r--r--drivers/iommu/iommu.c46
-rw-r--r--drivers/iommu/ipmmu-vmsa.c7
-rw-r--r--drivers/iommu/mtk_iommu.c13
-rw-r--r--drivers/iommu/mtk_iommu_v1.c14
-rw-r--r--drivers/iommu/omap-iommu.c10
-rw-r--r--drivers/iommu/omap-iopgtable.h3
-rw-r--r--drivers/iommu/qcom_iommu.c63
-rw-r--r--drivers/iommu/tegra-gart.c2
-rw-r--r--drivers/iommu/virtio-iommu.c42
-rw-r--r--drivers/leds/Kconfig11
-rw-r--r--drivers/leds/Makefile99
-rw-r--r--drivers/leds/led-class.c2
-rw-r--r--drivers/leds/leds-bd2802.c2
-rw-r--r--drivers/leds/leds-ip30.c86
-rw-r--r--drivers/leds/leds-is31fl32xx.c2
-rw-r--r--drivers/leds/leds-lm3532.c2
-rw-r--r--drivers/leds/leds-lm3697.c2
-rw-r--r--drivers/leds/leds-ns2.c99
-rw-r--r--drivers/leds/leds-pwm.c55
-rw-r--r--drivers/md/dm-linear.c18
-rw-r--r--drivers/md/dm-log-writes.c17
-rw-r--r--drivers/md/dm-stripe.c23
-rw-r--r--drivers/md/dm.c32
-rw-r--r--drivers/mfd/Kconfig23
-rw-r--r--drivers/mfd/Makefile1
-rw-r--r--drivers/mfd/aat2870-core.c2
-rw-r--r--drivers/mfd/cros_ec_dev.c2
-rw-r--r--drivers/mfd/da9062-core.c44
-rw-r--r--drivers/mfd/dln2.c30
-rw-r--r--drivers/mfd/intel-lpss-pci.c31
-rw-r--r--drivers/mfd/iqs62x.c1063
-rw-r--r--drivers/mfd/omap-usb-host.c2
-rw-r--r--drivers/mfd/omap-usb-tll.c4
-rw-r--r--drivers/mfd/qcom-pm8xxx.c2
-rw-r--r--drivers/mfd/rk808.c139
-rw-r--r--drivers/mfd/rn5t618.c109
-rw-r--r--drivers/mfd/sprd-sc27xx-spi.c52
-rw-r--r--drivers/misc/lkdtm/bugs.c75
-rw-r--r--drivers/misc/lkdtm/core.c3
-rw-r--r--drivers/misc/lkdtm/lkdtm.h3
-rw-r--r--drivers/misc/mic/Kconfig4
-rw-r--r--drivers/mtd/ubi/fastmap-wl.c15
-rw-r--r--drivers/mtd/ubi/ubi-media.h2
-rw-r--r--drivers/mtd/ubi/wl.c3
-rw-r--r--drivers/net/caif/Kconfig4
-rw-r--r--drivers/net/can/slcan.c4
-rw-r--r--drivers/net/dsa/bcm_sf2.c9
-rw-r--r--drivers/net/dsa/mt7530.c3
-rw-r--r--drivers/net/ethernet/aquantia/atlantic/macsec/macsec_api.c2
-rw-r--r--drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c3
-rw-r--r--drivers/net/ethernet/cavium/common/cavium_ptp.h2
-rw-r--r--drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c5
-rw-r--r--drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.c23
-rw-r--r--drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.h1
-rw-r--r--drivers/net/ethernet/faraday/ftgmac100.c2
-rw-r--r--drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c18
-rw-r--r--drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c2
-rw-r--r--drivers/net/ethernet/qlogic/qed/qed_l2.c2
-rw-r--r--drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c31
-rw-r--r--drivers/net/ethernet/realtek/r8169_main.c29
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c2
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c11
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/stmmac_main.c17
-rw-r--r--drivers/net/macsec.c3
-rw-r--r--drivers/net/phy/at803x.c4
-rw-r--r--drivers/net/phy/micrel.c7
-rw-r--r--drivers/net/tun.c10
-rw-r--r--drivers/net/usb/pegasus.c38
-rw-r--r--drivers/net/wimax/i2400m/driver.c7
-rw-r--r--drivers/nvdimm/bus.c6
-rw-r--r--drivers/nvdimm/dimm.c2
-rw-r--r--drivers/nvdimm/dimm_devs.c95
-rw-r--r--drivers/nvdimm/e820.c18
-rw-r--r--drivers/nvdimm/label.h2
-rw-r--r--drivers/nvdimm/namespace_devs.c28
-rw-r--r--drivers/nvdimm/nd.h7
-rw-r--r--drivers/nvdimm/of_pmem.c4
-rw-r--r--drivers/nvdimm/pfn.h12
-rw-r--r--drivers/nvdimm/pfn_devs.c40
-rw-r--r--drivers/nvdimm/pmem.c101
-rw-r--r--drivers/nvdimm/region_devs.c130
-rw-r--r--drivers/nvme/host/core.c34
-rw-r--r--drivers/nvme/host/fc.c14
-rw-r--r--drivers/nvme/host/multipath.c4
-rw-r--r--drivers/nvme/host/rdma.c2
-rw-r--r--drivers/nvme/host/tcp.c18
-rw-r--r--drivers/nvme/target/configfs.c10
-rw-r--r--drivers/nvme/target/fc.c2
-rw-r--r--drivers/nvme/target/fcloop.c77
-rw-r--r--drivers/nvme/target/rdma.c205
-rw-r--r--drivers/parisc/eisa.c8
-rw-r--r--drivers/pci/ats.c4
-rw-r--r--drivers/pcmcia/cs_internal.h2
-rw-r--r--drivers/pcmcia/omap_cf.c2
-rw-r--r--drivers/pcmcia/rsrc_nonstatic.c6
-rw-r--r--drivers/pcmcia/sa1100_simpad.c6
-rw-r--r--drivers/pcmcia/soc_common.h2
-rw-r--r--drivers/pcmcia/yenta_socket.c10
-rw-r--r--drivers/platform/chrome/Kconfig27
-rw-r--r--drivers/platform/chrome/Makefile5
-rw-r--r--drivers/platform/chrome/chromeos_laptop.c2
-rw-r--r--drivers/platform/chrome/cros_ec.c32
-rw-r--r--drivers/platform/chrome/cros_ec_chardev.c4
-rw-r--r--drivers/platform/chrome/cros_ec_lightbar.c50
-rw-r--r--drivers/platform/chrome/cros_ec_proto.c9
-rw-r--r--drivers/platform/chrome/cros_ec_rpmsg.c16
-rw-r--r--drivers/platform/chrome/cros_ec_sensorhub.c111
-rw-r--r--drivers/platform/chrome/cros_ec_sensorhub_ring.c1046
-rw-r--r--drivers/platform/chrome/cros_ec_spi.c6
-rw-r--r--drivers/platform/chrome/cros_ec_sysfs.c36
-rw-r--r--drivers/platform/chrome/cros_ec_typec.c357
-rw-r--r--drivers/platform/chrome/cros_ec_vbc.c4
-rw-r--r--drivers/platform/chrome/cros_usbpd_notify.c306
-rw-r--r--drivers/platform/chrome/wilco_ec/event.c4
-rw-r--r--drivers/platform/chrome/wilco_ec/properties.c3
-rw-r--r--drivers/platform/chrome/wilco_ec/sysfs.c4
-rw-r--r--drivers/platform/x86/dell-laptop.c4
-rw-r--r--drivers/platform/x86/dell-rbtn.c4
-rw-r--r--drivers/platform/x86/dell-rbtn.h2
-rw-r--r--drivers/platform/x86/dell-smbios-base.c4
-rw-r--r--drivers/platform/x86/dell-smbios-smm.c2
-rw-r--r--drivers/platform/x86/dell-smbios.h2
-rw-r--r--drivers/platform/x86/dell-smo8800.c2
-rw-r--r--drivers/platform/x86/dell-wmi.c4
-rw-r--r--drivers/power/supply/Kconfig2
-rw-r--r--drivers/power/supply/bq2415x_charger.c4
-rw-r--r--drivers/power/supply/bq27xxx_battery.c2
-rw-r--r--drivers/power/supply/cros_usbpd-charger.c50
-rw-r--r--drivers/power/supply/isp1704_charger.c2
-rw-r--r--drivers/power/supply/rx51_battery.c4
-rw-r--r--drivers/ps3/sys-manager-core.c2
-rw-r--r--drivers/pwm/Kconfig58
-rw-r--r--drivers/pwm/core.c135
-rw-r--r--drivers/pwm/pwm-bcm2835.c1
-rw-r--r--drivers/pwm/pwm-imx-tpm.c2
-rw-r--r--drivers/pwm/pwm-imx27.c32
-rw-r--r--drivers/pwm/pwm-jz4740.c162
-rw-r--r--drivers/pwm/pwm-meson.c4
-rw-r--r--drivers/pwm/pwm-mxs.c1
-rw-r--r--drivers/pwm/pwm-omap-dmtimer.c219
-rw-r--r--drivers/pwm/pwm-pca9685.c97
-rw-r--r--drivers/pwm/pwm-rcar.c10
-rw-r--r--drivers/pwm/pwm-renesas-tpu.c11
-rw-r--r--drivers/pwm/pwm-sun4i.c13
-rw-r--r--drivers/pwm/pwm-tegra.c6
-rw-r--r--drivers/rtc/Kconfig10
-rw-r--r--drivers/rtc/Makefile1
-rw-r--r--drivers/rtc/rtc-rc5t619.c444
-rw-r--r--drivers/s390/block/dcssblk.c20
-rw-r--r--drivers/s390/cio/device.c13
-rw-r--r--drivers/s390/cio/qdio.h1
-rw-r--r--drivers/s390/cio/qdio_debug.c16
-rw-r--r--drivers/s390/cio/qdio_debug.h3
-rw-r--r--drivers/s390/cio/qdio_main.c63
-rw-r--r--drivers/s390/cio/qdio_setup.c10
-rw-r--r--drivers/s390/cio/qdio_thinint.c28
-rw-r--r--drivers/s390/cio/vfio_ccw_drv.c5
-rw-r--r--drivers/s390/net/qeth_core.h5
-rw-r--r--drivers/s390/net/qeth_core_main.c65
-rw-r--r--drivers/s390/scsi/zfcp_erp.c10
-rw-r--r--drivers/s390/scsi/zfcp_fsf.c23
-rw-r--r--drivers/s390/scsi/zfcp_qdio.c51
-rw-r--r--drivers/scsi/aacraid/commsup.c7
-rw-r--r--drivers/scsi/aic7xxx/aic7xxx_core.c23
-rw-r--r--drivers/scsi/bnx2fc/bnx2fc.h13
-rw-r--r--drivers/scsi/bnx2fc/bnx2fc_fcoe.c8
-rw-r--r--drivers/scsi/bnx2fc/bnx2fc_hwi.c103
-rw-r--r--drivers/scsi/bnx2fc/bnx2fc_io.c34
-rw-r--r--drivers/scsi/constants.c2
-rw-r--r--drivers/scsi/libfc/fc_rport.c10
-rw-r--r--drivers/scsi/lpfc/lpfc.h25
-rw-r--r--drivers/scsi/lpfc/lpfc_attr.c73
-rw-r--r--drivers/scsi/lpfc/lpfc_crtn.h3
-rw-r--r--drivers/scsi/lpfc/lpfc_debugfs.c333
-rw-r--r--drivers/scsi/lpfc/lpfc_debugfs.h3
-rw-r--r--drivers/scsi/lpfc/lpfc_hw.h20
-rw-r--r--drivers/scsi/lpfc/lpfc_init.c106
-rw-r--r--drivers/scsi/lpfc/lpfc_mbox.c2
-rw-r--r--drivers/scsi/lpfc/lpfc_nvme.c149
-rw-r--r--drivers/scsi/lpfc/lpfc_nvmet.c62
-rw-r--r--drivers/scsi/lpfc/lpfc_scsi.c90
-rw-r--r--drivers/scsi/lpfc/lpfc_sli.c47
-rw-r--r--drivers/scsi/lpfc/lpfc_sli.h2
-rw-r--r--drivers/scsi/lpfc/lpfc_sli4.h19
-rw-r--r--drivers/scsi/lpfc/lpfc_version.h2
-rw-r--r--drivers/scsi/mpt3sas/mpt3sas_scsih.c8
-rw-r--r--drivers/scsi/qla2xxx/qla_nvme.c1
-rw-r--r--drivers/scsi/scsi_transport_iscsi.c4
-rw-r--r--drivers/scsi/sr.c4
-rw-r--r--drivers/scsi/ufs/ufs-mediatek.c13
-rw-r--r--drivers/scsi/ufs/ufshcd.c87
-rw-r--r--drivers/scsi/ufs/ufshcd.h15
-rw-r--r--drivers/soc/Kconfig1
-rw-r--r--drivers/soc/Makefile1
-rw-r--r--drivers/soc/kendryte/Kconfig14
-rw-r--r--drivers/soc/kendryte/Makefile3
-rw-r--r--drivers/soc/kendryte/k210-sysctl.c248
-rw-r--r--drivers/staging/gasket/gasket_core.c2
-rw-r--r--drivers/target/target_core_xcopy.c187
-rw-r--r--drivers/target/target_core_xcopy.h9
-rw-r--r--drivers/thermal/Kconfig42
-rw-r--r--drivers/thermal/Makefile3
-rw-r--r--drivers/thermal/cpufreq_cooling.c5
-rw-r--r--drivers/thermal/imx8mm_thermal.c236
-rw-r--r--drivers/thermal/imx_sc_thermal.c148
-rw-r--r--drivers/thermal/imx_thermal.c16
-rw-r--r--drivers/thermal/intel/int340x_thermal/int3400_thermal.c2
-rw-r--r--drivers/thermal/intel/int340x_thermal/processor_thermal_device.c5
-rw-r--r--drivers/thermal/of-thermal.c62
-rw-r--r--drivers/thermal/qcom/tsens-8960.c4
-rw-r--r--drivers/thermal/qcom/tsens-common.c194
-rw-r--r--drivers/thermal/qcom/tsens-v0_1.c6
-rw-r--r--drivers/thermal/qcom/tsens-v1.c6
-rw-r--r--drivers/thermal/qcom/tsens-v2.c24
-rw-r--r--drivers/thermal/qcom/tsens.c65
-rw-r--r--drivers/thermal/qcom/tsens.h105
-rw-r--r--drivers/thermal/qoriq_thermal.c40
-rw-r--r--drivers/thermal/rcar_gen3_thermal.c31
-rw-r--r--drivers/thermal/rcar_thermal.c53
-rw-r--r--drivers/thermal/samsung/exynos_tmu.c4
-rw-r--r--drivers/thermal/sprd_thermal.c552
-rw-r--r--drivers/thermal/st/stm_thermal.c3
-rw-r--r--drivers/thermal/ti-soc-thermal/ti-bandgap.c44
-rw-r--r--drivers/thermal/ti-soc-thermal/ti-bandgap.h4
-rw-r--r--drivers/vdpa/Kconfig37
-rw-r--r--drivers/vdpa/Makefile4
-rw-r--r--drivers/vdpa/ifcvf/Makefile3
-rw-r--r--drivers/vdpa/ifcvf/ifcvf_base.c389
-rw-r--r--drivers/vdpa/ifcvf/ifcvf_base.h118
-rw-r--r--drivers/vdpa/ifcvf/ifcvf_main.c435
-rw-r--r--drivers/vdpa/vdpa.c180
-rw-r--r--drivers/vdpa/vdpa_sim/Makefile2
-rw-r--r--drivers/vdpa/vdpa_sim/vdpa_sim.c629
-rw-r--r--drivers/vhost/Kconfig45
-rw-r--r--drivers/vhost/Kconfig.vringh6
-rw-r--r--drivers/vhost/Makefile6
-rw-r--r--drivers/vhost/iotlb.c177
-rw-r--r--drivers/vhost/net.c5
-rw-r--r--drivers/vhost/scsi.c2
-rw-r--r--drivers/vhost/vdpa.c883
-rw-r--r--drivers/vhost/vhost.c233
-rw-r--r--drivers/vhost/vhost.h45
-rw-r--r--drivers/vhost/vringh.c421
-rw-r--r--drivers/vhost/vsock.c2
-rw-r--r--drivers/video/backlight/corgi_lcd.c68
-rw-r--r--drivers/video/backlight/pwm_bl.c19
-rw-r--r--drivers/video/fbdev/core/fbcon.c3
-rw-r--r--drivers/virtio/Kconfig14
-rw-r--r--drivers/virtio/Makefile1
-rw-r--r--drivers/virtio/virtio_balloon.c180
-rw-r--r--drivers/virtio/virtio_vdpa.c396
-rw-r--r--drivers/watchdog/Kconfig8
-rw-r--r--drivers/watchdog/Makefile1
-rw-r--r--drivers/watchdog/imx2_wdt.c37
-rw-r--r--drivers/watchdog/imx7ulp_wdt.c1
-rw-r--r--drivers/watchdog/imx_sc_wdt.c2
-rw-r--r--drivers/watchdog/npcm_wdt.c19
-rw-r--r--drivers/watchdog/orion_wdt.c2
-rw-r--r--drivers/watchdog/pm8916_wdt.c25
-rw-r--r--drivers/watchdog/qcom-wdt.c34
-rw-r--r--drivers/watchdog/rti_wdt.c255
-rw-r--r--drivers/watchdog/watchdog_core.c12
-rw-r--r--drivers/watchdog/watchdog_dev.c1
-rw-r--r--drivers/watchdog/wm831x_wdt.c27
-rw-r--r--drivers/watchdog/ziirave_wdt.c2
-rw-r--r--drivers/xen/events/events_2l.c16
-rw-r--r--drivers/xen/events/events_base.c93
-rw-r--r--drivers/xen/events/events_fifo.c22
-rw-r--r--drivers/xen/events/events_internal.h30
-rw-r--r--drivers/xen/evtchn.c13
-rw-r--r--drivers/xen/gntdev-common.h3
-rw-r--r--drivers/xen/gntdev.c2
-rw-r--r--drivers/xen/pvcalls-back.c5
-rw-r--r--drivers/xen/pvcalls-front.c15
-rw-r--r--drivers/xen/xen-pciback/xenbus.c7
-rw-r--r--drivers/xen/xen-scsiback.c3
-rw-r--r--drivers/xen/xenbus/xenbus_client.c6
-rw-r--r--fs/binfmt_elf.c48
-rw-r--r--fs/ceph/addr.c90
-rw-r--r--fs/ceph/cache.c2
-rw-r--r--fs/ceph/caps.c536
-rw-r--r--fs/ceph/debugfs.c16
-rw-r--r--fs/ceph/dir.c132
-rw-r--r--fs/ceph/export.c5
-rw-r--r--fs/ceph/file.c486
-rw-r--r--fs/ceph/inode.c84
-rw-r--r--fs/ceph/ioctl.c2
-rw-r--r--fs/ceph/locks.c31
-rw-r--r--fs/ceph/mds_client.c240
-rw-r--r--fs/ceph/mds_client.h30
-rw-r--r--fs/ceph/super.c28
-rw-r--r--fs/ceph/super.h70
-rw-r--r--fs/cifs/Kconfig2
-rw-r--r--fs/cifs/cifs_debug.c6
-rw-r--r--fs/cifs/cifsfs.c4
-rw-r--r--fs/cifs/cifsglob.h4
-rw-r--r--fs/cifs/file.c61
-rw-r--r--fs/cifs/inode.c4
-rw-r--r--fs/cifs/readdir.c2
-rw-r--r--fs/cifs/smb2misc.c14
-rw-r--r--fs/cifs/smb2proto.h6
-rw-r--r--fs/cifs/smb2transport.c87
-rw-r--r--fs/cifs/smbdirect.c313
-rw-r--r--fs/cifs/smbdirect.h7
-rw-r--r--fs/dax.c59
-rw-r--r--fs/eventpoll.c64
-rw-r--r--fs/f2fs/Kconfig9
-rw-r--r--fs/f2fs/checkpoint.c42
-rw-r--r--fs/f2fs/compress.c317
-rw-r--r--fs/f2fs/data.c141
-rw-r--r--fs/f2fs/debug.c3
-rw-r--r--fs/f2fs/dir.c16
-rw-r--r--fs/f2fs/f2fs.h206
-rw-r--r--fs/f2fs/file.c91
-rw-r--r--fs/f2fs/gc.c51
-rw-r--r--fs/f2fs/inode.c29
-rw-r--r--fs/f2fs/namei.c12
-rw-r--r--fs/f2fs/node.c33
-rw-r--r--fs/f2fs/recovery.c12
-rw-r--r--fs/f2fs/segment.c54
-rw-r--r--fs/f2fs/segment.h2
-rw-r--r--fs/f2fs/shrinker.c2
-rw-r--r--fs/f2fs/super.c89
-rw-r--r--fs/f2fs/sysfs.c50
-rw-r--r--fs/f2fs/xattr.c67
-rw-r--r--fs/f2fs/xattr.h9
-rw-r--r--fs/filesystems.c4
-rw-r--r--fs/hfsplus/attributes.c4
-rw-r--r--fs/hostfs/hostfs_kern.c12
-rw-r--r--fs/io-wq.c12
-rw-r--r--fs/io-wq.h2
-rw-r--r--fs/io_uring.c428
-rw-r--r--fs/iomap/buffered-io.c17
-rw-r--r--fs/nfs/blocklayout/blocklayout.c2
-rw-r--r--fs/nfs/callback.h4
-rw-r--r--fs/nfs/callback_proc.c69
-rw-r--r--fs/nfs/delegation.c319
-rw-r--r--fs/nfs/dir.c79
-rw-r--r--fs/nfs/direct.c197
-rw-r--r--fs/nfs/filelayout/filelayout.c165
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c229
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.h2
-rw-r--r--fs/nfs/fs_context.c9
-rw-r--r--fs/nfs/inode.c28
-rw-r--r--fs/nfs/internal.h36
-rw-r--r--fs/nfs/namespace.c67
-rw-r--r--fs/nfs/nfs4_fs.h4
-rw-r--r--fs/nfs/nfs4file.c3
-rw-r--r--fs/nfs/nfs4namespace.c2
-rw-r--r--fs/nfs/nfs4proc.c19
-rw-r--r--fs/nfs/nfs4state.c24
-rw-r--r--fs/nfs/nfs4trace.h8
-rw-r--r--fs/nfs/nfsroot.c2
-rw-r--r--fs/nfs/nfstrace.h1
-rw-r--r--fs/nfs/pagelist.c367
-rw-r--r--fs/nfs/pnfs.c241
-rw-r--r--fs/nfs/pnfs.h143
-rw-r--r--fs/nfs/pnfs_nfs.c515
-rw-r--r--fs/nfs/read.c2
-rw-r--r--fs/nfs/super.c35
-rw-r--r--fs/nfs/unlink.c4
-rw-r--r--fs/nfs/write.c276
-rw-r--r--fs/ocfs2/alloc.c4
-rw-r--r--fs/orangefs/file.c34
-rw-r--r--fs/orangefs/inode.c39
-rw-r--r--fs/orangefs/orangefs-kernel.h4
-rw-r--r--fs/overlayfs/copy_up.c16
-rw-r--r--fs/overlayfs/dir.c31
-rw-r--r--fs/overlayfs/export.c40
-rw-r--r--fs/overlayfs/inode.c99
-rw-r--r--fs/overlayfs/namei.c5
-rw-r--r--fs/overlayfs/overlayfs.h25
-rw-r--r--fs/overlayfs/ovl_entry.h2
-rw-r--r--fs/overlayfs/readdir.c25
-rw-r--r--fs/overlayfs/super.c258
-rw-r--r--fs/overlayfs/util.c40
-rw-r--r--fs/proc/array.c39
-rw-r--r--fs/proc/base.c10
-rw-r--r--fs/proc/cpuinfo.c1
-rw-r--r--fs/proc/generic.c31
-rw-r--r--fs/proc/inode.c188
-rw-r--r--fs/proc/internal.h6
-rw-r--r--fs/proc/kmsg.c1
-rw-r--r--fs/proc/stat.c1
-rw-r--r--fs/proc/task_mmu.c95
-rw-r--r--fs/read_write.c3
-rw-r--r--fs/reiserfs/do_balan.c2
-rw-r--r--fs/reiserfs/ioctl.c11
-rw-r--r--fs/reiserfs/namei.c10
-rw-r--r--fs/seq_file.c35
-rw-r--r--fs/ubifs/io.c16
-rw-r--r--fs/ubifs/journal.c1
-rw-r--r--fs/ubifs/orphan.c13
-rw-r--r--fs/udf/ecma_167.h2
-rw-r--r--fs/udf/osta_udf.h2
-rw-r--r--fs/userfaultfd.c106
-rw-r--r--fs/xfs/libxfs/xfs_sb.c32
-rw-r--r--fs/xfs/xfs_buf.c11
-rw-r--r--fs/xfs/xfs_dquot.c6
-rw-r--r--fs/xfs/xfs_dquot_item.c3
-rw-r--r--fs/xfs/xfs_export.c14
-rw-r--r--fs/xfs/xfs_file.c16
-rw-r--r--fs/xfs/xfs_inode.c174
-rw-r--r--fs/xfs/xfs_inode.h1
-rw-r--r--fs/xfs/xfs_inode_item.c31
-rw-r--r--fs/xfs/xfs_log.c372
-rw-r--r--fs/xfs/xfs_log.h4
-rw-r--r--fs/xfs/xfs_log_cil.c55
-rw-r--r--fs/xfs/xfs_log_priv.h75
-rw-r--r--fs/xfs/xfs_mount.h1
-rw-r--r--fs/xfs/xfs_qm.c14
-rw-r--r--fs/xfs/xfs_super.c17
-rw-r--r--fs/xfs/xfs_symlink.c1
-rw-r--r--fs/xfs/xfs_trace.h15
-rw-r--r--fs/xfs/xfs_trans.c27
-rw-r--r--fs/xfs/xfs_trans_ail.c88
-rw-r--r--fs/xfs/xfs_trans_priv.h6
-rw-r--r--include/asm-generic/pgtable.h1
-rw-r--r--include/asm-generic/pgtable_uffd.h66
-rw-r--r--include/asm-generic/tlb.h3
-rw-r--r--include/clocksource/timer-ti-dm.h3
-rw-r--r--include/crypto/curve25519.h6
-rw-r--r--include/drm/bridge/analogix_dp.h5
-rw-r--r--include/drm/drm_legacy.h2
-rw-r--r--include/dt-bindings/clock/k210-clk.h20
-rw-r--r--include/dt-bindings/leds/common.h36
-rw-r--r--include/linux/acpi.h23
-rw-r--r--include/linux/bitops.h4
-rw-r--r--include/linux/bits.h22
-rw-r--r--include/linux/blk-cgroup.h43
-rw-r--r--include/linux/ceph/ceph_fs.h18
-rw-r--r--include/linux/ceph/debugfs.h14
-rw-r--r--include/linux/ceph/libceph.h1
-rw-r--r--include/linux/ceph/osd_client.h17
-rw-r--r--include/linux/cma.h14
-rw-r--r--include/linux/compiler.h2
-rw-r--r--include/linux/compiler_types.h11
-rw-r--r--include/linux/dax.h21
-rw-r--r--include/linux/devfreq_cooling.h2
-rw-r--r--include/linux/device-mapper.h3
-rw-r--r--include/linux/device.h9
-rw-r--r--include/linux/f2fs_fs.h1
-rw-r--r--include/linux/gfp.h2
-rw-r--r--include/linux/huge_mm.h2
-rw-r--r--include/linux/hugetlb.h12
-rw-r--r--include/linux/iio/common/cros_ec_sensors_core.h104
-rw-r--r--include/linux/iio/iio.h2
-rw-r--r--include/linux/io.h2
-rw-r--r--include/linux/iommu.h35
-rw-r--r--include/linux/leds.h1
-rw-r--r--include/linux/leds_pwm.h22
-rw-r--r--include/linux/libnvdimm.h2
-rw-r--r--include/linux/memblock.h3
-rw-r--r--include/linux/memory.h1
-rw-r--r--include/linux/memory_hotplug.h34
-rw-r--r--include/linux/memremap.h10
-rw-r--r--include/linux/mfd/iqs62x.h139
-rw-r--r--include/linux/mfd/rk808.h2
-rw-r--r--include/linux/mfd/rn5t618.h26
-rw-r--r--include/linux/mfd/sc27xx-pmic.h7
-rw-r--r--include/linux/mfd/wm831x/pdata.h1
-rw-r--r--include/linux/mm.h57
-rw-r--r--include/linux/mm_inline.h15
-rw-r--r--include/linux/mm_types.h4
-rw-r--r--include/linux/mmzone.h48
-rw-r--r--include/linux/nfs_fs.h1
-rw-r--r--include/linux/nfs_page.h5
-rw-r--r--include/linux/nfs_xdr.h32
-rw-r--r--include/linux/numa.h30
-rw-r--r--include/linux/nvme-fc-driver.h4
-rw-r--r--include/linux/page-flags.h16
-rw-r--r--include/linux/page_reporting.h26
-rw-r--r--include/linux/pagemap.h4
-rw-r--r--include/linux/percpu_counter.h4
-rw-r--r--include/linux/pid.h1
-rw-r--r--include/linux/platform_data/cros_ec_proto.h4
-rw-r--r--include/linux/platform_data/cros_ec_sensorhub.h163
-rw-r--r--include/linux/platform_data/cros_usbpd_notify.h17
-rw-r--r--include/linux/platform_data/leds-kirkwood-ns2.h38
-rw-r--r--include/linux/platform_data/pwm_omap_dmtimer.h90
-rw-r--r--include/linux/platform_data/wilco-ec.h8
-rw-r--r--include/linux/power/bq2415x_charger.h2
-rw-r--r--include/linux/printk.h5
-rw-r--r--include/linux/proc_fs.h17
-rw-r--r--include/linux/pwm.h4
-rw-r--r--include/linux/pwm_backlight.h2
-rw-r--r--include/linux/refcount.h23
-rw-r--r--include/linux/seq_file.h1
-rw-r--r--include/linux/shmem_fs.h10
-rw-r--r--include/linux/skbuff.h38
-rw-r--r--include/linux/slab.h2
-rw-r--r--include/linux/spi/corgi_lcd.h3
-rw-r--r--include/linux/stackdepot.h2
-rw-r--r--include/linux/sunrpc/sched.h1
-rw-r--r--include/linux/sunrpc/xdr.h1
-rw-r--r--include/linux/swapops.h5
-rw-r--r--include/linux/thermal.h10
-rw-r--r--include/linux/userfaultfd_k.h42
-rw-r--r--include/linux/vdpa.h253
-rw-r--r--include/linux/vhost_iotlb.h47
-rw-r--r--include/linux/vm_event_item.h5
-rw-r--r--include/linux/vringh.h36
-rw-r--r--include/sound/soc-dai.h2
-rw-r--r--include/trace/events/f2fs.h3
-rw-r--r--include/trace/events/huge_memory.h1
-rw-r--r--include/trace/events/mmflags.h1
-rw-r--r--include/trace/events/rpcrdma.h153
-rw-r--r--include/trace/events/vmscan.h2
-rw-r--r--include/uapi/linux/input-event-codes.h2
-rw-r--r--include/uapi/linux/um_timetravel.h128
-rw-r--r--include/uapi/linux/userfaultfd.h40
-rw-r--r--include/uapi/linux/vhost.h24
-rw-r--r--include/uapi/linux/vhost_types.h8
-rw-r--r--include/uapi/linux/virtio_balloon.h1
-rw-r--r--include/uapi/linux/virtio_iommu.h12
-rw-r--r--include/uapi/linux/virtio_net.h102
-rw-r--r--include/xen/events.h22
-rw-r--r--include/xen/interface/event_channel.h2
-rw-r--r--include/xen/xenbus.h5
-rw-r--r--init/Kconfig12
-rw-r--r--init/Makefile2
-rw-r--r--init/main.c1
-rw-r--r--ipc/mqueue.c5
-rw-r--r--ipc/shm.c2
-rw-r--r--ipc/util.c3
-rw-r--r--kernel/configs/tiny.config1
-rw-r--r--kernel/dma/debug.c9
-rw-r--r--kernel/dma/direct.c3
-rw-r--r--kernel/events/core.c85
-rw-r--r--kernel/extable.c3
-rw-r--r--kernel/fork.c10
-rw-r--r--kernel/gcov/fs.c4
-rw-r--r--kernel/gcov/gcc_3_4.c6
-rw-r--r--kernel/gcov/gcc_4_7.c2
-rw-r--r--kernel/kallsyms.c2
-rw-r--r--kernel/kmod.c6
-rw-r--r--kernel/locking/lockdep.c51
-rw-r--r--kernel/locking/percpu-rwsem.c3
-rw-r--r--kernel/module.c5
-rw-r--r--kernel/pid.c1
-rw-r--r--kernel/power/user.c101
-rw-r--r--kernel/printk/internal.h5
-rw-r--r--kernel/printk/printk.c34
-rw-r--r--kernel/printk/printk_safe.c11
-rw-r--r--kernel/sched/core.c10
-rw-r--r--kernel/sched/debug.c44
-rw-r--r--kernel/sched/fair.c48
-rw-r--r--kernel/sched/sched.h7
-rw-r--r--kernel/workqueue.c6
-rw-r--r--lib/Kconfig3
-rw-r--r--lib/Kconfig.debug26
-rw-r--r--lib/Kconfig.ubsan49
-rw-r--r--lib/Makefile8
-rw-r--r--lib/bch.c2
-rw-r--r--lib/dynamic_debug.c2
-rw-r--r--lib/raid6/algos.c8
-rw-r--r--lib/raid6/avx2.c4
-rw-r--r--lib/raid6/recov_avx2.c6
-rw-r--r--lib/raid6/recov_ssse3.c6
-rw-r--r--lib/raid6/test/Makefile9
-rw-r--r--lib/rbtree.c4
-rw-r--r--lib/scatterlist.c2
-rw-r--r--lib/stackdepot.c39
-rw-r--r--lib/test_bitmap.c2
-rw-r--r--lib/test_kmod.c2
-rw-r--r--lib/test_lockup.c599
-rw-r--r--lib/test_stackinit.c28
-rw-r--r--lib/ts_bm.c2
-rw-r--r--lib/ts_fsm.c2
-rw-r--r--lib/ts_kmp.c2
-rw-r--r--lib/ubsan.c47
-rw-r--r--mm/Kconfig140
-rw-r--r--mm/Makefile1
-rw-r--r--mm/backing-dev.c6
-rw-r--r--mm/cma.c16
-rw-r--r--mm/compaction.c3
-rw-r--r--mm/dmapool.c4
-rw-r--r--mm/filemap.c14
-rw-r--r--mm/gup.c16
-rw-r--r--mm/huge_memory.c36
-rw-r--r--mm/hugetlb.c110
-rw-r--r--mm/hugetlb_cgroup.c6
-rw-r--r--mm/internal.h2
-rw-r--r--mm/kasan/common.c23
-rw-r--r--mm/kasan/report.c10
-rw-r--r--mm/khugepaged.c39
-rw-r--r--mm/ksm.c5
-rw-r--r--mm/list_lru.c2
-rw-r--r--mm/memblock.c2
-rw-r--r--mm/memcontrol.c8
-rw-r--r--mm/memory-failure.c2
-rw-r--r--mm/memory.c206
-rw-r--r--mm/memory_hotplug.c60
-rw-r--r--mm/mempolicy.c44
-rw-r--r--mm/memremap.c40
-rw-r--r--mm/migrate.c118
-rw-r--r--mm/mm_init.c2
-rw-r--r--mm/mmap.c14
-rw-r--r--mm/mprotect.c80
-rw-r--r--mm/page_alloc.c169
-rw-r--r--mm/page_ext.c5
-rw-r--r--mm/page_isolation.c6
-rw-r--r--mm/page_reporting.c364
-rw-r--r--mm/page_reporting.h54
-rw-r--r--mm/rmap.c23
-rw-r--r--mm/shmem.c166
-rw-r--r--mm/shuffle.c12
-rw-r--r--mm/shuffle.h6
-rw-r--r--mm/slab_common.c3
-rw-r--r--mm/slub.c3
-rw-r--r--mm/sparse.c136
-rw-r--r--mm/swap.c20
-rw-r--r--mm/swapfile.c1
-rw-r--r--mm/userfaultfd.c94
-rw-r--r--mm/vmalloc.c2
-rw-r--r--mm/vmscan.c12
-rw-r--r--mm/vmstat.c3
-rw-r--r--mm/zsmalloc.c10
-rw-r--r--mm/zswap.c24
-rw-r--r--net/ceph/debugfs.c20
-rw-r--r--net/ceph/mon_client.c8
-rw-r--r--net/ceph/osd_client.c82
-rw-r--r--net/core/neighbour.c10
-rw-r--r--net/core/sock.c2
-rw-r--r--net/dsa/slave.c2
-rw-r--r--net/ipv6/addrconf.c11
-rw-r--r--net/ipv6/ndisc.c4
-rw-r--r--net/ipv6/rpl.c6
-rw-r--r--net/ipv6/rpl_iptunnel.c2
-rw-r--r--net/mptcp/options.c2
-rw-r--r--net/mptcp/pm.c2
-rw-r--r--net/mptcp/pm_netlink.c2
-rw-r--r--net/mptcp/protocol.c109
-rw-r--r--net/mptcp/protocol.h2
-rw-r--r--net/mptcp/subflow.c3
-rw-r--r--net/mptcp/token.c9
-rw-r--r--net/netfilter/Makefile2
-rw-r--r--net/netfilter/nf_tables_api.c2
-rw-r--r--net/netfilter/nft_set_pipapo.c2
-rw-r--r--net/netfilter/nft_set_pipapo_avx2.h4
-rw-r--r--net/openvswitch/flow_table.c10
-rw-r--r--net/sched/cls_tcindex.c45
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c94
-rw-r--r--net/sunrpc/clnt.c8
-rw-r--r--net/sunrpc/sched.c22
-rw-r--r--net/sunrpc/xdr.c55
-rw-r--r--net/sunrpc/xprtrdma/backchannel.c8
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c154
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c32
-rw-r--r--net/sunrpc/xprtrdma/transport.c72
-rw-r--r--net/sunrpc/xprtrdma/verbs.c683
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h89
-rw-r--r--net/sunrpc/xprtsock.c2
-rw-r--r--samples/hw_breakpoint/data_breakpoint.c11
-rw-r--r--scripts/Kconfig.include3
-rw-r--r--scripts/Makefile4
-rw-r--r--scripts/Makefile.build19
-rw-r--r--scripts/Makefile.clean1
-rw-r--r--scripts/Makefile.extrawarn1
-rw-r--r--scripts/Makefile.host23
-rw-r--r--scripts/Makefile.ubsan16
-rwxr-xr-xscripts/checkpatch.pl155
-rwxr-xr-xscripts/dummy-tools/gcc91
-rwxr-xr-xscripts/dummy-tools/ld30
l---------scripts/dummy-tools/nm1
l---------scripts/dummy-tools/objcopy1
-rwxr-xr-xscripts/gcc-plugin.sh55
-rw-r--r--scripts/gcc-plugins/Kconfig12
-rw-r--r--scripts/gcc-plugins/Makefile21
-rw-r--r--scripts/kconfig/qconf.cc90
-rw-r--r--scripts/kconfig/qconf.h2
-rwxr-xr-xscripts/mkcompile_h6
-rw-r--r--sound/core/oss/pcm_plugin.c22
-rw-r--r--sound/pci/hda/hda_beep.c6
-rw-r--r--sound/pci/hda/hda_intel.c16
-rw-r--r--sound/pci/hda/patch_realtek.c127
-rw-r--r--sound/pci/ice1712/prodigy_hifi.c4
-rw-r--r--sound/soc/amd/raven/acp3x-i2s.c1
-rw-r--r--sound/soc/amd/raven/acp3x.h2
-rw-r--r--sound/soc/bcm/bcm63xx-pcm-whistler.c2
-rw-r--r--sound/soc/codecs/cs4270.c40
-rw-r--r--sound/soc/codecs/rt5645.c8
-rw-r--r--sound/soc/codecs/rt5682.c5
-rw-r--r--sound/soc/intel/atom/sst-atom-controls.c4
-rw-r--r--sound/soc/intel/atom/sst/sst_pvt.c2
-rw-r--r--sound/soc/intel/boards/bdw-rt5650.c1
-rw-r--r--sound/soc/intel/boards/bdw-rt5677.c1
-rw-r--r--sound/soc/intel/boards/broadwell.c1
-rw-r--r--sound/soc/intel/boards/bytcr_rt5640.c11
-rw-r--r--sound/soc/intel/boards/haswell.c1
-rw-r--r--sound/soc/qcom/qdsp6/q6asm-dai.c4
-rw-r--r--sound/soc/soc-dai.c8
-rw-r--r--sound/soc/soc-dapm.c8
-rw-r--r--sound/soc/soc-ops.c4
-rw-r--r--sound/soc/soc-pcm.c6
-rw-r--r--sound/soc/soc-topology.c2
-rw-r--r--sound/soc/sof/loader.c2
-rw-r--r--sound/soc/stm/stm32_sai_sub.c4
-rw-r--r--sound/usb/mixer_maps.c28
-rw-r--r--sound/usb/quirks-table.h42
-rw-r--r--sound/usb/quirks.c1
-rw-r--r--tools/laptop/freefall/freefall.c2
-rw-r--r--tools/lib/rbtree.c4
-rw-r--r--tools/objtool/Makefile6
-rw-r--r--tools/testing/nvdimm/Kbuild4
-rw-r--r--tools/testing/nvdimm/test/Kbuild4
-rw-r--r--tools/testing/nvdimm/test/nfit.c2
-rw-r--r--tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c67
-rwxr-xr-xtools/testing/selftests/kmod/kmod.sh43
-rwxr-xr-xtools/testing/selftests/powerpc/eeh/eeh-basic.sh5
-rw-r--r--tools/testing/selftests/powerpc/tm/Makefile1
-rw-r--r--tools/testing/selftests/vm/userfaultfd.c225
-rw-r--r--tools/thermal/tmon/tmon.c26
-rw-r--r--tools/virtio/Makefile27
-rw-r--r--usr/include/Makefile2
1251 files changed, 39672 insertions, 16036 deletions
diff --git a/.mailmap b/.mailmap
index 9198a93c2f5c..db3754a41018 100644
--- a/.mailmap
+++ b/.mailmap
@@ -210,6 +210,7 @@ Oleksij Rempel <linux@rempel-privat.de> <external.Oleksij.Rempel@de.bosch.com>
Oleksij Rempel <linux@rempel-privat.de> <fixed-term.Oleksij.Rempel@de.bosch.com>
Oleksij Rempel <linux@rempel-privat.de> <o.rempel@pengutronix.de>
Oleksij Rempel <linux@rempel-privat.de> <ore@pengutronix.de>
+Pali Rohár <pali@kernel.org> <pali.rohar@gmail.com>
Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Patrick Mochel <mochel@digitalimplant.org>
Paul Burton <paulburton@kernel.org> <paul.burton@imgtec.com>
@@ -248,6 +249,7 @@ Sakari Ailus <sakari.ailus@linux.intel.com> <sakari.ailus@iki.fi>
Sean Nyekjaer <sean@geanix.com> <sean.nyekjaer@prevas.dk>
Sebastian Reichel <sre@kernel.org> <sre@debian.org>
Sebastian Reichel <sre@kernel.org> <sebastian.reichel@collabora.co.uk>
+Sedat Dilek <sedat.dilek@gmail.com> <sedat.dilek@credativ.de>
Shiraz Hashim <shiraz.linux.kernel@gmail.com> <shiraz.hashim@st.com>
Shuah Khan <shuah@kernel.org> <shuahkhan@gmail.com>
Shuah Khan <shuah@kernel.org> <shuah.khan@hp.com>
diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs
index 1a6cd5397129..bd8a0d19abe6 100644
--- a/Documentation/ABI/testing/sysfs-fs-f2fs
+++ b/Documentation/ABI/testing/sysfs-fs-f2fs
@@ -318,3 +318,8 @@ Date: September 2019
Contact: "Hridya Valsaraju" <hridya@google.com>
Description: Average number of valid blocks.
Available when CONFIG_F2FS_STAT_FS=y.
+
+What: /sys/fs/f2fs/<disk>/mounted_time_sec
+Date: February 2020
+Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
+Description: Show the mounted time in secs of this partition.
diff --git a/Documentation/ABI/testing/sysfs-platform-dell-laptop b/Documentation/ABI/testing/sysfs-platform-dell-laptop
index 8c6a0b8e1131..9b917c7453de 100644
--- a/Documentation/ABI/testing/sysfs-platform-dell-laptop
+++ b/Documentation/ABI/testing/sysfs-platform-dell-laptop
@@ -2,7 +2,7 @@ What: /sys/class/leds/dell::kbd_backlight/als_enabled
Date: December 2014
KernelVersion: 3.19
Contact: Gabriele Mazzotta <gabriele.mzt@gmail.com>,
- Pali Rohár <pali.rohar@gmail.com>
+ Pali Rohár <pali@kernel.org>
Description:
This file allows to control the automatic keyboard
illumination mode on some systems that have an ambient
@@ -13,7 +13,7 @@ What: /sys/class/leds/dell::kbd_backlight/als_setting
Date: December 2014
KernelVersion: 3.19
Contact: Gabriele Mazzotta <gabriele.mzt@gmail.com>,
- Pali Rohár <pali.rohar@gmail.com>
+ Pali Rohár <pali@kernel.org>
Description:
This file allows to specifiy the on/off threshold value,
as reported by the ambient light sensor.
@@ -22,7 +22,7 @@ What: /sys/class/leds/dell::kbd_backlight/start_triggers
Date: December 2014
KernelVersion: 3.19
Contact: Gabriele Mazzotta <gabriele.mzt@gmail.com>,
- Pali Rohár <pali.rohar@gmail.com>
+ Pali Rohár <pali@kernel.org>
Description:
This file allows to control the input triggers that
turn on the keyboard backlight illumination that is
@@ -45,7 +45,7 @@ What: /sys/class/leds/dell::kbd_backlight/stop_timeout
Date: December 2014
KernelVersion: 3.19
Contact: Gabriele Mazzotta <gabriele.mzt@gmail.com>,
- Pali Rohár <pali.rohar@gmail.com>
+ Pali Rohár <pali@kernel.org>
Description:
This file allows to specify the interval after which the
keyboard illumination is disabled because of inactivity.
diff --git a/Documentation/admin-guide/binderfs.rst b/Documentation/admin-guide/binderfs.rst
index c009671f8434..8243af9b3510 100644
--- a/Documentation/admin-guide/binderfs.rst
+++ b/Documentation/admin-guide/binderfs.rst
@@ -33,6 +33,12 @@ max
a per-instance limit. If ``max=<count>`` is set then only ``<count>`` number
of binder devices can be allocated in this binderfs instance.
+stats
+ Using ``stats=global`` enables global binder statistics.
+ ``stats=global`` is only available for a binderfs instance mounted in the
+ initial user namespace. An attempt to use the option to mount a binderfs
+ instance in another user namespace will return a permission error.
+
Allocating binder Devices
-------------------------
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 2d31d811a52d..f2a93c8679e8 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -685,7 +685,7 @@
coredump_filter=
[KNL] Change the default value for
/proc/<pid>/coredump_filter.
- See also Documentation/filesystems/proc.txt.
+ See also Documentation/filesystems/proc.rst.
coresight_cpu_debug.enable
[ARM,ARM64]
@@ -962,7 +962,7 @@
edid/1680x1050.bin, or edid/1920x1080.bin is given
and no file with the same name exists. Details and
instructions how to build your own EDID data are
- available in Documentation/driver-api/edid.rst. An EDID
+ available in Documentation/admin-guide/edid.rst. An EDID
data set will only be used for a particular connector,
if its name and a colon are prepended to the EDID
name. Each connector may use a unique EDID data
@@ -992,10 +992,6 @@
Documentation/admin-guide/dynamic-debug-howto.rst
for details.
- nompx [X86] Disables Intel Memory Protection Extensions.
- See Documentation/x86/intel_mpx.rst for more
- information about the feature.
-
nopku [X86] Disable Memory Protection Keys CPU feature found
in some Intel CPUs.
@@ -1475,6 +1471,14 @@
hpet_mmap= [X86, HPET_MMAP] Allow userspace to mmap HPET
registers. Default set by CONFIG_HPET_MMAP_DEFAULT.
+ hugetlb_cma= [HW] The size of a cma area used for allocation
+ of gigantic hugepages.
+ Format: nn[KMGTPE]
+
+ Reserve a cma area of given size and allocate gigantic
+ hugepages using the cma allocator. If enabled, the
+ boot-time allocation of gigantic hugepages is skipped.
+
hugepages= [HW,X86-32,IA-64] HugeTLB pages to allocate at boot.
hugepagesz= [HW,IA-64,PPC,X86-64] The size of the HugeTLB pages.
On x86-64 and powerpc, this option can be specified
@@ -2573,13 +2577,22 @@
For details see: Documentation/admin-guide/hw-vuln/mds.rst
mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory
- Amount of memory to be used when the kernel is not able
- to see the whole system memory or for test.
+ Amount of memory to be used in cases as follows:
+
+ 1 for test;
+ 2 when the kernel is not able to see the whole system memory;
+ 3 memory that lies after 'mem=' boundary is excluded from
+ the hypervisor, then assigned to KVM guests.
+
[X86] Work as limiting max address. Use together
with memmap= to avoid physical address space collisions.
Without memmap= PCI devices could be placed at addresses
belonging to unused RAM.
+ Note that this only takes effects during boot time since
+ in above case 3, memory may need be hot added after boot
+ if system memory of hypervisor is not sufficient.
+
mem=nopentium [BUGS=X86-32] Disable usage of 4MB pages for kernel
memory.
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index bd5714547cee..2f31de8f7c74 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -310,6 +310,11 @@ thp_fault_fallback
is incremented if a page fault fails to allocate
a huge page and instead falls back to using small pages.
+thp_fault_fallback_charge
+ is incremented if a page fault fails to charge a huge page and
+ instead falls back to using small pages even though the
+ allocation was successful.
+
thp_collapse_alloc_failed
is incremented if khugepaged found a range
of pages that should be collapsed into one huge page but failed
@@ -319,6 +324,15 @@ thp_file_alloc
is incremented every time a file huge page is successfully
allocated.
+thp_file_fallback
+ is incremented if a file huge page is attempted to be allocated
+ but fails and instead falls back to using small pages.
+
+thp_file_fallback_charge
+ is incremented if a file huge page cannot be charged and instead
+ falls back to using small pages even though the allocation was
+ successful.
+
thp_file_mapped
is incremented every time a file huge page is mapped into
user address space.
diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst
index 5048cf661a8a..c30176e67900 100644
--- a/Documentation/admin-guide/mm/userfaultfd.rst
+++ b/Documentation/admin-guide/mm/userfaultfd.rst
@@ -108,6 +108,57 @@ UFFDIO_COPY. They're atomic as in guaranteeing that nothing can see an
half copied page since it'll keep userfaulting until the copy has
finished.
+Notes:
+
+- If you requested UFFDIO_REGISTER_MODE_MISSING when registering then
+ you must provide some kind of page in your thread after reading from
+ the uffd. You must provide either UFFDIO_COPY or UFFDIO_ZEROPAGE.
+ The normal behavior of the OS automatically providing a zero page on
+ an annonymous mmaping is not in place.
+
+- None of the page-delivering ioctls default to the range that you
+ registered with. You must fill in all fields for the appropriate
+ ioctl struct including the range.
+
+- You get the address of the access that triggered the missing page
+ event out of a struct uffd_msg that you read in the thread from the
+ uffd. You can supply as many pages as you want with UFFDIO_COPY or
+ UFFDIO_ZEROPAGE. Keep in mind that unless you used DONTWAKE then
+ the first of any of those IOCTLs wakes up the faulting thread.
+
+- Be sure to test for all errors including (pollfd[0].revents &
+ POLLERR). This can happen, e.g. when ranges supplied were
+ incorrect.
+
+Write Protect Notifications
+---------------------------
+
+This is equivalent to (but faster than) using mprotect and a SIGSEGV
+signal handler.
+
+Firstly you need to register a range with UFFDIO_REGISTER_MODE_WP.
+Instead of using mprotect(2) you use ioctl(uffd, UFFDIO_WRITEPROTECT,
+struct *uffdio_writeprotect) while mode = UFFDIO_WRITEPROTECT_MODE_WP
+in the struct passed in. The range does not default to and does not
+have to be identical to the range you registered with. You can write
+protect as many ranges as you like (inside the registered range).
+Then, in the thread reading from uffd the struct will have
+msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP set. Now you send
+ioctl(uffd, UFFDIO_WRITEPROTECT, struct *uffdio_writeprotect) again
+while pagefault.mode does not have UFFDIO_WRITEPROTECT_MODE_WP set.
+This wakes up the thread which will continue to run with writes. This
+allows you to do the bookkeeping about the write in the uffd reading
+thread before the ioctl.
+
+If you registered with both UFFDIO_REGISTER_MODE_MISSING and
+UFFDIO_REGISTER_MODE_WP then you need to think about the sequence in
+which you supply a page and undo write protect. Note that there is a
+difference between writes into a WP area and into a !WP area. The
+former will have UFFD_PAGEFAULT_FLAG_WP set, the latter
+UFFD_PAGEFAULT_FLAG_WRITE. The latter did not fail on protection but
+you still need to supply a page when UFFDIO_REGISTER_MODE_MISSING was
+used.
+
QEMU/KVM
========
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index 335696d3360d..39c95c0e13d3 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -446,6 +446,27 @@ Notes:
successful IPC object allocation. If an IPC object allocation syscall
fails, it is undefined if the value remains unmodified or is reset to -1.
+modprobe:
+=========
+
+The path to the usermode helper for autoloading kernel modules, by
+default "/sbin/modprobe". This binary is executed when the kernel
+requests a module. For example, if userspace passes an unknown
+filesystem type to mount(), then the kernel will automatically request
+the corresponding filesystem module by executing this usermode helper.
+This usermode helper should insert the needed module into the kernel.
+
+This sysctl only affects module autoloading. It has no effect on the
+ability to explicitly insert modules.
+
+If this sysctl is set to the empty string, then module autoloading is
+completely disabled. The kernel will not try to execute a usermode
+helper at all, nor will it call the kernel_module_request LSM hook.
+
+If CONFIG_STATIC_USERMODEHELPER=y is set in the kernel configuration,
+then the configured static usermode helper overrides this sysctl,
+except that the empty string is still accepted to completely disable
+module autoloading as described above.
nmi_watchdog
============
diff --git a/Documentation/admin-guide/sysrq.rst b/Documentation/admin-guide/sysrq.rst
index 72b2cfb066f4..a46209f4636c 100644
--- a/Documentation/admin-guide/sysrq.rst
+++ b/Documentation/admin-guide/sysrq.rst
@@ -48,9 +48,10 @@ always allowed (by a user with admin privileges).
How do I use the magic SysRq key?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-On x86 - You press the key combo :kbd:`ALT-SysRq-<command key>`.
+On x86
+ You press the key combo :kbd:`ALT-SysRq-<command key>`.
-.. note::
+ .. note::
Some
keyboards may not have a key labeled 'SysRq'. The 'SysRq' key is
also known as the 'Print Screen' key. Also some keyboards cannot
@@ -58,14 +59,15 @@ On x86 - You press the key combo :kbd:`ALT-SysRq-<command key>`.
have better luck with press :kbd:`Alt`, press :kbd:`SysRq`,
release :kbd:`SysRq`, press :kbd:`<command key>`, release everything.
-On SPARC - You press :kbd:`ALT-STOP-<command key>`, I believe.
+On SPARC
+ You press :kbd:`ALT-STOP-<command key>`, I believe.
On the serial console (PC style standard serial ports only)
You send a ``BREAK``, then within 5 seconds a command key. Sending
``BREAK`` twice is interpreted as a normal BREAK.
On PowerPC
- Press :kbd:`ALT - Print Screen` (or :kbd:`F13`) - :kbd:`<command key>`,
+ Press :kbd:`ALT - Print Screen` (or :kbd:`F13`) - :kbd:`<command key>`.
:kbd:`Print Screen` (or :kbd:`F13`) - :kbd:`<command key>` may suffice.
On other
@@ -73,7 +75,7 @@ On other
let me know so I can add them to this section.
On all
- write a character to /proc/sysrq-trigger. e.g.::
+ Write a character to /proc/sysrq-trigger. e.g.::
echo t > /proc/sysrq-trigger
@@ -282,7 +284,7 @@ Just ask them on the linux-kernel mailing list:
Credits
~~~~~~~
-Written by Mydraal <vulpyne@vulpyne.net>
-Updated by Adam Sulmicki <adam@cfar.umd.edu>
-Updated by Jeremy M. Dolan <jmd@turbogeek.org> 2001/01/28 10:15:59
-Added to by Crutcher Dunnavant <crutcher+kernel@datastacks.com>
+- Written by Mydraal <vulpyne@vulpyne.net>
+- Updated by Adam Sulmicki <adam@cfar.umd.edu>
+- Updated by Jeremy M. Dolan <jmd@turbogeek.org> 2001/01/28 10:15:59
+- Added to by Crutcher Dunnavant <crutcher+kernel@datastacks.com>
diff --git a/Documentation/devicetree/bindings/arm/freescale/fsl,scu.txt b/Documentation/devicetree/bindings/arm/freescale/fsl,scu.txt
index 8cac6fafaab0..623fedf12180 100644
--- a/Documentation/devicetree/bindings/arm/freescale/fsl,scu.txt
+++ b/Documentation/devicetree/bindings/arm/freescale/fsl,scu.txt
@@ -166,6 +166,17 @@ Required properties:
followed by "fsl,imx-sc-key";
- linux,keycodes: See Documentation/devicetree/bindings/input/input.yaml
+Thermal bindings based on SCU Message Protocol
+------------------------------------------------------------
+
+Required properties:
+- compatible: Should be :
+ "fsl,imx8qxp-sc-thermal"
+ followed by "fsl,imx-sc-thermal";
+
+- #thermal-sensor-cells: See Documentation/devicetree/bindings/thermal/thermal.txt
+ for a description.
+
Example (imx8qxp):
-------------
aliases {
@@ -238,6 +249,11 @@ firmware {
compatible = "fsl,imx8qxp-sc-wdt", "fsl,imx-sc-wdt";
timeout-sec = <60>;
};
+
+ tsens: thermal-sensor {
+ compatible = "fsl,imx8qxp-sc-thermal", "fsl,imx-sc-thermal";
+ #thermal-sensor-cells = <1>;
+ };
};
};
diff --git a/Documentation/devicetree/bindings/display/panel/panel-dpi.yaml b/Documentation/devicetree/bindings/display/panel/panel-dpi.yaml
index f63870384c00..0cd74c8dab42 100644
--- a/Documentation/devicetree/bindings/display/panel/panel-dpi.yaml
+++ b/Documentation/devicetree/bindings/display/panel/panel-dpi.yaml
@@ -21,15 +21,6 @@ properties:
- {}
- const: panel-dpi
- data-mapping:
- enum:
- - rgb24
- - rgb565
- - bgr666
- description: |
- Describes the media format, how the display panel is connected
- to the display interface.
-
backlight: true
enable-gpios: true
height-mm: true
@@ -52,7 +43,6 @@ examples:
compatible = "osddisplays,osd057T0559-34ts", "panel-dpi";
label = "osddisplay";
power-supply = <&vcc_supply>;
- data-mapping = "rgb565";
backlight = <&backlight>;
port {
diff --git a/Documentation/devicetree/bindings/display/ti/ti,am65x-dss.yaml b/Documentation/devicetree/bindings/display/ti/ti,am65x-dss.yaml
index cac61a998203..eb04c2330698 100644
--- a/Documentation/devicetree/bindings/display/ti/ti,am65x-dss.yaml
+++ b/Documentation/devicetree/bindings/display/ti/ti,am65x-dss.yaml
@@ -65,7 +65,7 @@ properties:
ports:
type: object
description:
- Ports as described in Documentation/devictree/bindings/graph.txt
+ Ports as described in Documentation/devicetree/bindings/graph.txt
properties:
"#address-cells":
const: 1
@@ -121,7 +121,7 @@ examples:
#include <dt-bindings/interrupt-controller/irq.h>
#include <dt-bindings/soc/ti,sci_pm_domain.h>
- dss: dss@04a00000 {
+ dss: dss@4a00000 {
compatible = "ti,am65x-dss";
reg = <0x0 0x04a00000 0x0 0x1000>, /* common */
<0x0 0x04a02000 0x0 0x1000>, /* vidl1 */
diff --git a/Documentation/devicetree/bindings/display/ti/ti,j721e-dss.yaml b/Documentation/devicetree/bindings/display/ti/ti,j721e-dss.yaml
index ade9b2f513f5..eb4b1a266210 100644
--- a/Documentation/devicetree/bindings/display/ti/ti,j721e-dss.yaml
+++ b/Documentation/devicetree/bindings/display/ti/ti,j721e-dss.yaml
@@ -98,7 +98,7 @@ properties:
ports:
type: object
description:
- Ports as described in Documentation/devictree/bindings/graph.txt
+ Ports as described in Documentation/devicetree/bindings/graph.txt
properties:
"#address-cells":
const: 1
@@ -154,7 +154,7 @@ examples:
#include <dt-bindings/interrupt-controller/irq.h>
#include <dt-bindings/soc/ti,sci_pm_domain.h>
- dss: dss@04a00000 {
+ dss: dss@4a00000 {
compatible = "ti,j721e-dss";
reg = <0x00 0x04a00000 0x00 0x10000>, /* common_m */
<0x00 0x04a10000 0x00 0x10000>, /* common_s0*/
diff --git a/Documentation/devicetree/bindings/display/ti/ti,k2g-dss.yaml b/Documentation/devicetree/bindings/display/ti/ti,k2g-dss.yaml
index 385bd060ccf9..8f87b82c6695 100644
--- a/Documentation/devicetree/bindings/display/ti/ti,k2g-dss.yaml
+++ b/Documentation/devicetree/bindings/display/ti/ti,k2g-dss.yaml
@@ -56,7 +56,7 @@ properties:
port:
type: object
description:
- Port as described in Documentation/devictree/bindings/graph.txt.
+ Port as described in Documentation/devicetree/bindings/graph.txt.
The DSS DPI output port node
max-memory-bandwidth:
@@ -81,7 +81,7 @@ examples:
#include <dt-bindings/interrupt-controller/arm-gic.h>
#include <dt-bindings/interrupt-controller/irq.h>
- dss: dss@02540000 {
+ dss: dss@2540000 {
compatible = "ti,k2g-dss";
reg = <0x02540000 0x400>,
<0x02550000 0x1000>,
diff --git a/Documentation/devicetree/bindings/fpga/fpga-region.txt b/Documentation/devicetree/bindings/fpga/fpga-region.txt
index 90c44694a30b..8ab19d1d3f9a 100644
--- a/Documentation/devicetree/bindings/fpga/fpga-region.txt
+++ b/Documentation/devicetree/bindings/fpga/fpga-region.txt
@@ -263,7 +263,7 @@ Overlay contains:
gpio@10040 {
compatible = "altr,pio-1.0";
reg = <0x10040 0x20>;
- altr,gpio-bank-width = <4>;
+ altr,ngpio = <4>;
#gpio-cells = <2>;
clocks = <2>;
gpio-controller;
@@ -468,8 +468,7 @@ programming is the FPGA based bridge of fpga_region1.
compatible = "altr,pio-1.0";
reg = <0x10040 0x20>;
clocks = <0x2>;
- altr,gpio-bank-width = <0x4>;
- resetvalue = <0x0>;
+ altr,ngpio = <0x4>;
#gpio-cells = <0x2>;
gpio-controller;
};
diff --git a/Documentation/devicetree/bindings/input/iqs62x-keys.yaml b/Documentation/devicetree/bindings/input/iqs62x-keys.yaml
new file mode 100644
index 000000000000..5625c222903a
--- /dev/null
+++ b/Documentation/devicetree/bindings/input/iqs62x-keys.yaml
@@ -0,0 +1,132 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/input/iqs62x-keys.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Azoteq IQS620A/621/622/624/625 Keys and Switches
+
+maintainers:
+ - Jeff LaBundy <jeff@labundy.com>
+
+description: |
+ The Azoteq IQS620A, IQS621, IQS622, IQS624 and IQS625 multi-function sensors
+ feature a variety of self-capacitive, mutual-inductive and Hall-effect sens-
+ ing capabilities that can facilitate a variety of contactless key and switch
+ applications.
+
+ These functions are collectively represented by a "keys" child node from the
+ parent MFD driver. See Documentation/devicetree/bindings/mfd/iqs62x.yaml for
+ further details and examples. Sensor hardware configuration (self-capacitive
+ vs. mutual-inductive, etc.) is selected based on the device's firmware.
+
+properties:
+ compatible:
+ enum:
+ - azoteq,iqs620a-keys
+ - azoteq,iqs621-keys
+ - azoteq,iqs622-keys
+ - azoteq,iqs624-keys
+ - azoteq,iqs625-keys
+
+ linux,keycodes:
+ allOf:
+ - $ref: /schemas/types.yaml#/definitions/uint32-array
+ - minItems: 1
+ maxItems: 16
+ description: |
+ Specifies the numeric keycodes associated with each available touch or
+ proximity event according to the following table. An 'x' indicates the
+ event is supported for a given device. Specify 0 for unused events.
+
+ -------------------------------------------------------------------------
+ | # | Event | IQS620A | IQS621 | IQS622 | IQS624 | IQS625 |
+ -------------------------------------------------------------------------
+ | 0 | CH0 Touch | x | x | x | x | x |
+ | | Antenna 1 Touch* | x | | | | |
+ -------------------------------------------------------------------------
+ | 1 | CH0 Proximity | x | x | x | x | x |
+ | | Antenna 1 Prox.* | x | | | | |
+ -------------------------------------------------------------------------
+ | 2 | CH1 Touch | x | x | x | x | x |
+ | | Ant. 1 Deep Touch* | x | | | | |
+ -------------------------------------------------------------------------
+ | 3 | CH1 Proximity | x | x | x | x | x |
+ -------------------------------------------------------------------------
+ | 4 | CH2 Touch | x | | | | |
+ -------------------------------------------------------------------------
+ | 5 | CH2 Proximity | x | | | | |
+ | | Antenna 2 Prox.* | x | | | | |
+ -------------------------------------------------------------------------
+ | 6 | Metal (+) Touch** | x | x | | | |
+ | | Ant. 2 Deep Touch* | x | | | | |
+ -------------------------------------------------------------------------
+ | 7 | Metal (+) Prox.** | x | x | | | |
+ | | Antenna 2 Touch* | x | | | | |
+ -------------------------------------------------------------------------
+ | 8 | Metal (-) Touch** | x | x | | | |
+ -------------------------------------------------------------------------
+ | 9 | Metal (-) Prox.** | x | x | | | |
+ -------------------------------------------------------------------------
+ | 10 | SAR Active*** | x | | x | | |
+ -------------------------------------------------------------------------
+ | 11 | SAR Quick Rel.*** | x | | x | | |
+ -------------------------------------------------------------------------
+ | 12 | SAR Movement*** | x | | x | | |
+ -------------------------------------------------------------------------
+ | 13 | SAR Filter Halt*** | x | | x | | |
+ -------------------------------------------------------------------------
+ | 14 | Wheel Up | | | | x | |
+ -------------------------------------------------------------------------
+ | 15 | Wheel Down | | | | x | |
+ -------------------------------------------------------------------------
+ * Two-channel SAR. Replaces CH0-2 plus metal touch and proximity events
+ if enabled via firmware.
+ ** "+" and "-" refer to the polarity of a channel's delta (LTA - counts),
+ where "LTA" is defined as the channel's long-term average.
+ *** One-channel SAR. Replaces CH0-2 touch and proximity events if enabled
+ via firmware.
+
+patternProperties:
+ "^hall-switch-(north|south)$":
+ type: object
+ description:
+ Represents north/south-field Hall-effect sensor touch or proximity
+ events. Note that north/south-field orientation is reversed on the
+ IQS620AXzCSR device due to its flip-chip package.
+
+ properties:
+ linux,code:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: Numeric switch code associated with the event.
+
+ azoteq,use-prox:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ If present, specifies that Hall-effect sensor reporting should
+ use the device's wide-range proximity threshold instead of its
+ close-range touch threshold (default).
+
+ required:
+ - linux,code
+
+ additionalProperties: false
+
+if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - azoteq,iqs624-keys
+ - azoteq,iqs625-keys
+then:
+ patternProperties:
+ "^hall-switch-(north|south)$": false
+
+required:
+ - compatible
+ - linux,keycodes
+
+additionalProperties: false
+
+...
diff --git a/Documentation/devicetree/bindings/input/touchscreen/edt-ft5x06.txt b/Documentation/devicetree/bindings/input/touchscreen/edt-ft5x06.txt
deleted file mode 100644
index 0e57315e9cbd..000000000000
--- a/Documentation/devicetree/bindings/input/touchscreen/edt-ft5x06.txt
+++ /dev/null
@@ -1,77 +0,0 @@
-FocalTech EDT-FT5x06 Polytouch driver
-=====================================
-
-There are 5 variants of the chip for various touch panel sizes
-FT5206GE1 2.8" .. 3.8"
-FT5306DE4 4.3" .. 7"
-FT5406EE8 7" .. 8.9"
-FT5506EEG 7" .. 8.9"
-FT5726NEI 5.7” .. 11.6"
-
-The software interface is identical for all those chips, so that
-currently there is no need for the driver to distinguish between the
-different chips. Nevertheless distinct compatible strings are used so
-that a distinction can be added if necessary without changing the DT
-bindings.
-
-
-Required properties:
- - compatible: "edt,edt-ft5206"
- or: "edt,edt-ft5306"
- or: "edt,edt-ft5406"
- or: "edt,edt-ft5506"
- or: "evervision,ev-ft5726"
- or: "focaltech,ft6236"
-
- - reg: I2C slave address of the chip (0x38)
- - interrupts: interrupt specification for the touchdetect
- interrupt
-
-Optional properties:
- - reset-gpios: GPIO specification for the RESET input
- - wake-gpios: GPIO specification for the WAKE input
- - vcc-supply: Regulator that supplies the touchscreen
-
- - pinctrl-names: should be "default"
- - pinctrl-0: a phandle pointing to the pin settings for the
- control gpios
-
- - wakeup-source: If present the device will act as wakeup-source
-
- - threshold: allows setting the "click"-threshold in the range
- from 0 to 80.
-
- - gain: allows setting the sensitivity in the range from 0 to
- 31. Note that lower values indicate higher
- sensitivity.
-
- - offset: allows setting the edge compensation in the range from
- 0 to 31.
-
- - offset-x: Same as offset, but applies only to the horizontal position.
- Range from 0 to 80, only supported by evervision,ev-ft5726
- devices.
-
- - offset-y: Same as offset, but applies only to the vertical position.
- Range from 0 to 80, only supported by evervision,ev-ft5726
- devices.
-
- - touchscreen-size-x : See touchscreen.txt
- - touchscreen-size-y : See touchscreen.txt
- - touchscreen-fuzz-x : See touchscreen.txt
- - touchscreen-fuzz-y : See touchscreen.txt
- - touchscreen-inverted-x : See touchscreen.txt
- - touchscreen-inverted-y : See touchscreen.txt
- - touchscreen-swapped-x-y : See touchscreen.txt
-
-Example:
- polytouch: edt-ft5x06@38 {
- compatible = "edt,edt-ft5406", "edt,edt-ft5x06";
- reg = <0x38>;
- pinctrl-names = "default";
- pinctrl-0 = <&edt_ft5x06_pins>;
- interrupt-parent = <&gpio2>;
- interrupts = <5 IRQ_TYPE_EDGE_FALLING>;
- reset-gpios = <&gpio2 6 GPIO_ACTIVE_LOW>;
- wake-gpios = <&gpio4 9 GPIO_ACTIVE_HIGH>;
- };
diff --git a/Documentation/devicetree/bindings/input/touchscreen/edt-ft5x06.yaml b/Documentation/devicetree/bindings/input/touchscreen/edt-ft5x06.yaml
new file mode 100644
index 000000000000..8d58709d4b47
--- /dev/null
+++ b/Documentation/devicetree/bindings/input/touchscreen/edt-ft5x06.yaml
@@ -0,0 +1,125 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/input/touchscreen/edt-ft5x06.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: FocalTech EDT-FT5x06 Polytouch Bindings
+
+description: |
+ There are 5 variants of the chip for various touch panel sizes
+ FT5206GE1 2.8" .. 3.8"
+ FT5306DE4 4.3" .. 7"
+ FT5406EE8 7" .. 8.9"
+ FT5506EEG 7" .. 8.9"
+ FT5726NEI 5.7” .. 11.6"
+
+maintainers:
+ - Dmitry Torokhov <dmitry.torokhov@gmail.com>
+
+allOf:
+ - $ref: touchscreen.yaml#
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - evervision,ev-ft5726
+
+ then:
+ properties:
+ offset-x: true
+ offset-y: true
+
+properties:
+ compatible:
+ enum:
+ - edt,edt-ft5206
+ - edt,edt-ft5306
+ - edt,edt-ft5406
+ - edt,edt-ft5506
+ - evervision,ev-ft5726
+ - focaltech,ft6236
+
+ reg:
+ const: 0x38
+
+ interrupts:
+ maxItems: 1
+
+ reset-gpios:
+ maxItems: 1
+
+ wake-gpios:
+ maxItems: 1
+
+ wakeup-source: true
+
+ vcc-supply:
+ maxItems: 1
+
+ gain:
+ description: Allows setting the sensitivity in the range from 0 to 31.
+ Note that lower values indicate higher sensitivity.
+ allOf:
+ - $ref: /schemas/types.yaml#/definitions/uint32
+ - minimum: 0
+ - maximum: 31
+
+ offset:
+ description: Allows setting the edge compensation in the range from 0 to 31.
+ allOf:
+ - $ref: /schemas/types.yaml#/definitions/uint32
+ - minimum: 0
+ - maximum: 31
+
+ offset-x:
+ description: Same as offset, but applies only to the horizontal position.
+ Range from 0 to 80, only supported by evervision,ev-ft5726 devices.
+ allOf:
+ - $ref: /schemas/types.yaml#/definitions/uint32
+ - minimum: 0
+ - maximum: 80
+
+ offset-y:
+ description: Same as offset, but applies only to the vertical position.
+ Range from 0 to 80, only supported by evervision,ev-ft5726 devices.
+ allOf:
+ - $ref: /schemas/types.yaml#/definitions/uint32
+ - minimum: 0
+ - maximum: 80
+
+ touchscreen-size-x: true
+ touchscreen-size-y: true
+ touchscreen-fuzz-x: true
+ touchscreen-fuzz-y: true
+ touchscreen-inverted-x: true
+ touchscreen-inverted-y: true
+ touchscreen-swapped-x-y: true
+ interrupt-controller: true
+
+additionalProperties: false
+
+required:
+ - compatible
+ - reg
+ - interrupts
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ i2c@00000000 {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ edt-ft5x06@38 {
+ compatible = "edt,edt-ft5406";
+ reg = <0x38>;
+ interrupt-parent = <&gpio2>;
+ interrupts = <5 IRQ_TYPE_EDGE_FALLING>;
+ reset-gpios = <&gpio2 6 GPIO_ACTIVE_LOW>;
+ wake-gpios = <&gpio4 9 GPIO_ACTIVE_HIGH>;
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/input/touchscreen/goodix.yaml b/Documentation/devicetree/bindings/input/touchscreen/goodix.yaml
index c99ed3934d7e..c8ea9434c9cc 100644
--- a/Documentation/devicetree/bindings/input/touchscreen/goodix.yaml
+++ b/Documentation/devicetree/bindings/input/touchscreen/goodix.yaml
@@ -21,6 +21,8 @@ properties:
- goodix,gt911
- goodix,gt9110
- goodix,gt912
+ - goodix,gt9147
+ - goodix,gt917s
- goodix,gt927
- goodix,gt9271
- goodix,gt928
diff --git a/Documentation/devicetree/bindings/ipmi/aspeed-kcs-bmc.txt b/Documentation/devicetree/bindings/ipmi/aspeed-kcs-bmc.txt
index d98a9bf45d6c..193e71ca96b0 100644
--- a/Documentation/devicetree/bindings/ipmi/aspeed-kcs-bmc.txt
+++ b/Documentation/devicetree/bindings/ipmi/aspeed-kcs-bmc.txt
@@ -1,9 +1,10 @@
-* Aspeed KCS (Keyboard Controller Style) IPMI interface
+# Aspeed KCS (Keyboard Controller Style) IPMI interface
The Aspeed SOCs (AST2400 and AST2500) are commonly used as BMCs
(Baseboard Management Controllers) and the KCS interface can be
used to perform in-band IPMI communication with their host.
+## v1
Required properties:
- compatible : should be one of
"aspeed,ast2400-kcs-bmc"
@@ -12,14 +13,21 @@ Required properties:
- kcs_chan : The LPC channel number in the controller
- kcs_addr : The host CPU IO map address
+## v2
+Required properties:
+- compatible : should be one of
+ "aspeed,ast2400-kcs-bmc-v2"
+ "aspeed,ast2500-kcs-bmc-v2"
+- reg : The address and size of the IDR, ODR and STR registers
+- interrupts : interrupt generated by the controller
+- aspeed,lpc-io-reg : The host CPU LPC IO address for the device
Example:
- kcs3: kcs3@0 {
- compatible = "aspeed,ast2500-kcs-bmc";
- reg = <0x0 0x80>;
+ kcs3: kcs@24 {
+ compatible = "aspeed,ast2500-kcs-bmc-v2";
+ reg = <0x24 0x1>, <0x30 0x1>, <0x3c 0x1>;
+ aspeed,lpc-reg = <0xca2>;
interrupts = <8>;
- kcs_chan = <3>;
- kcs_addr = <0xCA2>;
status = "okay";
};
diff --git a/Documentation/devicetree/bindings/mfd/iqs62x.yaml b/Documentation/devicetree/bindings/mfd/iqs62x.yaml
new file mode 100644
index 000000000000..541b06d80e73
--- /dev/null
+++ b/Documentation/devicetree/bindings/mfd/iqs62x.yaml
@@ -0,0 +1,179 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/mfd/iqs62x.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Azoteq IQS620A/621/622/624/625 Multi-Function Sensors
+
+maintainers:
+ - Jeff LaBundy <jeff@labundy.com>
+
+description: |
+ The Azoteq IQS620A, IQS621, IQS622, IQS624 and IQS625 multi-function sensors
+ integrate multiple sensing technologies in a single package.
+
+ Link to datasheets: https://www.azoteq.com/
+
+properties:
+ compatible:
+ enum:
+ - azoteq,iqs620a
+ - azoteq,iqs621
+ - azoteq,iqs622
+ - azoteq,iqs624
+ - azoteq,iqs625
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ firmware-name:
+ $ref: /schemas/types.yaml#/definitions/string
+ description:
+ Specifies the name of the calibration and configuration file selected by
+ the driver. If this property is omitted, the name is chosen based on the
+ device name with ".bin" as the extension (e.g. iqs620a.bin for IQS620A).
+
+ keys:
+ $ref: ../input/iqs62x-keys.yaml
+
+ pwm:
+ $ref: ../pwm/iqs620a-pwm.yaml
+
+required:
+ - compatible
+ - reg
+ - interrupts
+
+additionalProperties: false
+
+examples:
+ - |
+ /*
+ * Dual capacitive buttons with proximity-activated function, unipolar lid
+ * switch and panel-mounted LED.
+ */
+ #include <dt-bindings/input/input.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ iqs620a@44 {
+ compatible = "azoteq,iqs620a";
+ reg = <0x44>;
+ interrupt-parent = <&gpio>;
+ interrupts = <17 IRQ_TYPE_LEVEL_LOW>;
+
+ keys {
+ compatible = "azoteq,iqs620a-keys";
+
+ linux,keycodes = <KEY_SELECT>,
+ <KEY_MENU>,
+ <KEY_OK>,
+ <KEY_MENU>;
+
+ hall-switch-south {
+ linux,code = <SW_LID>;
+ azoteq,use-prox;
+ };
+ };
+
+ iqs620a_pwm: pwm {
+ compatible = "azoteq,iqs620a-pwm";
+ #pwm-cells = <2>;
+ };
+ };
+ };
+
+ pwmleds {
+ compatible = "pwm-leds";
+
+ panel {
+ pwms = <&iqs620a_pwm 0 1000000>;
+ max-brightness = <255>;
+ };
+ };
+
+ - |
+ /* Single inductive button with bipolar dock/tablet-mode switch. */
+ #include <dt-bindings/input/input.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ iqs620a@44 {
+ compatible = "azoteq,iqs620a";
+ reg = <0x44>;
+ interrupt-parent = <&gpio>;
+ interrupts = <17 IRQ_TYPE_LEVEL_LOW>;
+
+ firmware-name = "iqs620a_coil.bin";
+
+ keys {
+ compatible = "azoteq,iqs620a-keys";
+
+ linux,keycodes = <0>,
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <KEY_MUTE>;
+
+ hall-switch-north {
+ linux,code = <SW_DOCK>;
+ };
+
+ hall-switch-south {
+ linux,code = <SW_TABLET_MODE>;
+ };
+ };
+ };
+ };
+
+ - |
+ /* Dual capacitive buttons with volume knob. */
+ #include <dt-bindings/input/input.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ iqs624@44 {
+ compatible = "azoteq,iqs624";
+ reg = <0x44>;
+ interrupt-parent = <&gpio>;
+ interrupts = <17 IRQ_TYPE_LEVEL_LOW>;
+
+ keys {
+ compatible = "azoteq,iqs624-keys";
+
+ linux,keycodes = <BTN_0>,
+ <0>,
+ <BTN_1>,
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <0>,
+ <KEY_VOLUMEUP>,
+ <KEY_VOLUMEDOWN>;
+ };
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/mfd/rn5t618.txt b/Documentation/devicetree/bindings/mfd/rn5t618.txt
index b74e5e94d1cb..16778ea00dbc 100644
--- a/Documentation/devicetree/bindings/mfd/rn5t618.txt
+++ b/Documentation/devicetree/bindings/mfd/rn5t618.txt
@@ -15,6 +15,8 @@ Required properties:
- reg: the I2C slave address of the device
Optional properties:
+ - interrupts: interrupt mapping for IRQ
+ See Documentation/devicetree/bindings/interrupt-controller/interrupts.txt
- system-power-controller:
See Documentation/devicetree/bindings/power/power-controller.txt
@@ -32,6 +34,8 @@ Example:
pmic@32 {
compatible = "ricoh,rn5t618";
reg = <0x32>;
+ interrupt-parent = <&gpio5>;
+ interrupts = <11 IRQ_TYPE_EDGE_FALLING>;
system-power-controller;
regulators {
diff --git a/Documentation/devicetree/bindings/mfd/rohm,bd71837-pmic.txt b/Documentation/devicetree/bindings/mfd/rohm,bd71837-pmic.txt
deleted file mode 100644
index f22d74c7a8db..000000000000
--- a/Documentation/devicetree/bindings/mfd/rohm,bd71837-pmic.txt
+++ /dev/null
@@ -1,90 +0,0 @@
-* ROHM BD71837 and BD71847 Power Management Integrated Circuit bindings
-
-BD71837MWV and BD71847MWV are programmable Power Management ICs for powering
-single-core, dual-core, and quad-core SoCs such as NXP-i.MX 8M. They are
-optimized for low BOM cost and compact solution footprint. BD71837MWV
-integrates 8 Buck regulators and 7 LDOs. BD71847MWV contains 6 Buck regulators
-and 6 LDOs.
-
-Datasheet for BD71837 is available at:
-https://www.rohm.com/datasheet/BD71837MWV/bd71837mwv-e
-Datasheet for BD71847 is available at:
-https://www.rohm.com/datasheet/BD71847AMWV/bd71847amwv-e
-
-Required properties:
- - compatible : Should be "rohm,bd71837" for bd71837
- "rohm,bd71847" for bd71847.
- - reg : I2C slave address.
- - interrupt-parent : Phandle to the parent interrupt controller.
- - interrupts : The interrupt line the device is connected to.
- - clocks : The parent clock connected to PMIC. If this is missing
- 32768 KHz clock is assumed.
- - #clock-cells : Should be 0.
- - regulators: : List of child nodes that specify the regulators.
- Please see ../regulator/rohm,bd71837-regulator.txt
-
-Optional properties:
-- clock-output-names : Should contain name for output clock.
-- rohm,reset-snvs-powered : Transfer BD718x7 to SNVS state at reset.
-
-The BD718x7 supports two different HW states as reset target states. States
-are called as SNVS and READY. At READY state all the PMIC power outputs go
-down and OTP is reload. At the SNVS state all other logic and external
-devices apart from the SNVS power domain are shut off. Please refer to NXP
-i.MX8 documentation for further information regarding SNVS state. When a
-reset is done via SNVS state the PMIC OTP data is not reload. This causes
-power outputs that have been under SW control to stay down when reset has
-switched power state to SNVS. If reset is done via READY state the power
-outputs will be returned to HW control by OTP loading. Thus the reset
-target state is set to READY by default. If SNVS state is used the boot
-crucial regulators must have the regulator-always-on and regulator-boot-on
-properties set in regulator node.
-
-- rohm,short-press-ms : Short press duration in milliseconds
-- rohm,long-press-ms : Long press duration in milliseconds
-
-Configure the "short press" and "long press" timers for the power button.
-Values are rounded to what hardware supports (500ms multiple for short and
-1000ms multiple for long). If these properties are not present the existing
-configuration (from bootloader or OTP) is not touched.
-
-Example:
-
- /* external oscillator node */
- osc: oscillator {
- compatible = "fixed-clock";
- #clock-cells = <1>;
- clock-frequency = <32768>;
- clock-output-names = "osc";
- };
-
- pmic: pmic@4b {
- compatible = "rohm,bd71837";
- reg = <0x4b>;
- interrupt-parent = <&gpio1>;
- interrupts = <29 GPIO_ACTIVE_LOW>;
- interrupt-names = "irq";
- #clock-cells = <0>;
- clocks = <&osc 0>;
- clock-output-names = "bd71837-32k-out";
- rohm,reset-snvs-powered;
-
- regulators {
- buck1: BUCK1 {
- regulator-name = "buck1";
- regulator-min-microvolt = <700000>;
- regulator-max-microvolt = <1300000>;
- regulator-boot-on;
- regulator-always-on;
- regulator-ramp-delay = <1250>;
- };
- // [...]
- };
- };
-
- /* Clock consumer node */
- rtc@0 {
- compatible = "company,my-rtc";
- clock-names = "my-clock";
- clocks = <&pmic>;
- };
diff --git a/Documentation/devicetree/bindings/mfd/rohm,bd71837-pmic.yaml b/Documentation/devicetree/bindings/mfd/rohm,bd71837-pmic.yaml
new file mode 100644
index 000000000000..aa922c560fcc
--- /dev/null
+++ b/Documentation/devicetree/bindings/mfd/rohm,bd71837-pmic.yaml
@@ -0,0 +1,236 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/mfd/rohm,bd71837-pmic.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ROHM BD71837 Power Management Integrated Circuit bindings
+
+maintainers:
+ - Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
+
+description: |
+ BD71837MWV is programmable Power Management ICs for powering single-core,
+ dual-core, and quad-core SoCs such as NXP-i.MX 8M. It is optimized for low
+ BOM cost and compact solution footprint. BD71837MWV integrates 8 Buck
+ regulators and 7 LDOs.
+ Datasheet for BD71837 is available at
+ https://www.rohm.com/products/power-management/power-management-ic-for-system/industrial-consumer-applications/nxp-imx/bd71837amwv-product
+
+properties:
+ compatible:
+ const: rohm,bd71837
+
+ reg:
+ description:
+ I2C slave address.
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ "#clock-cells":
+ const: 0
+
+# The BD718x7 supports two different HW states as reset target states. States
+# are called as SNVS and READY. At READY state all the PMIC power outputs go
+# down and OTP is reload. At the SNVS state all other logic and external
+# devices apart from the SNVS power domain are shut off. Please refer to NXP
+# i.MX8 documentation for further information regarding SNVS state. When a
+# reset is done via SNVS state the PMIC OTP data is not reload. This causes
+# power outputs that have been under SW control to stay down when reset has
+# switched power state to SNVS. If reset is done via READY state the power
+# outputs will be returned to HW control by OTP loading. Thus the reset
+# target state is set to READY by default. If SNVS state is used the boot
+# crucial regulators must have the regulator-always-on and regulator-boot-on
+# properties set in regulator node.
+
+ rohm,reset-snvs-powered:
+ description: |
+ Transfer PMIC to SNVS state at reset
+ type: boolean
+
+# Configure the "short press" and "long press" timers for the power button.
+# Values are rounded to what hardware supports
+# Short-press:
+# Shortest being 10ms, next 500ms and then multiple of 500ms up to 7,5s
+# Long-press:
+# Shortest being 10ms, next 1000ms and then multiple of 1000ms up to 15s
+# If these properties are not present the existing configuration (from
+# bootloader or OTP) is not touched.
+
+ rohm,short-press-ms:
+ description:
+ Short press duration in milliseconds
+ enum:
+ - 10
+ - 500
+ - 1000
+ - 1500
+ - 2000
+ - 2500
+ - 3000
+ - 3500
+ - 4000
+ - 4500
+ - 5000
+ - 5500
+ - 6000
+ - 6500
+ - 7000
+
+ rohm,long-press-ms:
+ description:
+ Long press duration in milliseconds
+ enum:
+ - 10
+ - 1000
+ - 2000
+ - 3000
+ - 4000
+ - 5000
+ - 6000
+ - 7000
+ - 8000
+ - 9000
+ - 10000
+ - 11000
+ - 12000
+ - 13000
+ - 14000
+
+ regulators:
+ $ref: ../regulator/rohm,bd71837-regulator.yaml
+ description:
+ List of child nodes that specify the regulators.
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+ - "#clock-cells"
+ - regulators
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/irq.h>
+ #include <dt-bindings/leds/common.h>
+
+ i2c {
+ pmic: pmic@4b {
+ compatible = "rohm,bd71837";
+ reg = <0x4b>;
+ interrupt-parent = <&gpio1>;
+ interrupts = <29 IRQ_TYPE_LEVEL_LOW>;
+ #clock-cells = <0>;
+ clocks = <&osc 0>;
+ rohm,reset-snvs-powered;
+ rohm,short-press-ms = <10>;
+ rohm,long-press-ms = <2000>;
+
+ regulators {
+ buck1: BUCK1 {
+ regulator-name = "buck1";
+ regulator-min-microvolt = <700000>;
+ regulator-max-microvolt = <1300000>;
+ regulator-boot-on;
+ regulator-always-on;
+ regulator-ramp-delay = <1250>;
+ rohm,dvs-run-voltage = <900000>;
+ rohm,dvs-idle-voltage = <850000>;
+ rohm,dvs-suspend-voltage = <800000>;
+ };
+ buck2: BUCK2 {
+ regulator-name = "buck2";
+ regulator-min-microvolt = <700000>;
+ regulator-max-microvolt = <1300000>;
+ regulator-boot-on;
+ regulator-always-on;
+ regulator-ramp-delay = <1250>;
+ rohm,dvs-run-voltage = <1000000>;
+ rohm,dvs-idle-voltage = <900000>;
+ };
+ buck3: BUCK3 {
+ regulator-name = "buck3";
+ regulator-min-microvolt = <700000>;
+ regulator-max-microvolt = <1300000>;
+ regulator-boot-on;
+ rohm,dvs-run-voltage = <1000000>;
+ };
+ buck4: BUCK4 {
+ regulator-name = "buck4";
+ regulator-min-microvolt = <700000>;
+ regulator-max-microvolt = <1300000>;
+ regulator-boot-on;
+ rohm,dvs-run-voltage = <1000000>;
+ };
+ buck5: BUCK5 {
+ regulator-name = "buck5";
+ regulator-min-microvolt = <700000>;
+ regulator-max-microvolt = <1350000>;
+ regulator-boot-on;
+ };
+ buck6: BUCK6 {
+ regulator-name = "buck6";
+ regulator-min-microvolt = <3000000>;
+ regulator-max-microvolt = <3300000>;
+ regulator-boot-on;
+ };
+ buck7: BUCK7 {
+ regulator-name = "buck7";
+ regulator-min-microvolt = <1605000>;
+ regulator-max-microvolt = <1995000>;
+ regulator-boot-on;
+ };
+ buck8: BUCK8 {
+ regulator-name = "buck8";
+ regulator-min-microvolt = <800000>;
+ regulator-max-microvolt = <1400000>;
+ };
+
+ ldo1: LDO1 {
+ regulator-name = "ldo1";
+ regulator-min-microvolt = <3000000>;
+ regulator-max-microvolt = <3300000>;
+ regulator-boot-on;
+ };
+ ldo2: LDO2 {
+ regulator-name = "ldo2";
+ regulator-min-microvolt = <900000>;
+ regulator-max-microvolt = <900000>;
+ regulator-boot-on;
+ };
+ ldo3: LDO3 {
+ regulator-name = "ldo3";
+ regulator-min-microvolt = <1800000>;
+ regulator-max-microvolt = <3300000>;
+ };
+ ldo4: LDO4 {
+ regulator-name = "ldo4";
+ regulator-min-microvolt = <900000>;
+ regulator-max-microvolt = <1800000>;
+ };
+ ldo5: LDO5 {
+ regulator-name = "ldo5";
+ regulator-min-microvolt = <1800000>;
+ regulator-max-microvolt = <3300000>;
+ };
+ ldo6: LDO6 {
+ regulator-name = "ldo6";
+ regulator-min-microvolt = <900000>;
+ regulator-max-microvolt = <1800000>;
+ };
+ ldo7_reg: LDO7 {
+ regulator-name = "ldo7";
+ regulator-min-microvolt = <1800000>;
+ regulator-max-microvolt = <3300000>;
+ };
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/mfd/rohm,bd71847-pmic.yaml b/Documentation/devicetree/bindings/mfd/rohm,bd71847-pmic.yaml
new file mode 100644
index 000000000000..402e40dfe0b8
--- /dev/null
+++ b/Documentation/devicetree/bindings/mfd/rohm,bd71847-pmic.yaml
@@ -0,0 +1,222 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/mfd/rohm,bd71847-pmic.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ROHM BD71847 and BD71850 Power Management Integrated Circuit bindings
+
+maintainers:
+ - Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
+
+description: |
+ BD71847AMWV and BD71850MWV are programmable Power Management ICs for powering
+ single-core, dual-core, and quad-core SoCs such as NXP-i.MX 8M. It is
+ optimized for low BOM cost and compact solution footprint. BD71847MWV and
+ BD71850MWV integrate 6 Buck regulators and 6 LDOs.
+ Datasheets are available at
+ https://www.rohm.com/products/power-management/power-management-ic-for-system/industrial-consumer-applications/nxp-imx/bd71847amwv-product
+ https://www.rohm.com/products/power-management/power-management-ic-for-system/industrial-consumer-applications/nxp-imx/bd71850mwv-product
+
+properties:
+ compatible:
+ enum:
+ - rohm,bd71847
+ - rohm,bd71850
+
+ reg:
+ description:
+ I2C slave address.
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ "#clock-cells":
+ const: 0
+
+# The BD71847 abd BD71850 support two different HW states as reset target
+# states. States are called as SNVS and READY. At READY state all the PMIC
+# power outputs go down and OTP is reload. At the SNVS state all other logic
+# and external devices apart from the SNVS power domain are shut off. Please
+# refer to NXP i.MX8 documentation for further information regarding SNVS
+# state. When a reset is done via SNVS state the PMIC OTP data is not reload.
+# This causes power outputs that have been under SW control to stay down when
+# reset has switched power state to SNVS. If reset is done via READY state the
+# power outputs will be returned to HW control by OTP loading. Thus the reset
+# target state is set to READY by default. If SNVS state is used the boot
+# crucial regulators must have the regulator-always-on and regulator-boot-on
+# properties set in regulator node.
+
+ rohm,reset-snvs-powered:
+ description:
+ Transfer PMIC to SNVS state at reset.
+ type: boolean
+
+# Configure the "short press" and "long press" timers for the power button.
+# Values are rounded to what hardware supports
+# Short-press:
+# Shortest being 10ms, next 500ms and then multiple of 500ms up to 7,5s
+# Long-press:
+# Shortest being 10ms, next 1000ms and then multiple of 1000ms up to 15s
+# If these properties are not present the existing # configuration (from
+# bootloader or OTP) is not touched.
+
+ rohm,short-press-ms:
+ description:
+ Short press duration in milliseconds
+ enum:
+ - 10
+ - 500
+ - 1000
+ - 1500
+ - 2000
+ - 2500
+ - 3000
+ - 3500
+ - 4000
+ - 4500
+ - 5000
+ - 5500
+ - 6000
+ - 6500
+ - 7000
+ - 7500
+
+ rohm,long-press-ms:
+ description:
+ Long press duration in milliseconds
+ enum:
+ - 10
+ - 1000
+ - 2000
+ - 3000
+ - 4000
+ - 5000
+ - 6000
+ - 7000
+ - 8000
+ - 9000
+ - 10000
+ - 11000
+ - 12000
+ - 13000
+ - 14000
+ - 15000
+
+ regulators:
+ $ref: ../regulator/rohm,bd71847-regulator.yaml
+ description:
+ List of child nodes that specify the regulators.
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+ - "#clock-cells"
+ - regulators
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/irq.h>
+ #include <dt-bindings/leds/common.h>
+
+ i2c {
+ pmic: pmic@4b {
+ compatible = "rohm,bd71847";
+ reg = <0x4b>;
+ interrupt-parent = <&gpio1>;
+ interrupts = <29 IRQ_TYPE_LEVEL_LOW>;
+ #clock-cells = <0>;
+ clocks = <&osc 0>;
+ rohm,reset-snvs-powered;
+ rohm,short-press-ms = <10>;
+ rohm,long-press-ms = <2000>;
+
+ regulators {
+ buck1: BUCK1 {
+ regulator-name = "buck1";
+ regulator-min-microvolt = <700000>;
+ regulator-max-microvolt = <1300000>;
+ regulator-boot-on;
+ regulator-always-on;
+ regulator-ramp-delay = <1250>;
+ rohm,dvs-run-voltage = <900000>;
+ rohm,dvs-idle-voltage = <850000>;
+ rohm,dvs-suspend-voltage = <800000>;
+ };
+ buck2: BUCK2 {
+ regulator-name = "buck2";
+ regulator-min-microvolt = <700000>;
+ regulator-max-microvolt = <1300000>;
+ regulator-boot-on;
+ regulator-always-on;
+ regulator-ramp-delay = <1250>;
+ rohm,dvs-run-voltage = <1000000>;
+ rohm,dvs-idle-voltage = <900000>;
+ };
+ buck3: BUCK3 {
+ regulator-name = "buck3";
+ regulator-min-microvolt = <550000>;
+ regulator-max-microvolt = <1350000>;
+ regulator-boot-on;
+ };
+ buck4: BUCK4 {
+ regulator-name = "buck4";
+ regulator-min-microvolt = <2600000>;
+ regulator-max-microvolt = <3300000>;
+ regulator-boot-on;
+ };
+ buck5: BUCK5 {
+ regulator-name = "buck5";
+ regulator-min-microvolt = <1605000>;
+ regulator-max-microvolt = <1995000>;
+ regulator-boot-on;
+ };
+ buck8: BUCK6 {
+ regulator-name = "buck6";
+ regulator-min-microvolt = <800000>;
+ regulator-max-microvolt = <1400000>;
+ };
+
+ ldo1: LDO1 {
+ regulator-name = "ldo1";
+ regulator-min-microvolt = <1600000>;
+ regulator-max-microvolt = <3300000>;
+ regulator-boot-on;
+ };
+ ldo2: LDO2 {
+ regulator-name = "ldo2";
+ regulator-min-microvolt = <800000>;
+ regulator-max-microvolt = <900000>;
+ regulator-boot-on;
+ };
+ ldo3: LDO3 {
+ regulator-name = "ldo3";
+ regulator-min-microvolt = <1800000>;
+ regulator-max-microvolt = <3300000>;
+ };
+ ldo4: LDO4 {
+ regulator-name = "ldo4";
+ regulator-min-microvolt = <900000>;
+ regulator-max-microvolt = <1800000>;
+ };
+ ldo5: LDO5 {
+ regulator-name = "ldo5";
+ regulator-min-microvolt = <800000>;
+ regulator-max-microvolt = <3300000>;
+ };
+ ldo6: LDO6 {
+ regulator-name = "ldo6";
+ regulator-min-microvolt = <900000>;
+ regulator-max-microvolt = <1800000>;
+ };
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/mfd/st,stm32-lptimer.yaml b/Documentation/devicetree/bindings/mfd/st,stm32-lptimer.yaml
index 1a4cc5f3fb33..ddf190cb800b 100644
--- a/Documentation/devicetree/bindings/mfd/st,stm32-lptimer.yaml
+++ b/Documentation/devicetree/bindings/mfd/st,stm32-lptimer.yaml
@@ -39,6 +39,8 @@ properties:
"#size-cells":
const: 0
+ wakeup-source: true
+
pwm:
type: object
@@ -81,6 +83,16 @@ patternProperties:
required:
- compatible
+ timer:
+ type: object
+
+ properties:
+ compatible:
+ const: st,stm32-lptimer-timer
+
+ required:
+ - compatible
+
required:
- "#address-cells"
- "#size-cells"
@@ -115,6 +127,10 @@ examples:
counter {
compatible = "st,stm32-lptimer-counter";
};
+
+ timer {
+ compatible = "st,stm32-lptimer-timer";
+ };
};
...
diff --git a/Documentation/devicetree/bindings/net/marvell,mvusb.yaml b/Documentation/devicetree/bindings/net/marvell,mvusb.yaml
index 9458f6659be1..68573762294b 100644
--- a/Documentation/devicetree/bindings/net/marvell,mvusb.yaml
+++ b/Documentation/devicetree/bindings/net/marvell,mvusb.yaml
@@ -38,28 +38,27 @@ required:
examples:
- |
/* USB host controller */
- &usb1 {
- mvusb: mdio@1 {
+ usb {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ mdio@1 {
compatible = "usb1286,1fa4";
reg = <1>;
#address-cells = <1>;
#size-cells = <0>;
- };
- };
- /* MV88E6390X devboard */
- &mvusb {
- switch@0 {
- compatible = "marvell,mv88e6190";
- status = "ok";
- reg = <0x0>;
+ switch@0 {
+ compatible = "marvell,mv88e6190";
+ reg = <0x0>;
- ports {
- /* Port definitions */
- };
+ ports {
+ /* Port definitions */
+ };
- mdio {
- /* PHY definitions */
+ mdio {
+ /* PHY definitions */
+ };
};
};
};
diff --git a/Documentation/devicetree/bindings/pwm/google,cros-ec-pwm.txt b/Documentation/devicetree/bindings/pwm/google,cros-ec-pwm.txt
deleted file mode 100644
index 472bd46ab5a4..000000000000
--- a/Documentation/devicetree/bindings/pwm/google,cros-ec-pwm.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-* PWM controlled by ChromeOS EC
-
-Google's ChromeOS EC PWM is a simple PWM attached to the Embedded Controller
-(EC) and controlled via a host-command interface.
-
-An EC PWM node should be only found as a sub-node of the EC node (see
-Documentation/devicetree/bindings/mfd/cros-ec.txt).
-
-Required properties:
-- compatible: Must contain "google,cros-ec-pwm"
-- #pwm-cells: Should be 1. The cell specifies the PWM index.
-
-Example:
- cros-ec@0 {
- compatible = "google,cros-ec-spi";
-
- ...
-
- cros_ec_pwm: ec-pwm {
- compatible = "google,cros-ec-pwm";
- #pwm-cells = <1>;
- };
- };
diff --git a/Documentation/devicetree/bindings/pwm/google,cros-ec-pwm.yaml b/Documentation/devicetree/bindings/pwm/google,cros-ec-pwm.yaml
new file mode 100644
index 000000000000..24c217b76580
--- /dev/null
+++ b/Documentation/devicetree/bindings/pwm/google,cros-ec-pwm.yaml
@@ -0,0 +1,40 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/pwm/google,cros-ec-pwm.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: PWM controlled by ChromeOS EC
+
+maintainers:
+ - Thierry Reding <thierry.reding@gmail.com>
+ - '"Uwe Kleine-König" <u.kleine-koenig@pengutronix.de>'
+
+description: |
+ Google's ChromeOS EC PWM is a simple PWM attached to the Embedded Controller
+ (EC) and controlled via a host-command interface.
+ An EC PWM node should be only found as a sub-node of the EC node (see
+ Documentation/devicetree/bindings/mfd/cros-ec.txt).
+
+properties:
+ compatible:
+ const: google,cros-ec-pwm
+ "#pwm-cells":
+ description: The cell specifies the PWM index.
+ const: 1
+
+required:
+ - compatible
+ - '#pwm-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ cros-ec@0 {
+ compatible = "google,cros-ec-spi";
+ cros_ec_pwm: ec-pwm {
+ compatible = "google,cros-ec-pwm";
+ #pwm-cells = <1>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/pwm/iqs620a-pwm.yaml b/Documentation/devicetree/bindings/pwm/iqs620a-pwm.yaml
new file mode 100644
index 000000000000..1d7c27be50da
--- /dev/null
+++ b/Documentation/devicetree/bindings/pwm/iqs620a-pwm.yaml
@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/pwm/iqs620a-pwm.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Azoteq IQS620A PWM Generator
+
+maintainers:
+ - Jeff LaBundy <jeff@labundy.com>
+
+description: |
+ The Azoteq IQS620A multi-function sensor generates a fixed-frequency PWM
+ output represented by a "pwm" child node from the parent MFD driver. See
+ Documentation/devicetree/bindings/mfd/iqs62x.yaml for further details as
+ well as an example.
+
+properties:
+ compatible:
+ enum:
+ - azoteq,iqs620a-pwm
+
+ "#pwm-cells":
+ const: 2
+
+required:
+ - compatible
+ - "#pwm-cells"
+
+additionalProperties: false
+
+...
diff --git a/Documentation/devicetree/bindings/pwm/nvidia,tegra20-pwm.txt b/Documentation/devicetree/bindings/pwm/nvidia,tegra20-pwm.txt
index 0a69eadf44ce..74c41e34c3b6 100644
--- a/Documentation/devicetree/bindings/pwm/nvidia,tegra20-pwm.txt
+++ b/Documentation/devicetree/bindings/pwm/nvidia,tegra20-pwm.txt
@@ -9,6 +9,7 @@ Required properties:
- "nvidia,tegra132-pwm", "nvidia,tegra20-pwm": for Tegra132
- "nvidia,tegra210-pwm", "nvidia,tegra20-pwm": for Tegra210
- "nvidia,tegra186-pwm": for Tegra186
+ - "nvidia,tegra194-pwm": for Tegra194
- reg: physical base address and length of the controller's registers
- #pwm-cells: should be 2. See pwm.yaml in this directory for a description of
the cells format.
diff --git a/Documentation/devicetree/bindings/regulator/rohm,bd71837-regulator.txt b/Documentation/devicetree/bindings/regulator/rohm,bd71837-regulator.txt
deleted file mode 100644
index cbce62c22b60..000000000000
--- a/Documentation/devicetree/bindings/regulator/rohm,bd71837-regulator.txt
+++ /dev/null
@@ -1,162 +0,0 @@
-ROHM BD71837 and BD71847 Power Management Integrated Circuit regulator bindings
-
-Required properties:
- - regulator-name: should be "buck1", ..., "buck8" and "ldo1", ..., "ldo7" for
- BD71837. For BD71847 names should be "buck1", ..., "buck6"
- and "ldo1", ..., "ldo6"
-
-List of regulators provided by this controller. BD71837 regulators node
-should be sub node of the BD71837 MFD node. See BD71837 MFD bindings at
-Documentation/devicetree/bindings/mfd/rohm,bd71837-pmic.txt
-Regulator nodes should be named to BUCK_<number> and LDO_<number>. The
-definition for each of these nodes is defined using the standard
-binding for regulators at
-Documentation/devicetree/bindings/regulator/regulator.txt.
-Note that if BD71837 starts at RUN state you probably want to use
-regulator-boot-on at least for BUCK6 and BUCK7 so that those are not
-disabled by driver at startup. LDO5 and LDO6 are supplied by those and
-if they are disabled at startup the voltage monitoring for LDO5/LDO6 will
-cause PMIC to reset.
-
-The valid names for BD71837 regulator nodes are:
-BUCK1, BUCK2, BUCK3, BUCK4, BUCK5, BUCK6, BUCK7, BUCK8
-LDO1, LDO2, LDO3, LDO4, LDO5, LDO6, LDO7
-
-The valid names for BD71847 regulator nodes are:
-BUCK1, BUCK2, BUCK3, BUCK4, BUCK5, BUCK6
-LDO1, LDO2, LDO3, LDO4, LDO5, LDO6
-
-Optional properties:
-- rohm,dvs-run-voltage : PMIC default "RUN" state voltage in uV.
- See below table for bucks which support this.
-- rohm,dvs-idle-voltage : PMIC default "IDLE" state voltage in uV.
- See below table for bucks which support this.
-- rohm,dvs-suspend-voltage : PMIC default "SUSPEND" state voltage in uV.
- See below table for bucks which support this.
-- Any optional property defined in bindings/regulator/regulator.txt
-
-Supported default DVS states:
-
-BD71837:
-buck | dvs-run-voltage | dvs-idle-voltage | dvs-suspend-voltage
------------------------------------------------------------------------------
-1 | supported | supported | supported
-----------------------------------------------------------------------------
-2 | supported | supported | not supported
-----------------------------------------------------------------------------
-3 | supported | not supported | not supported
-----------------------------------------------------------------------------
-4 | supported | not supported | not supported
-----------------------------------------------------------------------------
-rest | not supported | not supported | not supported
-
-BD71847:
-buck | dvs-run-voltage | dvs-idle-voltage | dvs-suspend-voltage
------------------------------------------------------------------------------
-1 | supported | supported | supported
-----------------------------------------------------------------------------
-2 | supported | supported | not supported
-----------------------------------------------------------------------------
-rest | not supported | not supported | not supported
-
-Example:
-regulators {
- buck1: BUCK1 {
- regulator-name = "buck1";
- regulator-min-microvolt = <700000>;
- regulator-max-microvolt = <1300000>;
- regulator-boot-on;
- regulator-always-on;
- regulator-ramp-delay = <1250>;
- rohm,dvs-run-voltage = <900000>;
- rohm,dvs-idle-voltage = <850000>;
- rohm,dvs-suspend-voltage = <800000>;
- };
- buck2: BUCK2 {
- regulator-name = "buck2";
- regulator-min-microvolt = <700000>;
- regulator-max-microvolt = <1300000>;
- regulator-boot-on;
- regulator-always-on;
- regulator-ramp-delay = <1250>;
- rohm,dvs-run-voltage = <1000000>;
- rohm,dvs-idle-voltage = <900000>;
- };
- buck3: BUCK3 {
- regulator-name = "buck3";
- regulator-min-microvolt = <700000>;
- regulator-max-microvolt = <1300000>;
- regulator-boot-on;
- rohm,dvs-run-voltage = <1000000>;
- };
- buck4: BUCK4 {
- regulator-name = "buck4";
- regulator-min-microvolt = <700000>;
- regulator-max-microvolt = <1300000>;
- regulator-boot-on;
- rohm,dvs-run-voltage = <1000000>;
- };
- buck5: BUCK5 {
- regulator-name = "buck5";
- regulator-min-microvolt = <700000>;
- regulator-max-microvolt = <1350000>;
- regulator-boot-on;
- };
- buck6: BUCK6 {
- regulator-name = "buck6";
- regulator-min-microvolt = <3000000>;
- regulator-max-microvolt = <3300000>;
- regulator-boot-on;
- };
- buck7: BUCK7 {
- regulator-name = "buck7";
- regulator-min-microvolt = <1605000>;
- regulator-max-microvolt = <1995000>;
- regulator-boot-on;
- };
- buck8: BUCK8 {
- regulator-name = "buck8";
- regulator-min-microvolt = <800000>;
- regulator-max-microvolt = <1400000>;
- };
-
- ldo1: LDO1 {
- regulator-name = "ldo1";
- regulator-min-microvolt = <3000000>;
- regulator-max-microvolt = <3300000>;
- regulator-boot-on;
- };
- ldo2: LDO2 {
- regulator-name = "ldo2";
- regulator-min-microvolt = <900000>;
- regulator-max-microvolt = <900000>;
- regulator-boot-on;
- };
- ldo3: LDO3 {
- regulator-name = "ldo3";
- regulator-min-microvolt = <1800000>;
- regulator-max-microvolt = <3300000>;
- };
- ldo4: LDO4 {
- regulator-name = "ldo4";
- regulator-min-microvolt = <900000>;
- regulator-max-microvolt = <1800000>;
- };
- ldo5: LDO5 {
- regulator-name = "ldo5";
- regulator-min-microvolt = <1800000>;
- regulator-max-microvolt = <3300000>;
- };
- ldo6: LDO6 {
- regulator-name = "ldo6";
- regulator-min-microvolt = <900000>;
- regulator-max-microvolt = <1800000>;
- };
- ldo7_reg: LDO7 {
- regulator-name = "ldo7";
- regulator-min-microvolt = <1800000>;
- regulator-max-microvolt = <3300000>;
- };
-};
-
-
diff --git a/Documentation/devicetree/bindings/regulator/rohm,bd71837-regulator.yaml b/Documentation/devicetree/bindings/regulator/rohm,bd71837-regulator.yaml
new file mode 100644
index 000000000000..a323b1696eee
--- /dev/null
+++ b/Documentation/devicetree/bindings/regulator/rohm,bd71837-regulator.yaml
@@ -0,0 +1,103 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/regulator/rohm,bd71837-regulator.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ROHM BD71837 Power Management Integrated Circuit regulators
+
+maintainers:
+ - Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
+
+description: |
+ List of regulators provided by this controller. BD71837 regulators node
+ should be sub node of the BD71837 MFD node. See BD71837 MFD bindings at
+ Documentation/devicetree/bindings/mfd/rohm,bd71837-pmic.yaml
+ Regulator nodes should be named to BUCK_<number> and LDO_<number>. The
+ definition for each of these nodes is defined using the standard
+ binding for regulators at
+ Documentation/devicetree/bindings/regulator/regulator.txt.
+ Note that if BD71837 starts at RUN state you probably want to use
+ regulator-boot-on at least for BUCK6 and BUCK7 so that those are not
+ disabled by driver at startup. LDO5 and LDO6 are supplied by those and
+ if they are disabled at startup the voltage monitoring for LDO5/LDO6 will
+ cause PMIC to reset.
+
+#The valid names for BD71837 regulator nodes are:
+#BUCK1, BUCK2, BUCK3, BUCK4, BUCK5, BUCK6, BUCK7, BUCK8
+#LDO1, LDO2, LDO3, LDO4, LDO5, LDO6, LDO7
+
+patternProperties:
+ "^LDO[1-7]$":
+ type: object
+ allOf:
+ - $ref: regulator.yaml#
+ description:
+ Properties for single LDO regulator.
+
+ properties:
+ regulator-name:
+ pattern: "^ldo[1-7]$"
+ description:
+ should be "ldo1", ..., "ldo7"
+
+ "^BUCK[1-8]$":
+ type: object
+ allOf:
+ - $ref: regulator.yaml#
+ description:
+ Properties for single BUCK regulator.
+
+ properties:
+ regulator-name:
+ pattern: "^buck[1-8]$"
+ description:
+ should be "buck1", ..., "buck8"
+
+ rohm,dvs-run-voltage:
+ allOf:
+ - $ref: "/schemas/types.yaml#/definitions/uint32"
+ - minimum: 0
+ maximum: 1300000
+ description:
+ PMIC default "RUN" state voltage in uV. See below table for
+ bucks which support this. 0 means disabled.
+
+ rohm,dvs-idle-voltage:
+ allOf:
+ - $ref: "/schemas/types.yaml#/definitions/uint32"
+ - minimum: 0
+ maximum: 1300000
+ description:
+ PMIC default "IDLE" state voltage in uV. See below table for
+ bucks which support this. 0 means disabled.
+
+ rohm,dvs-suspend-voltage:
+ allOf:
+ - $ref: "/schemas/types.yaml#/definitions/uint32"
+ - minimum: 0
+ maximum: 1300000
+ description:
+ PMIC default "SUSPEND" state voltage in uV. See below table for
+ bucks which support this. 0 means disabled.
+
+ # Supported default DVS states:
+ #
+ # BD71837:
+ # buck | dvs-run-voltage | dvs-idle-voltage | dvs-suspend-voltage
+ # ----------------------------------------------------------------
+ # 1 | supported | supported | supported
+ # ----------------------------------------------------------------
+ # 2 | supported | supported | not supported
+ # ----------------------------------------------------------------
+ # 3 | supported | not supported | not supported
+ # ----------------------------------------------------------------
+ # 4 | supported | not supported | not supported
+ # ----------------------------------------------------------------
+ # rest | not supported | not supported | not supported
+
+
+ required:
+ - regulator-name
+ additionalProperties: false
+additionalProperties: false
diff --git a/Documentation/devicetree/bindings/regulator/rohm,bd71847-regulator.yaml b/Documentation/devicetree/bindings/regulator/rohm,bd71847-regulator.yaml
new file mode 100644
index 000000000000..526fd00bcb16
--- /dev/null
+++ b/Documentation/devicetree/bindings/regulator/rohm,bd71847-regulator.yaml
@@ -0,0 +1,97 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/regulator/rohm,bd71847-regulator.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ROHM BD71847 and BD71850 Power Management Integrated Circuit regulators
+
+maintainers:
+ - Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
+
+description: |
+ List of regulators provided by this controller. BD71847 regulators node
+ should be sub node of the BD71847 MFD node. See BD71847 MFD bindings at
+ Documentation/devicetree/bindings/mfd/rohm,bd71847-pmic.yaml
+ Regulator nodes should be named to BUCK_<number> and LDO_<number>. The
+ definition for each of these nodes is defined using the standard
+ binding for regulators at
+ Documentation/devicetree/bindings/regulator/regulator.txt.
+ Note that if BD71847 starts at RUN state you probably want to use
+ regulator-boot-on at least for BUCK5. LDO6 is supplied by it and it must
+ not be disabled by driver at startup. If BUCK5 is disabled at startup the
+ voltage monitoring for LDO5/LDO6 can cause PMIC to reset.
+
+#The valid names for BD71847 regulator nodes are:
+#BUCK1, BUCK2, BUCK3, BUCK4, BUCK5, BUCK6
+#LDO1, LDO2, LDO3, LDO4, LDO5, LDO6
+
+patternProperties:
+ "^LDO[1-6]$":
+ type: object
+ allOf:
+ - $ref: regulator.yaml#
+ description:
+ Properties for single LDO regulator.
+
+ properties:
+ regulator-name:
+ pattern: "^ldo[1-6]$"
+ description:
+ should be "ldo1", ..., "ldo6"
+
+ "^BUCK[1-6]$":
+ type: object
+ allOf:
+ - $ref: regulator.yaml#
+ description:
+ Properties for single BUCK regulator.
+
+ properties:
+ regulator-name:
+ pattern: "^buck[1-6]$"
+ description:
+ should be "buck1", ..., "buck6"
+
+ rohm,dvs-run-voltage:
+ allOf:
+ - $ref: "/schemas/types.yaml#/definitions/uint32"
+ - minimum: 0
+ maximum: 1300000
+ description:
+ PMIC default "RUN" state voltage in uV. See below table for
+ bucks which support this. 0 means disabled.
+
+ rohm,dvs-idle-voltage:
+ allOf:
+ - $ref: "/schemas/types.yaml#/definitions/uint32"
+ - minimum: 0
+ maximum: 1300000
+ description:
+ PMIC default "IDLE" state voltage in uV. See below table for
+ bucks which support this. 0 means disabled.
+
+ rohm,dvs-suspend-voltage:
+ allOf:
+ - $ref: "/schemas/types.yaml#/definitions/uint32"
+ - minimum: 0
+ maximum: 1300000
+ description:
+ PMIC default "SUSPEND" state voltage in uV. See below table for
+ bucks which support this. 0 means disabled.
+
+ # Supported default DVS states:
+ #
+ # BD71847:
+ # buck | dvs-run-voltage | dvs-idle-voltage | dvs-suspend-voltage
+ # ----------------------------------------------------------------
+ # 1 | supported | supported | supported
+ # ----------------------------------------------------------------
+ # 2 | supported | supported | not supported
+ # ----------------------------------------------------------------
+ # rest | not supported | not supported | not supported
+
+ required:
+ - regulator-name
+ additionalProperties: false
+additionalProperties: false
diff --git a/Documentation/devicetree/bindings/sound/cirrus,cs42l51.yaml b/Documentation/devicetree/bindings/sound/cirrus,cs42l51.yaml
index efce847a3408..83f44f07ac3f 100644
--- a/Documentation/devicetree/bindings/sound/cirrus,cs42l51.yaml
+++ b/Documentation/devicetree/bindings/sound/cirrus,cs42l51.yaml
@@ -49,7 +49,7 @@ required:
examples:
- |
#include <dt-bindings/gpio/gpio.h>
- i2c@0 {
+ i2c {
#address-cells = <1>;
#size-cells = <0>;
diff --git a/Documentation/devicetree/bindings/thermal/imx8mm-thermal.txt b/Documentation/devicetree/bindings/thermal/imx8mm-thermal.txt
new file mode 100644
index 000000000000..3629d3c7e76a
--- /dev/null
+++ b/Documentation/devicetree/bindings/thermal/imx8mm-thermal.txt
@@ -0,0 +1,15 @@
+* Thermal Monitoring Unit (TMU) on Freescale i.MX8MM SoC
+
+Required properties:
+- compatible : Must be "fsl,imx8mm-tmu" or "fsl,imx8mp-tmu".
+- reg : Address range of TMU registers.
+- clocks : TMU's clock source.
+- #thermal-sensor-cells : Should be 0 or 1. See ./thermal.txt for a description.
+
+Example:
+tmu: tmu@30260000 {
+ compatible = "fsl,imx8mm-tmu";
+ reg = <0x30260000 0x10000>;
+ clocks = <&clk IMX8MM_CLK_TMU_ROOT>;
+ #thermal-sensor-cells = <0>;
+};
diff --git a/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml b/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml
index a57b76ad7dea..2ddd39d96766 100644
--- a/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml
+++ b/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml
@@ -38,11 +38,11 @@ properties:
- enum:
- qcom,msm8996-tsens
- qcom,msm8998-tsens
+ - qcom,sc7180-tsens
- qcom,sdm845-tsens
- const: qcom,tsens-v2
reg:
- maxItems: 2
items:
- description: TM registers
- description: SROT registers
diff --git a/Documentation/devicetree/bindings/thermal/rcar-gen3-thermal.txt b/Documentation/devicetree/bindings/thermal/rcar-gen3-thermal.txt
index 12c740b975f7..2993fa720195 100644
--- a/Documentation/devicetree/bindings/thermal/rcar-gen3-thermal.txt
+++ b/Documentation/devicetree/bindings/thermal/rcar-gen3-thermal.txt
@@ -11,6 +11,7 @@ Required properties:
- "renesas,r8a774b1-thermal" (RZ/G2N)
- "renesas,r8a7795-thermal" (R-Car H3)
- "renesas,r8a7796-thermal" (R-Car M3-W)
+ - "renesas,r8a77961-thermal" (R-Car M3-W+)
- "renesas,r8a77965-thermal" (R-Car M3-N)
- "renesas,r8a77980-thermal" (R-Car V3H)
- reg : Address ranges of the thermal registers. Each sensor
diff --git a/Documentation/devicetree/bindings/thermal/sprd-thermal.yaml b/Documentation/devicetree/bindings/thermal/sprd-thermal.yaml
new file mode 100644
index 000000000000..058c4cc06ba6
--- /dev/null
+++ b/Documentation/devicetree/bindings/thermal/sprd-thermal.yaml
@@ -0,0 +1,107 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/thermal/sprd-thermal.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Spreadtrum thermal sensor controller bindings
+
+maintainers:
+ - Orson Zhai <orsonzhai@gmail.com>
+ - Baolin Wang <baolin.wang7@gmail.com>
+ - Chunyan Zhang <zhang.lyra@gmail.com>
+
+properties:
+ compatible:
+ const: sprd,ums512-thermal
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ items:
+ - const: enable
+
+ nvmem-cells:
+ maxItems: 2
+ description:
+ Reference to nvmem nodes for the calibration data.
+
+ nvmem-cell-names:
+ items:
+ - const: thm_sign_cal
+ - const: thm_ratio_cal
+
+ "#thermal-sensor-cells":
+ const: 1
+
+ "#address-cells":
+ const: 1
+
+ "#size-cells":
+ const: 0
+
+patternProperties:
+ "^([a-z]*-)?sensor(-section)?@[0-9]+$":
+ type: object
+ description:
+ Represent one thermal sensor.
+
+ properties:
+ reg:
+ description: Specify the sensor id.
+ maxItems: 1
+
+ nvmem-cells:
+ maxItems: 1
+ description:
+ Reference to an nvmem node for the calibration data.
+
+ nvmem-cell-names:
+ const: sen_delta_cal
+
+ required:
+ - reg
+ - nvmem-cells
+ - nvmem-cell-names
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - nvmem-cells
+ - nvmem-cell-names
+ - "#thermal-sensor-cells"
+ - "#address-cells"
+ - "#size-cells"
+
+examples:
+ - |
+ ap_thm0: thermal@32200000 {
+ compatible = "sprd,ums512-thermal";
+ reg = <0 0x32200000 0 0x10000>;
+ clock-names = "enable";
+ clocks = <&aonapb_gate 32>;
+ #thermal-sensor-cells = <1>;
+ nvmem-cells = <&thm0_sign>, <&thm0_ratio>;
+ nvmem-cell-names = "thm_sign_cal", "thm_ratio_cal";
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ prometheus-sensor@0 {
+ reg = <0>;
+ nvmem-cells = <&thm0_sen0>;
+ nvmem-cell-names = "sen_delta_cal";
+ };
+
+ ank-sensor@1 {
+ reg = <1>;
+ nvmem-cells = <&thm0_sen1>;
+ nvmem-cell-names = "sen_delta_cal";
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/thermal/thermal.txt b/Documentation/devicetree/bindings/thermal/thermal.txt
index ca14ba959e0d..f78bec19ca35 100644
--- a/Documentation/devicetree/bindings/thermal/thermal.txt
+++ b/Documentation/devicetree/bindings/thermal/thermal.txt
@@ -142,11 +142,11 @@ Required properties:
- trips: A sub-node which is a container of only trip point nodes
Type: sub-node required to describe the thermal zone.
+Optional property:
- cooling-maps: A sub-node which is a container of only cooling device
Type: sub-node map nodes, used to describe the relation between trips
and cooling devices.
-Optional property:
- coefficients: An array of integers (one signed cell) containing
Type: array coefficients to compose a linear relation between
Elem size: one cell the sensors listed in the thermal-sensors property.
diff --git a/Documentation/devicetree/bindings/watchdog/ti,rti-wdt.yaml b/Documentation/devicetree/bindings/watchdog/ti,rti-wdt.yaml
new file mode 100644
index 000000000000..e83026fef2e9
--- /dev/null
+++ b/Documentation/devicetree/bindings/watchdog/ti,rti-wdt.yaml
@@ -0,0 +1,65 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/watchdog/ti,rti-wdt.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Texas Instruments K3 SoC Watchdog Timer
+
+maintainers:
+ - Tero Kristo <t-kristo@ti.com>
+
+description:
+ The TI K3 SoC watchdog timer is implemented via the RTI (Real Time
+ Interrupt) IP module. This timer adds a support for windowed watchdog
+ mode, which will signal an error if it is pinged outside the watchdog
+ time window, meaning either too early or too late. The error signal
+ generated can be routed to either interrupt a safety controller or
+ to directly reset the SoC.
+
+allOf:
+ - $ref: "watchdog.yaml#"
+
+properties:
+ compatible:
+ enum:
+ - ti,j7-rti-wdt
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ power-domains:
+ maxItems: 1
+
+ assigned-clocks:
+ maxItems: 1
+
+ assigned-clocks-parents:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - power-domains
+
+examples:
+ - |
+ /*
+ * RTI WDT in main domain on J721e SoC. Assigned clocks are used to
+ * select the source clock for the watchdog, forcing it to tick with
+ * a 32kHz clock in this case.
+ */
+ #include <dt-bindings/soc/ti,sci_pm_domain.h>
+
+ watchdog0: rti@2200000 {
+ compatible = "ti,rti-wdt";
+ reg = <0x0 0x2200000 0x0 0x100>;
+ clocks = <&k3_clks 252 1>;
+ power-domains = <&k3_pds 252 TI_SCI_PD_EXCLUSIVE>;
+ assigned-clocks = <&k3_clks 252 1>;
+ assigned-clock-parents = <&k3_clks 252 5>;
+ };
diff --git a/Documentation/driver-api/thermal/cpu-idle-cooling.rst b/Documentation/driver-api/thermal/cpu-idle-cooling.rst
index 9f0016ee4cfb..a1c3edecae00 100644
--- a/Documentation/driver-api/thermal/cpu-idle-cooling.rst
+++ b/Documentation/driver-api/thermal/cpu-idle-cooling.rst
@@ -105,8 +105,8 @@ and this variation will modulate the cooling effect.
idle <-------------->
running
- <----------------------------->
- duty cycle 33%
+ <--------------------->
+ duty cycle 33%
^
diff --git a/Documentation/driver-api/usb/writing_usb_driver.rst b/Documentation/driver-api/usb/writing_usb_driver.rst
index 4fe1c06b6a13..0b3d9ff221bb 100644
--- a/Documentation/driver-api/usb/writing_usb_driver.rst
+++ b/Documentation/driver-api/usb/writing_usb_driver.rst
@@ -314,11 +314,8 @@ http://www.linux-usb.org/
Linux Hotplug Project:
http://linux-hotplug.sourceforge.net/
-Linux USB Working Devices List:
-http://www.qbik.ch/usb/devices/
-
-linux-usb-devel Mailing List Archives:
-http://marc.theaimsgroup.com/?l=linux-usb-devel
+linux-usb Mailing List Archives:
+https://lore.kernel.org/linux-usb/
Programming Guide for Linux USB Device Drivers:
http://lmu.web.psi.ch/docu/manuals/software_manuals/linux_sl/usb_linux_programming_guide.pdf
diff --git a/Documentation/driver-api/w1.rst b/Documentation/driver-api/w1.rst
index 9963cca788a1..bda3ad60f655 100644
--- a/Documentation/driver-api/w1.rst
+++ b/Documentation/driver-api/w1.rst
@@ -7,9 +7,6 @@ W1: Dallas' 1-wire bus
W1 API internal to the kernel
=============================
-W1 API internal to the kernel
------------------------------
-
include/linux/w1.h
~~~~~~~~~~~~~~~~~~
diff --git a/Documentation/filesystems/9p.rst b/Documentation/filesystems/9p.rst
index f054d1c45e86..671fef39a802 100644
--- a/Documentation/filesystems/9p.rst
+++ b/Documentation/filesystems/9p.rst
@@ -158,6 +158,16 @@ Options
/sys/fs/9p/caches. (applies only to cache=fscache)
============= ===============================================================
+Behavior
+========
+
+This section aims at describing 9p 'quirks' that can be different
+from a local filesystem behaviors.
+
+ - Setting O_NONBLOCK on a file will make client reads return as early
+ as the server returns some data instead of trying to fill the read
+ buffer with the requested amount of bytes or end of file is reached.
+
Resources
=========
diff --git a/Documentation/filesystems/ceph.rst b/Documentation/filesystems/ceph.rst
index b46a7218248f..0aa70750df0f 100644
--- a/Documentation/filesystems/ceph.rst
+++ b/Documentation/filesystems/ceph.rst
@@ -107,17 +107,17 @@ Mount Options
address its connection to the monitor originates from.
wsize=X
- Specify the maximum write size in bytes. Default: 16 MB.
+ Specify the maximum write size in bytes. Default: 64 MB.
rsize=X
- Specify the maximum read size in bytes. Default: 16 MB.
+ Specify the maximum read size in bytes. Default: 64 MB.
rasize=X
Specify the maximum readahead size in bytes. Default: 8 MB.
mount_timeout=X
Specify the timeout value for mount (in seconds), in the case
- of a non-responsive Ceph file system. The default is 30
+ of a non-responsive Ceph file system. The default is 60
seconds.
caps_max=X
diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst
index d681203728d7..87d794bc75a4 100644
--- a/Documentation/filesystems/f2fs.rst
+++ b/Documentation/filesystems/f2fs.rst
@@ -243,8 +243,8 @@ checkpoint=%s[:%u[%]] Set to "disable" to turn off checkpointing. Set to "enabl
hide up to all remaining free space. The actual space that
would be unusable can be viewed at /sys/fs/f2fs/<disk>/unusable
This space is reclaimed once checkpoint=enable.
-compress_algorithm=%s Control compress algorithm, currently f2fs supports "lzo"
- and "lz4" algorithm.
+compress_algorithm=%s Control compress algorithm, currently f2fs supports "lzo",
+ "lz4" and "zstd" algorithm.
compress_log_size=%u Support configuring compress cluster size, the size will
be 4KB * (1 << %u), 16KB is minimum size, also it's
default size.
diff --git a/Documentation/filesystems/orangefs.rst b/Documentation/filesystems/orangefs.rst
index 7d6d4cad73c4..e41369709c5b 100644
--- a/Documentation/filesystems/orangefs.rst
+++ b/Documentation/filesystems/orangefs.rst
@@ -41,16 +41,6 @@ Documentation
http://www.orangefs.org/documentation/
-
-Userspace Filesystem Source
-===========================
-
-http://www.orangefs.org/download
-
-Orangefs versions prior to 2.9.3 would not be compatible with the
-upstream version of the kernel client.
-
-
Running ORANGEFS On a Single Server
===================================
@@ -94,6 +84,14 @@ Mount the filesystem::
mount -t pvfs2 tcp://localhost:3334/orangefs /pvfsmnt
+Userspace Filesystem Source
+===========================
+
+http://www.orangefs.org/download
+
+Orangefs versions prior to 2.9.3 would not be compatible with the
+upstream version of the kernel client.
+
Building ORANGEFS on a Single Server
====================================
@@ -107,18 +105,24 @@ default, we will probably be changing the default to LMDB soon.
::
- ./configure --prefix=/opt/ofs --with-db-backend=lmdb
+ ./configure --prefix=/opt/ofs --with-db-backend=lmdb --disable-usrint
make
make install
-Create an orangefs config file::
+Create an orangefs config file by running pvfs2-genconfig and
+specifying a target config file. Pvfs2-genconfig will prompt you
+through. Generally it works fine to take the defaults, but you
+should use your server's hostname, rather than "localhost" when
+it comes to that question::
/opt/ofs/bin/pvfs2-genconfig /etc/pvfs2.conf
Create an /etc/pvfs2tab file::
+Localhost is fine for your pvfs2tab file:
+
echo tcp://localhost:3334/orangefs /pvfsmnt pvfs2 defaults,noauto 0 0 > \
/etc/pvfs2tab
@@ -132,7 +136,7 @@ Bootstrap the server::
Start the server::
- /opt/osf/sbin/pvfs2-server /etc/pvfs2.conf
+ /opt/ofs/sbin/pvfs2-server /etc/pvfs2.conf
Now the server should be running. Pvfs2-ls is a simple
test to verify that the server is running::
@@ -142,11 +146,11 @@ test to verify that the server is running::
If stuff seems to be working, load the kernel module and
turn on the client core::
- /opt/ofs/sbin/pvfs2-client -p /opt/osf/sbin/pvfs2-client-core
+ /opt/ofs/sbin/pvfs2-client -p /opt/ofs/sbin/pvfs2-client-core
Mount your filesystem::
- mount -t pvfs2 tcp://localhost:3334/orangefs /pvfsmnt
+ mount -t pvfs2 tcp://`hostname`:3334/orangefs /pvfsmnt
Running xfstests
diff --git a/Documentation/filesystems/overlayfs.rst b/Documentation/filesystems/overlayfs.rst
index e443be7928db..c9d2bf96b02d 100644
--- a/Documentation/filesystems/overlayfs.rst
+++ b/Documentation/filesystems/overlayfs.rst
@@ -40,13 +40,46 @@ On 64bit systems, even if all overlay layers are not on the same
underlying filesystem, the same compliant behavior could be achieved
with the "xino" feature. The "xino" feature composes a unique object
identifier from the real object st_ino and an underlying fsid index.
+
If all underlying filesystems support NFS file handles and export file
handles with 32bit inode number encoding (e.g. ext4), overlay filesystem
will use the high inode number bits for fsid. Even when the underlying
filesystem uses 64bit inode numbers, users can still enable the "xino"
feature with the "-o xino=on" overlay mount option. That is useful for the
case of underlying filesystems like xfs and tmpfs, which use 64bit inode
-numbers, but are very unlikely to use the high inode number bit.
+numbers, but are very unlikely to use the high inode number bits. In case
+the underlying inode number does overflow into the high xino bits, overlay
+filesystem will fall back to the non xino behavior for that inode.
+
+The following table summarizes what can be expected in different overlay
+configurations.
+
+Inode properties
+````````````````
+
++--------------+------------+------------+-----------------+----------------+
+|Configuration | Persistent | Uniform | st_ino == d_ino | d_ino == i_ino |
+| | st_ino | st_dev | | [*] |
++==============+=====+======+=====+======+========+========+========+=======+
+| | dir | !dir | dir | !dir | dir + !dir | dir | !dir |
++--------------+-----+------+-----+------+--------+--------+--------+-------+
+| All layers | Y | Y | Y | Y | Y | Y | Y | Y |
+| on same fs | | | | | | | | |
++--------------+-----+------+-----+------+--------+--------+--------+-------+
+| Layers not | N | Y | Y | N | N | Y | N | Y |
+| on same fs, | | | | | | | | |
+| xino=off | | | | | | | | |
++--------------+-----+------+-----+------+--------+--------+--------+-------+
+| xino=on/auto | Y | Y | Y | Y | Y | Y | Y | Y |
+| | | | | | | | | |
++--------------+-----+------+-----+------+--------+--------+--------+-------+
+| xino=on/auto,| N | Y | Y | N | N | Y | N | Y |
+| ino overflow | | | | | | | | |
++--------------+-----+------+-----+------+--------+--------+--------+-------+
+
+[*] nfsd v3 readdirplus verifies d_ino == i_ino. i_ino is exposed via several
+/proc files, such as /proc/locks and /proc/self/fdinfo/<fd> of an inotify
+file descriptor.
Upper and Lower
@@ -248,6 +281,50 @@ overlay filesystem (though an operation on the name of the file such as
rename or unlink will of course be noticed and handled).
+Permission model
+----------------
+
+Permission checking in the overlay filesystem follows these principles:
+
+ 1) permission check SHOULD return the same result before and after copy up
+
+ 2) task creating the overlay mount MUST NOT gain additional privileges
+
+ 3) non-mounting task MAY gain additional privileges through the overlay,
+ compared to direct access on underlying lower or upper filesystems
+
+This is achieved by performing two permission checks on each access
+
+ a) check if current task is allowed access based on local DAC (owner,
+ group, mode and posix acl), as well as MAC checks
+
+ b) check if mounting task would be allowed real operation on lower or
+ upper layer based on underlying filesystem permissions, again including
+ MAC checks
+
+Check (a) ensures consistency (1) since owner, group, mode and posix acls
+are copied up. On the other hand it can result in server enforced
+permissions (used by NFS, for example) being ignored (3).
+
+Check (b) ensures that no task gains permissions to underlying layers that
+the mounting task does not have (2). This also means that it is possible
+to create setups where the consistency rule (1) does not hold; normally,
+however, the mounting task will have sufficient privileges to perform all
+operations.
+
+Another way to demonstrate this model is drawing parallels between
+
+ mount -t overlay overlay -olowerdir=/lower,upperdir=/upper,... /merged
+
+and
+
+ cp -a /lower /upper
+ mount --bind /upper /merged
+
+The resulting access permissions should be the same. The difference is in
+the time of copy (on-demand vs. up-front).
+
+
Multiple lower layers
---------------------
@@ -383,7 +460,8 @@ guarantee that the values of st_ino and st_dev returned by stat(2) and the
value of d_ino returned by readdir(3) will act like on a normal filesystem.
E.g. the value of st_dev may be different for two objects in the same
overlay filesystem and the value of st_ino for directory objects may not be
-persistent and could change even while the overlay filesystem is mounted.
+persistent and could change even while the overlay filesystem is mounted, as
+summarized in the `Inode properties`_ table above.
Changes to underlying filesystems
diff --git a/Documentation/filesystems/qnx6.rst b/Documentation/filesystems/qnx6.rst
index b71308314070..fd13433d362c 100644
--- a/Documentation/filesystems/qnx6.rst
+++ b/Documentation/filesystems/qnx6.rst
@@ -185,7 +185,7 @@ tree structures are treated as system blocks.
The rational behind that is that a write request can work on a new snapshot
(system area of the inactive - resp. lower serial numbered superblock) while
-at the same time there is still a complete stable filesystem structer in the
+at the same time there is still a complete stable filesystem structure in the
other half of the system area.
When finished with writing (a sync write is completed, the maximum sync leap
diff --git a/Documentation/firmware-guide/acpi/namespace.rst b/Documentation/firmware-guide/acpi/namespace.rst
index 3eb763d6656d..6193582a2204 100644
--- a/Documentation/firmware-guide/acpi/namespace.rst
+++ b/Documentation/firmware-guide/acpi/namespace.rst
@@ -56,13 +56,13 @@ are illustrated in the following diagram::
+- - - -+ | +-------------------| |
| Entry | - - - - - - - -+ | | Definition Blocks | |
+- - - -+ | | +-------------------+ |
- | | +- - - - - - - - - -+ |
- +-|->| SSDT | |
+ | | +- - - - - - - - - -+ |
+ +-|->| SSDT | |
| +-------------------+ |
| | Definition Blocks | |
| +- - - - - - - - - -+ |
+------------------------+
- |
+ |
OSPM Loading |
\|/
+----------------+
diff --git a/Documentation/kbuild/kbuild.rst b/Documentation/kbuild/kbuild.rst
index 510f38d7e78a..2d1fc03d346e 100644
--- a/Documentation/kbuild/kbuild.rst
+++ b/Documentation/kbuild/kbuild.rst
@@ -262,3 +262,8 @@ KBUILD_BUILD_USER, KBUILD_BUILD_HOST
These two variables allow to override the user@host string displayed during
boot and in /proc/version. The default value is the output of the commands
whoami and host, respectively.
+
+LLVM
+----
+If this variable is set to 1, Kbuild will use Clang and LLVM utilities instead
+of GCC and GNU binutils to build the kernel.
diff --git a/Documentation/kbuild/llvm.rst b/Documentation/kbuild/llvm.rst
index d6c79eb4e23e..c776b6eee969 100644
--- a/Documentation/kbuild/llvm.rst
+++ b/Documentation/kbuild/llvm.rst
@@ -47,14 +47,21 @@ example:
LLVM Utilities
--------------
-LLVM has substitutes for GNU binutils utilities. These can be invoked as
-additional parameters to `make`.
+LLVM has substitutes for GNU binutils utilities. Kbuild supports `LLVM=1`
+to enable them.
- make CC=clang AS=clang LD=ld.lld AR=llvm-ar NM=llvm-nm STRIP=llvm-strip \\
- OBJCOPY=llvm-objcopy OBJDUMP=llvm-objdump OBJSIZE=llvm-objsize \\
+ make LLVM=1
+
+They can be enabled individually. The full list of the parameters:
+
+ make CC=clang LD=ld.lld AR=llvm-ar NM=llvm-nm STRIP=llvm-strip \\
+ OBJCOPY=llvm-objcopy OBJDUMP=llvm-objdump OBJSIZE=llvm-size \\
READELF=llvm-readelf HOSTCC=clang HOSTCXX=clang++ HOSTAR=llvm-ar \\
HOSTLD=ld.lld
+Currently, the integrated assembler is disabled by default. You can pass
+`LLVM_IAS=1` to enable it.
+
Getting Help
------------
diff --git a/Documentation/openrisc/openrisc_port.rst b/Documentation/openrisc/openrisc_port.rst
index a18747a8d191..4b2c437942a0 100644
--- a/Documentation/openrisc/openrisc_port.rst
+++ b/Documentation/openrisc/openrisc_port.rst
@@ -37,8 +37,8 @@ or Stafford's toolchain build and release scripts.
Build the Linux kernel as usual::
- make ARCH=openrisc defconfig
- make ARCH=openrisc
+ make ARCH=openrisc CROSS_COMPILE="or1k-linux-" defconfig
+ make ARCH=openrisc CROSS_COMPILE="or1k-linux-"
3) Running on FPGA (optional)
diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst
index e47863575917..91c5ff8e161e 100644
--- a/Documentation/process/changes.rst
+++ b/Documentation/process/changes.rst
@@ -31,7 +31,7 @@ you probably needn't concern yourself with pcmciautils.
====================== =============== ========================================
GNU C 4.6 gcc --version
GNU make 3.81 make --version
-binutils 2.21 ld -v
+binutils 2.23 ld -v
flex 2.5.35 flex --version
bison 2.0 bison --version
util-linux 2.10o fdformat --version
@@ -76,7 +76,7 @@ You will need GNU make 3.81 or later to build the kernel.
Binutils
--------
-Binutils 2.21 or newer is needed to build the kernel.
+Binutils 2.23 or newer is needed to build the kernel.
pkg-config
----------
diff --git a/Documentation/vm/free_page_reporting.rst b/Documentation/vm/free_page_reporting.rst
new file mode 100644
index 000000000000..8c05e62d8b2b
--- /dev/null
+++ b/Documentation/vm/free_page_reporting.rst
@@ -0,0 +1,40 @@
+.. _free_page_reporting:
+
+=====================
+Free Page Reporting
+=====================
+
+Free page reporting is an API by which a device can register to receive
+lists of pages that are currently unused by the system. This is useful in
+the case of virtualization where a guest is then able to use this data to
+notify the hypervisor that it is no longer using certain pages in memory.
+
+For the driver, typically a balloon driver, to use of this functionality
+it will allocate and initialize a page_reporting_dev_info structure. The
+field within the structure it will populate is the "report" function
+pointer used to process the scatterlist. It must also guarantee that it can
+handle at least PAGE_REPORTING_CAPACITY worth of scatterlist entries per
+call to the function. A call to page_reporting_register will register the
+page reporting interface with the reporting framework assuming no other
+page reporting devices are already registered.
+
+Once registered the page reporting API will begin reporting batches of
+pages to the driver. The API will start reporting pages 2 seconds after
+the interface is registered and will continue to do so 2 seconds after any
+page of a sufficiently high order is freed.
+
+Pages reported will be stored in the scatterlist passed to the reporting
+function with the final entry having the end bit set in entry nent - 1.
+While pages are being processed by the report function they will not be
+accessible to the allocator. Once the report function has been completed
+the pages will be returned to the free area from which they were obtained.
+
+Prior to removing a driver that is making use of free page reporting it
+is necessary to call page_reporting_unregister to have the
+page_reporting_dev_info structure that is currently in use by free page
+reporting removed. Doing this will prevent further reports from being
+issued via the interface. If another driver or the same driver is
+registered it is possible for it to resume where the previous driver had
+left off in terms of reporting free pages.
+
+Alexander Duyck, Dec 04, 2019
diff --git a/Documentation/vm/zswap.rst b/Documentation/vm/zswap.rst
index 61f6185188cd..f8c6a79d7c70 100644
--- a/Documentation/vm/zswap.rst
+++ b/Documentation/vm/zswap.rst
@@ -35,9 +35,11 @@ Zswap evicts pages from compressed cache on an LRU basis to the backing swap
device when the compressed pool reaches its size limit. This requirement had
been identified in prior community discussions.
-Zswap is disabled by default but can be enabled at boot time by setting
-the ``enabled`` attribute to 1 at boot time. ie: ``zswap.enabled=1``. Zswap
-can also be enabled and disabled at runtime using the sysfs interface.
+Whether Zswap is enabled at the boot time depends on whether
+the ``CONFIG_ZSWAP_DEFAULT_ON`` Kconfig option is enabled or not.
+This setting can then be overridden by providing the kernel command line
+``zswap.enabled=`` option, for example ``zswap.enabled=0``.
+Zswap can also be enabled and disabled at runtime using the sysfs interface.
An example command to enable zswap at runtime, assuming sysfs is mounted
at ``/sys``, is::
@@ -64,9 +66,10 @@ allocation in zpool is not directly accessible by address. Rather, a handle is
returned by the allocation routine and that handle must be mapped before being
accessed. The compressed memory pool grows on demand and shrinks as compressed
pages are freed. The pool is not preallocated. By default, a zpool
-of type zbud is created, but it can be selected at boot time by
-setting the ``zpool`` attribute, e.g. ``zswap.zpool=zbud``. It can
-also be changed at runtime using the sysfs ``zpool`` attribute, e.g.::
+of type selected in ``CONFIG_ZSWAP_ZPOOL_DEFAULT`` Kconfig option is created,
+but it can be overridden at boot time by setting the ``zpool`` attribute,
+e.g. ``zswap.zpool=zbud``. It can also be changed at runtime using the sysfs
+``zpool`` attribute, e.g.::
echo zbud > /sys/module/zswap/parameters/zpool
@@ -97,8 +100,9 @@ controlled policy:
* max_pool_percent - The maximum percentage of memory that the compressed
pool can occupy.
-The default compressor is lzo, but it can be selected at boot time by
-setting the ``compressor`` attribute, e.g. ``zswap.compressor=lzo``.
+The default compressor is selected in ``CONFIG_ZSWAP_COMPRESSOR_DEFAULT``
+Kconfig option, but it can be overridden at boot time by setting the
+``compressor`` attribute, e.g. ``zswap.compressor=lzo``.
It can also be changed at runtime using the sysfs "compressor"
attribute, e.g.::
diff --git a/MAINTAINERS b/MAINTAINERS
index 534a8dc4f84a..0d186a01a37a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -77,21 +77,13 @@ Tips for patch submitters
8. Happy hacking.
-Descriptions of section entries
--------------------------------
+Descriptions of section entries and preferred order
+---------------------------------------------------
M: *Mail* patches to: FullName <address@domain>
R: Designated *Reviewer*: FullName <address@domain>
These reviewers should be CCed on patches.
L: *Mailing list* that is relevant to this area
- W: *Web-page* with status/info
- B: URI for where to file *bugs*. A web-page with detailed bug
- filing info, a direct bug tracker link, or a mailto: URI.
- C: URI for *chat* protocol, server and channel where developers
- usually hang out, for example irc://server/channel.
- Q: *Patchwork* web based patch tracking system site
- T: *SCM* tree type and location.
- Type is one of: git, hg, quilt, stgit, topgit
S: *Status*, one of the following:
Supported: Someone is actually paid to look after this.
Maintained: Someone actually looks after it.
@@ -102,30 +94,39 @@ Descriptions of section entries
Obsolete: Old code. Something tagged obsolete generally means
it has been replaced by a better system and you
should be using that.
+ W: *Web-page* with status/info
+ Q: *Patchwork* web based patch tracking system site
+ B: URI for where to file *bugs*. A web-page with detailed bug
+ filing info, a direct bug tracker link, or a mailto: URI.
+ C: URI for *chat* protocol, server and channel where developers
+ usually hang out, for example irc://server/channel.
P: Subsystem Profile document for more details submitting
patches to the given subsystem. This is either an in-tree file,
or a URI. See Documentation/maintainer/maintainer-entry-profile.rst
for details.
+ T: *SCM* tree type and location.
+ Type is one of: git, hg, quilt, stgit, topgit
F: *Files* and directories wildcard patterns.
A trailing slash includes all files and subdirectory files.
F: drivers/net/ all files in and below drivers/net
F: drivers/net/* all files in drivers/net, but not below
F: */net/* all files in "any top level directory"/net
One pattern per line. Multiple F: lines acceptable.
+ X: *Excluded* files and directories that are NOT maintained, same
+ rules as F:. Files exclusions are tested before file matches.
+ Can be useful for excluding a specific subdirectory, for instance:
+ F: net/
+ X: net/ipv6/
+ matches all files in and below net excluding net/ipv6/
N: Files and directories *Regex* patterns.
- N: [^a-z]tegra all files whose path contains the word tegra
+ N: [^a-z]tegra all files whose path contains tegra
+ (not including files like integrator)
One pattern per line. Multiple N: lines acceptable.
scripts/get_maintainer.pl has different behavior for files that
match F: pattern and matches of N: patterns. By default,
get_maintainer will not look at git log history when an F: pattern
match occurs. When an N: match occurs, git log history is used
to also notify the people that have git commit signatures.
- X: *Excluded* files and directories that are NOT maintained, same
- rules as F:. Files exclusions are tested before file matches.
- Can be useful for excluding a specific subdirectory, for instance:
- F: net/
- X: net/ipv6/
- matches all files in and below net excluding net/ipv6/
K: *Content regex* (perl extended) pattern match in a patch or file.
For instance:
K: of_get_profile
@@ -726,7 +727,7 @@ L: linux-alpha@vger.kernel.org
F: arch/alpha/
ALPS PS/2 TOUCHPAD DRIVER
-R: Pali Rohár <pali.rohar@gmail.com>
+R: Pali Rohár <pali@kernel.org>
F: drivers/input/mouse/alps.*
ALTERA I2C CONTROLLER DRIVER
@@ -737,7 +738,6 @@ F: drivers/i2c/busses/i2c-altera.c
ALTERA MAILBOX DRIVER
M: Ley Foon Tan <ley.foon.tan@intel.com>
-L: nios2-dev@lists.rocketboards.org (moderated for non-subscribers)
S: Maintained
F: drivers/mailbox/mailbox-altera.c
@@ -765,14 +765,12 @@ F: include/dt-bindings/reset/altr,rst-mgr-a10sr.h
ALTERA TRIPLE SPEED ETHERNET DRIVER
M: Thor Thayer <thor.thayer@linux.intel.com>
L: netdev@vger.kernel.org
-L: nios2-dev@lists.rocketboards.org (moderated for non-subscribers)
S: Maintained
F: drivers/net/ethernet/altera/
ALTERA UART/JTAG UART SERIAL DRIVERS
M: Tobias Klauser <tklauser@distanz.ch>
L: linux-serial@vger.kernel.org
-L: nios2-dev@lists.rocketboards.org (moderated for non-subscribers)
S: Maintained
F: drivers/tty/serial/altera_uart.c
F: drivers/tty/serial/altera_jtaguart.c
@@ -1443,6 +1441,7 @@ M: Will Deacon <will@kernel.org>
R: Robin Murphy <robin.murphy@arm.com>
L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
S: Maintained
+F: Documentation/devicetree/bindings/iommu/arm,smmu*
F: drivers/iommu/arm-smmu*
F: drivers/iommu/io-pgtable-arm.c
F: drivers/iommu/io-pgtable-arm-v7s.c
@@ -4772,23 +4771,23 @@ F: drivers/net/fddi/defza.*
DELL LAPTOP DRIVER
M: Matthew Garrett <mjg59@srcf.ucam.org>
-M: Pali Rohár <pali.rohar@gmail.com>
+M: Pali Rohár <pali@kernel.org>
L: platform-driver-x86@vger.kernel.org
S: Maintained
F: drivers/platform/x86/dell-laptop.c
DELL LAPTOP FREEFALL DRIVER
-M: Pali Rohár <pali.rohar@gmail.com>
+M: Pali Rohár <pali@kernel.org>
S: Maintained
F: drivers/platform/x86/dell-smo8800.c
DELL LAPTOP RBTN DRIVER
-M: Pali Rohár <pali.rohar@gmail.com>
+M: Pali Rohár <pali@kernel.org>
S: Maintained
F: drivers/platform/x86/dell-rbtn.*
DELL LAPTOP SMM DRIVER
-M: Pali Rohár <pali.rohar@gmail.com>
+M: Pali Rohár <pali@kernel.org>
S: Maintained
F: drivers/hwmon/dell-smm-hwmon.c
F: include/uapi/linux/i8k.h
@@ -4800,7 +4799,7 @@ S: Maintained
F: drivers/platform/x86/dell_rbu.c
DELL SMBIOS DRIVER
-M: Pali Rohár <pali.rohar@gmail.com>
+M: Pali Rohár <pali@kernel.org>
M: Mario Limonciello <mario.limonciello@dell.com>
L: platform-driver-x86@vger.kernel.org
S: Maintained
@@ -4833,7 +4832,7 @@ F: drivers/platform/x86/dell-wmi-descriptor.c
DELL WMI NOTIFICATIONS DRIVER
M: Matthew Garrett <mjg59@srcf.ucam.org>
-M: Pali Rohár <pali.rohar@gmail.com>
+M: Pali Rohár <pali@kernel.org>
S: Maintained
F: drivers/platform/x86/dell-wmi.c
@@ -9668,6 +9667,7 @@ F: drivers/acpi/nfit/*
F: include/linux/nd.h
F: include/linux/libnvdimm.h
F: include/uapi/linux/ndctl.h
+F: tools/testing/nvdimm/
LICENSES and SPDX stuff
M: Thomas Gleixner <tglx@linutronix.de>
@@ -11922,7 +11922,6 @@ F: drivers/scsi/nsp32*
NIOS2 ARCHITECTURE
M: Ley Foon Tan <ley.foon.tan@intel.com>
-L: nios2-dev@lists.rocketboards.org (moderated for non-subscribers)
T: git git://git.kernel.org/pub/scm/linux/kernel/git/lftan/nios2.git
S: Maintained
F: arch/nios2/
@@ -11947,7 +11946,7 @@ F: drivers/media/i2c/et8ek8
F: drivers/media/i2c/ad5820.c
NOKIA N900 POWER SUPPLY DRIVERS
-R: Pali Rohár <pali.rohar@gmail.com>
+R: Pali Rohár <pali@kernel.org>
F: include/linux/power/bq2415x_charger.h
F: include/linux/power/bq27xxx_battery.h
F: drivers/power/supply/bq2415x_charger.c
@@ -17869,10 +17868,12 @@ L: virtualization@lists.linux-foundation.org
S: Maintained
F: Documentation/devicetree/bindings/virtio/
F: drivers/virtio/
+F: drivers/vdpa/
F: tools/virtio/
F: drivers/net/virtio_net.c
F: drivers/block/virtio_blk.c
F: include/linux/virtio*.h
+F: include/linux/vdpa.h
F: include/uapi/linux/virtio_*.h
F: drivers/crypto/virtio/
F: mm/balloon_compaction.c
@@ -17940,6 +17941,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git
S: Maintained
F: drivers/vhost/
F: include/uapi/linux/vhost.h
+F: include/linux/vhost_iotlb.h
VIRTIO INPUT DRIVER
M: Gerd Hoffmann <kraxel@redhat.com>
diff --git a/Makefile b/Makefile
index c91342953d9e..d5c907c0beb8 100644
--- a/Makefile
+++ b/Makefile
@@ -399,8 +399,13 @@ HOST_LFS_CFLAGS := $(shell getconf LFS_CFLAGS 2>/dev/null)
HOST_LFS_LDFLAGS := $(shell getconf LFS_LDFLAGS 2>/dev/null)
HOST_LFS_LIBS := $(shell getconf LFS_LIBS 2>/dev/null)
-HOSTCC = gcc
-HOSTCXX = g++
+ifneq ($(LLVM),)
+HOSTCC = clang
+HOSTCXX = clang++
+else
+HOSTCC = gcc
+HOSTCXX = g++
+endif
KBUILD_HOSTCFLAGS := -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 \
-fomit-frame-pointer -std=gnu89 $(HOST_LFS_CFLAGS) \
$(HOSTCFLAGS)
@@ -409,16 +414,28 @@ KBUILD_HOSTLDFLAGS := $(HOST_LFS_LDFLAGS) $(HOSTLDFLAGS)
KBUILD_HOSTLDLIBS := $(HOST_LFS_LIBS) $(HOSTLDLIBS)
# Make variables (CC, etc...)
-LD = $(CROSS_COMPILE)ld
-CC = $(CROSS_COMPILE)gcc
CPP = $(CC) -E
+ifneq ($(LLVM),)
+CC = clang
+LD = ld.lld
+AR = llvm-ar
+NM = llvm-nm
+OBJCOPY = llvm-objcopy
+OBJDUMP = llvm-objdump
+READELF = llvm-readelf
+OBJSIZE = llvm-size
+STRIP = llvm-strip
+else
+CC = $(CROSS_COMPILE)gcc
+LD = $(CROSS_COMPILE)ld
AR = $(CROSS_COMPILE)ar
NM = $(CROSS_COMPILE)nm
-STRIP = $(CROSS_COMPILE)strip
OBJCOPY = $(CROSS_COMPILE)objcopy
OBJDUMP = $(CROSS_COMPILE)objdump
-OBJSIZE = $(CROSS_COMPILE)size
READELF = $(CROSS_COMPILE)readelf
+OBJSIZE = $(CROSS_COMPILE)size
+STRIP = $(CROSS_COMPILE)strip
+endif
PAHOLE = pahole
LEX = flex
YACC = bison
@@ -538,7 +555,7 @@ endif
ifneq ($(GCC_TOOLCHAIN),)
CLANG_FLAGS += --gcc-toolchain=$(GCC_TOOLCHAIN)
endif
-ifeq ($(if $(AS),$(shell $(AS) --version 2>&1 | head -n 1 | grep clang)),)
+ifneq ($(LLVM_IAS),1)
CLANG_FLAGS += -no-integrated-as
endif
CLANG_FLAGS += -Werror=unknown-warning-option
@@ -747,8 +764,6 @@ ifdef CONFIG_CC_IS_CLANG
KBUILD_CPPFLAGS += -Qunused-arguments
KBUILD_CFLAGS += -Wno-format-invalid-specifier
KBUILD_CFLAGS += -Wno-gnu
-# Quiet clang warning: comparison of unsigned expression < 0 is always false
-KBUILD_CFLAGS += -Wno-tautological-compare
# CLANG uses a _MergedGlobals as optimization, but this breaks modpost, as the
# source of a reference will be _MergedGlobals and not on of the whitelisted names.
# See modpost pattern 2
@@ -1036,8 +1051,13 @@ init-y := $(patsubst %/, %/built-in.a, $(init-y))
core-y := $(patsubst %/, %/built-in.a, $(core-y))
drivers-y := $(patsubst %/, %/built-in.a, $(drivers-y))
net-y := $(patsubst %/, %/built-in.a, $(net-y))
+libs-y2 := $(patsubst %/, %/built-in.a, $(filter %/, $(libs-y)))
+ifdef CONFIG_MODULES
+libs-y1 := $(filter-out %/, $(libs-y))
+libs-y2 += $(patsubst %/, %/lib.a, $(filter %/, $(libs-y)))
+else
libs-y1 := $(patsubst %/, %/lib.a, $(libs-y))
-libs-y2 := $(patsubst %/, %/built-in.a, $(filter-out %.a, $(libs-y)))
+endif
virt-y := $(patsubst %/, %/built-in.a, $(virt-y))
# Externally visible symbols (used by link-vmlinux.sh)
diff --git a/arch/alpha/include/asm/mmzone.h b/arch/alpha/include/asm/mmzone.h
index 7ee144f484f1..9b521c857436 100644
--- a/arch/alpha/include/asm/mmzone.h
+++ b/arch/alpha/include/asm/mmzone.h
@@ -8,8 +8,6 @@
#include <asm/smp.h>
-struct bootmem_data_t; /* stupid forward decl. */
-
/*
* Following are macros that are specific to this numa platform.
*/
diff --git a/arch/alpha/include/asm/page.h b/arch/alpha/include/asm/page.h
index f3fb2848470a..e241bd88880f 100644
--- a/arch/alpha/include/asm/page.h
+++ b/arch/alpha/include/asm/page.h
@@ -90,9 +90,6 @@ typedef struct page *pgtable_t;
#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
#endif /* CONFIG_DISCONTIGMEM */
-#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \
- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
-
#include <asm-generic/memory_model.h>
#include <asm-generic/getorder.h>
diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h
index 299791ce14b6..0267aa8a4f86 100644
--- a/arch/alpha/include/asm/pgtable.h
+++ b/arch/alpha/include/asm/pgtable.h
@@ -268,7 +268,6 @@ extern inline void pud_clear(pud_t * pudp) { pud_val(*pudp) = 0; }
extern inline int pte_write(pte_t pte) { return !(pte_val(pte) & _PAGE_FOW); }
extern inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; }
extern inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; }
-extern inline int pte_special(pte_t pte) { return 0; }
extern inline pte_t pte_wrprotect(pte_t pte) { pte_val(pte) |= _PAGE_FOW; return pte; }
extern inline pte_t pte_mkclean(pte_t pte) { pte_val(pte) &= ~(__DIRTY_BITS); return pte; }
@@ -276,7 +275,6 @@ extern inline pte_t pte_mkold(pte_t pte) { pte_val(pte) &= ~(__ACCESS_BITS); ret
extern inline pte_t pte_mkwrite(pte_t pte) { pte_val(pte) &= ~_PAGE_FOW; return pte; }
extern inline pte_t pte_mkdirty(pte_t pte) { pte_val(pte) |= __DIRTY_BITS; return pte; }
extern inline pte_t pte_mkyoung(pte_t pte) { pte_val(pte) |= __ACCESS_BITS; return pte; }
-extern inline pte_t pte_mkspecial(pte_t pte) { return pte; }
#define PAGE_DIR_OFFSET(tsk,address) pgd_offset((tsk),(address))
diff --git a/arch/alpha/kernel/syscalls/syscallhdr.sh b/arch/alpha/kernel/syscalls/syscallhdr.sh
index e5b99bd2e5e7..1780e861492a 100644
--- a/arch/alpha/kernel/syscalls/syscallhdr.sh
+++ b/arch/alpha/kernel/syscalls/syscallhdr.sh
@@ -32,5 +32,5 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
printf "#define __NR_syscalls\t%s\n" "${nxt}"
printf "#endif\n"
printf "\n"
- printf "#endif /* %s */" "${fileguard}"
+ printf "#endif /* %s */\n" "${fileguard}"
) > "$out"
diff --git a/arch/arc/include/asm/page.h b/arch/arc/include/asm/page.h
index 0a32e8cfd074..b0dfed0f12be 100644
--- a/arch/arc/include/asm/page.h
+++ b/arch/arc/include/asm/page.h
@@ -102,7 +102,7 @@ typedef pte_t * pgtable_t;
#define virt_addr_valid(kaddr) pfn_valid(virt_to_pfn(kaddr))
/* Default Permissions for stack/heaps pages (Non Executable) */
-#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC
#define WANT_PAGE_VIRTUAL 1
diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h
index c2b75cba26df..11b058a72a5b 100644
--- a/arch/arm/include/asm/page.h
+++ b/arch/arm/include/asm/page.h
@@ -161,9 +161,7 @@ extern int pfn_valid(unsigned long);
#endif /* !__ASSEMBLY__ */
-#define VM_DATA_DEFAULT_FLAGS \
- (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) | \
- VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC
#include <asm-generic/getorder.h>
diff --git a/arch/arm/include/asm/pgtable-2level.h b/arch/arm/include/asm/pgtable-2level.h
index 0d3ea35c97fe..9e084a464a97 100644
--- a/arch/arm/include/asm/pgtable-2level.h
+++ b/arch/arm/include/asm/pgtable-2level.h
@@ -211,8 +211,6 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
#define pmd_addr_end(addr,end) (end)
#define set_pte_ext(ptep,pte,ext) cpu_set_pte_ext(ptep,pte,ext)
-#define pte_special(pte) (0)
-static inline pte_t pte_mkspecial(pte_t pte) { return pte; }
/*
* We don't have huge page support for short descriptors, for the moment
diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h
index 0483cf413315..befc8fcec98f 100644
--- a/arch/arm/include/asm/pgtable.h
+++ b/arch/arm/include/asm/pgtable.h
@@ -243,19 +243,8 @@ static inline void __sync_icache_dcache(pte_t pteval)
extern void __sync_icache_dcache(pte_t pteval);
#endif
-static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pteval)
-{
- unsigned long ext = 0;
-
- if (addr < TASK_SIZE && pte_valid_user(pteval)) {
- if (!pte_special(pteval))
- __sync_icache_dcache(pteval);
- ext |= PTE_EXT_NG;
- }
-
- set_pte_ext(ptep, pteval, ext);
-}
+void set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pteval);
static inline pte_t clear_pte_bit(pte_t pte, pgprot_t prot)
{
diff --git a/arch/arm/mach-omap2/omap-secure.c b/arch/arm/mach-omap2/omap-secure.c
index d00e3c72e37d..f70d561f37f7 100644
--- a/arch/arm/mach-omap2/omap-secure.c
+++ b/arch/arm/mach-omap2/omap-secure.c
@@ -5,7 +5,7 @@
* Copyright (C) 2011 Texas Instruments, Inc.
* Santosh Shilimkar <santosh.shilimkar@ti.com>
* Copyright (C) 2012 Ivaylo Dimitrov <freemangordon@abv.bg>
- * Copyright (C) 2013 Pali Rohár <pali.rohar@gmail.com>
+ * Copyright (C) 2013 Pali Rohár <pali@kernel.org>
*/
#include <linux/arm-smccc.h>
diff --git a/arch/arm/mach-omap2/omap-secure.h b/arch/arm/mach-omap2/omap-secure.h
index ba8c486c0454..4aaa95706d39 100644
--- a/arch/arm/mach-omap2/omap-secure.h
+++ b/arch/arm/mach-omap2/omap-secure.h
@@ -5,7 +5,7 @@
* Copyright (C) 2011 Texas Instruments, Inc.
* Santosh Shilimkar <santosh.shilimkar@ti.com>
* Copyright (C) 2012 Ivaylo Dimitrov <freemangordon@abv.bg>
- * Copyright (C) 2013 Pali Rohár <pali.rohar@gmail.com>
+ * Copyright (C) 2013 Pali Rohár <pali@kernel.org>
*/
#ifndef OMAP_ARCH_OMAP_SECURE_H
#define OMAP_ARCH_OMAP_SECURE_H
diff --git a/arch/arm/mach-omap2/omap-smc.S b/arch/arm/mach-omap2/omap-smc.S
index d4832845a4e8..7376f528034d 100644
--- a/arch/arm/mach-omap2/omap-smc.S
+++ b/arch/arm/mach-omap2/omap-smc.S
@@ -6,7 +6,7 @@
* Written by Santosh Shilimkar <santosh.shilimkar@ti.com>
*
* Copyright (C) 2012 Ivaylo Dimitrov <freemangordon@abv.bg>
- * Copyright (C) 2013 Pali Rohár <pali.rohar@gmail.com>
+ * Copyright (C) 2013 Pali Rohár <pali@kernel.org>
*/
#include <linux/linkage.h>
diff --git a/arch/arm/mach-pxa/cm-x300.c b/arch/arm/mach-pxa/cm-x300.c
index 425855f456f2..2e35354b61f5 100644
--- a/arch/arm/mach-pxa/cm-x300.c
+++ b/arch/arm/mach-pxa/cm-x300.c
@@ -312,7 +312,6 @@ static struct pwm_lookup cm_x300_pwm_lookup[] = {
static struct platform_pwm_backlight_data cm_x300_backlight_data = {
.max_brightness = 100,
.dft_brightness = 100,
- .enable_gpio = -1,
};
static struct platform_device cm_x300_backlight_device = {
diff --git a/arch/arm/mach-pxa/colibri-pxa270-income.c b/arch/arm/mach-pxa/colibri-pxa270-income.c
index dbad2f13706c..e5879e8b0682 100644
--- a/arch/arm/mach-pxa/colibri-pxa270-income.c
+++ b/arch/arm/mach-pxa/colibri-pxa270-income.c
@@ -202,7 +202,6 @@ static struct pwm_lookup income_pwm_lookup[] = {
static struct platform_pwm_backlight_data income_backlight_data = {
.max_brightness = 0x3ff,
.dft_brightness = 0x1ff,
- .enable_gpio = -1,
};
static struct platform_device income_backlight = {
diff --git a/arch/arm/mach-pxa/corgi.c b/arch/arm/mach-pxa/corgi.c
index f2d73289230f..593c7f793da5 100644
--- a/arch/arm/mach-pxa/corgi.c
+++ b/arch/arm/mach-pxa/corgi.c
@@ -563,13 +563,20 @@ static void corgi_bl_kick_battery(void)
}
}
+static struct gpiod_lookup_table corgi_lcdcon_gpio_table = {
+ .dev_id = "spi1.1",
+ .table = {
+ GPIO_LOOKUP("gpio-pxa", CORGI_GPIO_BACKLIGHT_CONT,
+ "BL_CONT", GPIO_ACTIVE_HIGH),
+ { },
+ },
+};
+
static struct corgi_lcd_platform_data corgi_lcdcon_info = {
.init_mode = CORGI_LCD_MODE_VGA,
.max_intensity = 0x2f,
.default_intensity = 0x1f,
.limit_mask = 0x0b,
- .gpio_backlight_cont = CORGI_GPIO_BACKLIGHT_CONT,
- .gpio_backlight_on = -1,
.kick_battery = corgi_bl_kick_battery,
};
@@ -609,6 +616,7 @@ static struct spi_board_info corgi_spi_devices[] = {
static void __init corgi_init_spi(void)
{
pxa2xx_set_spi_info(1, &corgi_spi_info);
+ gpiod_add_lookup_table(&corgi_lcdcon_gpio_table);
spi_register_board_info(ARRAY_AND_SIZE(corgi_spi_devices));
}
#else
diff --git a/arch/arm/mach-pxa/ezx.c b/arch/arm/mach-pxa/ezx.c
index ec10851b63cf..eb85950e7c0e 100644
--- a/arch/arm/mach-pxa/ezx.c
+++ b/arch/arm/mach-pxa/ezx.c
@@ -55,7 +55,6 @@ static struct pwm_lookup ezx_pwm_lookup[] __maybe_unused = {
static struct platform_pwm_backlight_data ezx_backlight_data = {
.max_brightness = 1023,
.dft_brightness = 1023,
- .enable_gpio = -1,
};
static struct platform_device ezx_backlight_device = {
diff --git a/arch/arm/mach-pxa/hx4700.c b/arch/arm/mach-pxa/hx4700.c
index 238a751a8797..1d4c5db54be2 100644
--- a/arch/arm/mach-pxa/hx4700.c
+++ b/arch/arm/mach-pxa/hx4700.c
@@ -556,7 +556,6 @@ static struct platform_device hx4700_lcd = {
static struct platform_pwm_backlight_data backlight_data = {
.max_brightness = 200,
.dft_brightness = 100,
- .enable_gpio = -1,
};
static struct platform_device backlight = {
diff --git a/arch/arm/mach-pxa/lpd270.c b/arch/arm/mach-pxa/lpd270.c
index 20e00e970385..6fc40bc06910 100644
--- a/arch/arm/mach-pxa/lpd270.c
+++ b/arch/arm/mach-pxa/lpd270.c
@@ -277,7 +277,6 @@ static struct pwm_lookup lpd270_pwm_lookup[] = {
static struct platform_pwm_backlight_data lpd270_backlight_data = {
.max_brightness = 1,
.dft_brightness = 1,
- .enable_gpio = -1,
};
static struct platform_device lpd270_backlight_device = {
diff --git a/arch/arm/mach-pxa/magician.c b/arch/arm/mach-pxa/magician.c
index 5d0591f93f4d..cd9fa465b9b2 100644
--- a/arch/arm/mach-pxa/magician.c
+++ b/arch/arm/mach-pxa/magician.c
@@ -401,7 +401,6 @@ static void magician_backlight_exit(struct device *dev)
static struct platform_pwm_backlight_data backlight_data = {
.max_brightness = 272,
.dft_brightness = 100,
- .enable_gpio = -1,
.init = magician_backlight_init,
.notify = magician_backlight_notify,
.exit = magician_backlight_exit,
diff --git a/arch/arm/mach-pxa/mainstone.c b/arch/arm/mach-pxa/mainstone.c
index 1b7882920164..d1010ec26e9f 100644
--- a/arch/arm/mach-pxa/mainstone.c
+++ b/arch/arm/mach-pxa/mainstone.c
@@ -256,7 +256,6 @@ static struct pwm_lookup mainstone_pwm_lookup[] = {
static struct platform_pwm_backlight_data mainstone_backlight_data = {
.max_brightness = 1023,
.dft_brightness = 1023,
- .enable_gpio = -1,
};
static struct platform_device mainstone_backlight_device = {
diff --git a/arch/arm/mach-pxa/mioa701.c b/arch/arm/mach-pxa/mioa701.c
index 0b8bae9610f1..d3af80317f2d 100644
--- a/arch/arm/mach-pxa/mioa701.c
+++ b/arch/arm/mach-pxa/mioa701.c
@@ -176,7 +176,6 @@ static struct pwm_lookup mioa701_pwm_lookup[] = {
static struct platform_pwm_backlight_data mioa701_backlight_data = {
.max_brightness = 100,
.dft_brightness = 50,
- .enable_gpio = -1,
};
/*
diff --git a/arch/arm/mach-pxa/palm27x.c b/arch/arm/mach-pxa/palm27x.c
index b600b63af3a6..0d246a1aebbc 100644
--- a/arch/arm/mach-pxa/palm27x.c
+++ b/arch/arm/mach-pxa/palm27x.c
@@ -318,7 +318,6 @@ static void palm27x_backlight_exit(struct device *dev)
static struct platform_pwm_backlight_data palm27x_backlight_data = {
.max_brightness = 0xfe,
.dft_brightness = 0x7e,
- .enable_gpio = -1,
.init = palm27x_backlight_init,
.notify = palm27x_backlight_notify,
.exit = palm27x_backlight_exit,
diff --git a/arch/arm/mach-pxa/palmtc.c b/arch/arm/mach-pxa/palmtc.c
index fda9deaaae02..455cb8ccaf26 100644
--- a/arch/arm/mach-pxa/palmtc.c
+++ b/arch/arm/mach-pxa/palmtc.c
@@ -174,6 +174,15 @@ static inline void palmtc_keys_init(void) {}
* Backlight
******************************************************************************/
#if defined(CONFIG_BACKLIGHT_PWM) || defined(CONFIG_BACKLIGHT_PWM_MODULE)
+
+static struct gpiod_lookup_table palmtc_pwm_bl_gpio_table = {
+ .dev_id = "pwm-backlight.0",
+ .table = {
+ GPIO_LOOKUP("gpio-pxa", GPIO_NR_PALMTC_BL_POWER,
+ "enable", GPIO_ACTIVE_HIGH),
+ },
+};
+
static struct pwm_lookup palmtc_pwm_lookup[] = {
PWM_LOOKUP("pxa25x-pwm.1", 0, "pwm-backlight.0", NULL, PALMTC_PERIOD_NS,
PWM_POLARITY_NORMAL),
@@ -182,7 +191,6 @@ static struct pwm_lookup palmtc_pwm_lookup[] = {
static struct platform_pwm_backlight_data palmtc_backlight_data = {
.max_brightness = PALMTC_MAX_INTENSITY,
.dft_brightness = PALMTC_MAX_INTENSITY,
- .enable_gpio = GPIO_NR_PALMTC_BL_POWER,
};
static struct platform_device palmtc_backlight = {
@@ -195,6 +203,7 @@ static struct platform_device palmtc_backlight = {
static void __init palmtc_pwm_init(void)
{
+ gpiod_add_lookup_table(&palmtc_pwm_bl_gpio_table);
pwm_add_table(palmtc_pwm_lookup, ARRAY_SIZE(palmtc_pwm_lookup));
platform_device_register(&palmtc_backlight);
}
diff --git a/arch/arm/mach-pxa/palmte2.c b/arch/arm/mach-pxa/palmte2.c
index 7171014fd311..e3bcf58b4e63 100644
--- a/arch/arm/mach-pxa/palmte2.c
+++ b/arch/arm/mach-pxa/palmte2.c
@@ -175,7 +175,6 @@ static void palmte2_backlight_exit(struct device *dev)
static struct platform_pwm_backlight_data palmte2_backlight_data = {
.max_brightness = PALMTE2_MAX_INTENSITY,
.dft_brightness = PALMTE2_MAX_INTENSITY,
- .enable_gpio = -1,
.init = palmte2_backlight_init,
.notify = palmte2_backlight_notify,
.exit = palmte2_backlight_exit,
diff --git a/arch/arm/mach-pxa/pcm990-baseboard.c b/arch/arm/mach-pxa/pcm990-baseboard.c
index cb1c56769fbc..bf613f88d70b 100644
--- a/arch/arm/mach-pxa/pcm990-baseboard.c
+++ b/arch/arm/mach-pxa/pcm990-baseboard.c
@@ -154,7 +154,6 @@ static struct pwm_lookup pcm990_pwm_lookup[] = {
static struct platform_pwm_backlight_data pcm990_backlight_data = {
.max_brightness = 1023,
.dft_brightness = 1023,
- .enable_gpio = -1,
};
static struct platform_device pcm990_backlight_device = {
diff --git a/arch/arm/mach-pxa/spitz.c b/arch/arm/mach-pxa/spitz.c
index a4fdc399d152..371008e9bb02 100644
--- a/arch/arm/mach-pxa/spitz.c
+++ b/arch/arm/mach-pxa/spitz.c
@@ -525,13 +525,33 @@ static void spitz_bl_kick_battery(void)
}
}
+static struct gpiod_lookup_table spitz_lcdcon_gpio_table = {
+ .dev_id = "spi2.1",
+ .table = {
+ GPIO_LOOKUP("gpio-pxa", SPITZ_GPIO_BACKLIGHT_CONT,
+ "BL_CONT", GPIO_ACTIVE_LOW),
+ GPIO_LOOKUP("gpio-pxa", SPITZ_GPIO_BACKLIGHT_ON,
+ "BL_ON", GPIO_ACTIVE_HIGH),
+ { },
+ },
+};
+
+static struct gpiod_lookup_table akita_lcdcon_gpio_table = {
+ .dev_id = "spi2.1",
+ .table = {
+ GPIO_LOOKUP("gpio-pxa", AKITA_GPIO_BACKLIGHT_CONT,
+ "BL_CONT", GPIO_ACTIVE_LOW),
+ GPIO_LOOKUP("gpio-pxa", AKITA_GPIO_BACKLIGHT_ON,
+ "BL_ON", GPIO_ACTIVE_HIGH),
+ { },
+ },
+};
+
static struct corgi_lcd_platform_data spitz_lcdcon_info = {
.init_mode = CORGI_LCD_MODE_VGA,
.max_intensity = 0x2f,
.default_intensity = 0x1f,
.limit_mask = 0x0b,
- .gpio_backlight_cont = SPITZ_GPIO_BACKLIGHT_CONT,
- .gpio_backlight_on = SPITZ_GPIO_BACKLIGHT_ON,
.kick_battery = spitz_bl_kick_battery,
};
@@ -574,12 +594,10 @@ static struct pxa2xx_spi_controller spitz_spi_info = {
static void __init spitz_spi_init(void)
{
- struct corgi_lcd_platform_data *lcd_data = &spitz_lcdcon_info;
-
- if (machine_is_akita()) {
- lcd_data->gpio_backlight_cont = AKITA_GPIO_BACKLIGHT_CONT;
- lcd_data->gpio_backlight_on = AKITA_GPIO_BACKLIGHT_ON;
- }
+ if (machine_is_akita())
+ gpiod_add_lookup_table(&akita_lcdcon_gpio_table);
+ else
+ gpiod_add_lookup_table(&spitz_lcdcon_gpio_table);
pxa2xx_set_spi_info(2, &spitz_spi_info);
spi_register_board_info(ARRAY_AND_SIZE(spitz_spi_devices));
diff --git a/arch/arm/mach-pxa/tavorevb.c b/arch/arm/mach-pxa/tavorevb.c
index 93466fa3b0fe..a15eb3b9484d 100644
--- a/arch/arm/mach-pxa/tavorevb.c
+++ b/arch/arm/mach-pxa/tavorevb.c
@@ -178,13 +178,11 @@ static struct platform_pwm_backlight_data tavorevb_backlight_data[] = {
/* primary backlight */
.max_brightness = 100,
.dft_brightness = 100,
- .enable_gpio = -1,
},
[1] = {
/* secondary backlight */
.max_brightness = 100,
.dft_brightness = 100,
- .enable_gpio = -1,
},
};
diff --git a/arch/arm/mach-pxa/viper.c b/arch/arm/mach-pxa/viper.c
index c06031da6676..3aa34e9a15d3 100644
--- a/arch/arm/mach-pxa/viper.c
+++ b/arch/arm/mach-pxa/viper.c
@@ -404,7 +404,6 @@ static void viper_backlight_exit(struct device *dev)
static struct platform_pwm_backlight_data viper_backlight_data = {
.max_brightness = 100,
.dft_brightness = 100,
- .enable_gpio = -1,
.init = viper_backlight_init,
.notify = viper_backlight_notify,
.exit = viper_backlight_exit,
diff --git a/arch/arm/mach-pxa/z2.c b/arch/arm/mach-pxa/z2.c
index 900cefc4c5ea..21fd76bb09cd 100644
--- a/arch/arm/mach-pxa/z2.c
+++ b/arch/arm/mach-pxa/z2.c
@@ -210,13 +210,11 @@ static struct platform_pwm_backlight_data z2_backlight_data[] = {
/* Keypad Backlight */
.max_brightness = 1023,
.dft_brightness = 0,
- .enable_gpio = -1,
},
[1] = {
/* LCD Backlight */
.max_brightness = 1023,
.dft_brightness = 512,
- .enable_gpio = -1,
},
};
diff --git a/arch/arm/mach-pxa/zylonite.c b/arch/arm/mach-pxa/zylonite.c
index bf2ab5bd49ec..79f0025fa17a 100644
--- a/arch/arm/mach-pxa/zylonite.c
+++ b/arch/arm/mach-pxa/zylonite.c
@@ -117,7 +117,6 @@ static struct pwm_lookup zylonite_pwm_lookup[] = {
static struct platform_pwm_backlight_data zylonite_backlight_data = {
.max_brightness = 100,
.dft_brightness = 100,
- .enable_gpio = -1,
};
static struct platform_device zylonite_backlight_device = {
diff --git a/arch/arm/mach-s3c24xx/mach-h1940.c b/arch/arm/mach-s3c24xx/mach-h1940.c
index 74d6b68e91c7..e1c372e5447b 100644
--- a/arch/arm/mach-s3c24xx/mach-h1940.c
+++ b/arch/arm/mach-s3c24xx/mach-h1940.c
@@ -516,7 +516,6 @@ static void h1940_backlight_exit(struct device *dev)
static struct platform_pwm_backlight_data backlight_data = {
.max_brightness = 100,
.dft_brightness = 50,
- .enable_gpio = -1,
.init = h1940_backlight_init,
.notify = h1940_backlight_notify,
.exit = h1940_backlight_exit,
diff --git a/arch/arm/mach-s3c24xx/mach-rx1950.c b/arch/arm/mach-s3c24xx/mach-rx1950.c
index 03d8f27cdc32..fde98b175c75 100644
--- a/arch/arm/mach-s3c24xx/mach-rx1950.c
+++ b/arch/arm/mach-s3c24xx/mach-rx1950.c
@@ -534,7 +534,6 @@ static int rx1950_backlight_notify(struct device *dev, int brightness)
static struct platform_pwm_backlight_data rx1950_backlight_data = {
.max_brightness = 24,
.dft_brightness = 4,
- .enable_gpio = -1,
.init = rx1950_backlight_init,
.notify = rx1950_backlight_notify,
.exit = rx1950_backlight_exit,
diff --git a/arch/arm/mach-s3c64xx/dev-backlight.c b/arch/arm/mach-s3c64xx/dev-backlight.c
index 799cfdf0606b..09e6da305f60 100644
--- a/arch/arm/mach-s3c64xx/dev-backlight.c
+++ b/arch/arm/mach-s3c64xx/dev-backlight.c
@@ -65,7 +65,6 @@ static struct samsung_bl_drvdata samsung_dfl_bl_data __initdata = {
.plat_data = {
.max_brightness = 255,
.dft_brightness = 255,
- .enable_gpio = -1,
.init = samsung_bl_init,
.exit = samsung_bl_exit,
},
@@ -111,8 +110,6 @@ void __init samsung_bl_set(struct samsung_bl_gpio_info *gpio_info,
samsung_bl_data->dft_brightness = bl_data->dft_brightness;
if (bl_data->lth_brightness)
samsung_bl_data->lth_brightness = bl_data->lth_brightness;
- if (bl_data->enable_gpio >= 0)
- samsung_bl_data->enable_gpio = bl_data->enable_gpio;
if (bl_data->init)
samsung_bl_data->init = bl_data->init;
if (bl_data->notify)
diff --git a/arch/arm/mach-s3c64xx/mach-crag6410.c b/arch/arm/mach-s3c64xx/mach-crag6410.c
index 8ec6a4f5eb05..da9654255e3f 100644
--- a/arch/arm/mach-s3c64xx/mach-crag6410.c
+++ b/arch/arm/mach-s3c64xx/mach-crag6410.c
@@ -114,7 +114,6 @@ static struct pwm_lookup crag6410_pwm_lookup[] = {
static struct platform_pwm_backlight_data crag6410_backlight_data = {
.max_brightness = 1000,
.dft_brightness = 600,
- .enable_gpio = -1,
};
static struct platform_device crag6410_backlight_device = {
diff --git a/arch/arm/mach-s3c64xx/mach-hmt.c b/arch/arm/mach-s3c64xx/mach-hmt.c
index bfe9881d12cc..e7080215c624 100644
--- a/arch/arm/mach-s3c64xx/mach-hmt.c
+++ b/arch/arm/mach-s3c64xx/mach-hmt.c
@@ -115,7 +115,6 @@ static void hmt_bl_exit(struct device *dev)
static struct platform_pwm_backlight_data hmt_backlight_data = {
.max_brightness = 100 * 256,
.dft_brightness = 40 * 256,
- .enable_gpio = -1,
.init = hmt_bl_init,
.notify = hmt_bl_notify,
.exit = hmt_bl_exit,
diff --git a/arch/arm/mach-s3c64xx/mach-smartq.c b/arch/arm/mach-s3c64xx/mach-smartq.c
index 829d5dbd69ee..5025db607c0f 100644
--- a/arch/arm/mach-s3c64xx/mach-smartq.c
+++ b/arch/arm/mach-s3c64xx/mach-smartq.c
@@ -150,7 +150,6 @@ static int smartq_bl_init(struct device *dev)
static struct platform_pwm_backlight_data smartq_backlight_data = {
.max_brightness = 1000,
.dft_brightness = 600,
- .enable_gpio = -1,
.init = smartq_bl_init,
};
diff --git a/arch/arm/mach-s3c64xx/mach-smdk6410.c b/arch/arm/mach-s3c64xx/mach-smdk6410.c
index 908e5aa831c8..56f406c0c3dd 100644
--- a/arch/arm/mach-s3c64xx/mach-smdk6410.c
+++ b/arch/arm/mach-s3c64xx/mach-smdk6410.c
@@ -623,7 +623,7 @@ static struct pwm_lookup smdk6410_pwm_lookup[] = {
};
static struct platform_pwm_backlight_data smdk6410_bl_data = {
- .enable_gpio = -1,
+ /* Intentionally blank */
};
static struct dwc2_hsotg_plat smdk6410_hsotg_pdata;
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index b598e6978b29..2dd5c41cbb8d 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -189,7 +189,7 @@ void do_bad_area(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
*/
static inline bool access_error(unsigned int fsr, struct vm_area_struct *vma)
{
- unsigned int mask = VM_READ | VM_WRITE | VM_EXEC;
+ unsigned int mask = VM_ACCESS_FLAGS;
if ((fsr & FSR_WRITE) && !(fsr & FSR_CM))
mask = VM_WRITE;
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index 69a337df619f..ec8d0008bfa1 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -1646,3 +1646,17 @@ void __init early_mm_init(const struct machine_desc *mdesc)
build_mem_type_table();
early_paging_init(mdesc);
}
+
+void set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pteval)
+{
+ unsigned long ext = 0;
+
+ if (addr < TASK_SIZE && pte_valid_user(pteval)) {
+ if (!pte_special(pteval))
+ __sync_icache_dcache(pteval);
+ ext |= PTE_EXT_NG;
+ }
+
+ set_pte_ext(ptep, pteval, ext);
+}
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 6e41c4b62607..40fb05d96c60 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1502,7 +1502,10 @@ config ARM64_PTR_AUTH
default y
depends on !KVM || ARM64_VHE
depends on (CC_HAS_SIGN_RETURN_ADDRESS || CC_HAS_BRANCH_PROT_PAC_RET) && AS_HAS_PAC
- depends on CC_IS_GCC || (CC_IS_CLANG && AS_HAS_CFI_NEGATE_RA_STATE)
+ # GCC 9.1 and later inserts a .note.gnu.property section note for PAC
+ # which is only understood by binutils starting with version 2.33.1.
+ depends on !CC_IS_GCC || GCC_VERSION < 90100 || LD_VERSION >= 233010000
+ depends on !CC_IS_CLANG || AS_HAS_CFI_NEGATE_RA_STATE
depends on (!FUNCTION_GRAPH_TRACER || DYNAMIC_FTRACE_WITH_REGS)
help
Pointer authentication (part of the ARMv8.3 Extensions) provides
diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug
index 1c906d932d6b..a1efa246c9ed 100644
--- a/arch/arm64/Kconfig.debug
+++ b/arch/arm64/Kconfig.debug
@@ -52,19 +52,6 @@ config DEBUG_WX
If in doubt, say "Y".
-config DEBUG_ALIGN_RODATA
- depends on STRICT_KERNEL_RWX
- bool "Align linker sections up to SECTION_SIZE"
- help
- If this option is enabled, sections that may potentially be marked as
- read only or non-executable will be aligned up to the section size of
- the kernel. This prevents sections from being split into pages and
- avoids a potential TLB penalty. The downside is an increase in
- alignment and potentially wasted space. Turn on this option if
- performance is more important than memory pressure.
-
- If in doubt, say N.
-
config DEBUG_EFI
depends on EFI && DEBUG_INFO
bool "UEFI debugging"
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index f15f92ba53e6..85e4149cc5d5 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -65,6 +65,10 @@ stack_protector_prepare: prepare0
include/generated/asm-offsets.h))
endif
+# Ensure that if the compiler supports branch protection we default it
+# off, this will be overridden if we are using branch protection.
+branch-prot-flags-y += $(call cc-option,-mbranch-protection=none)
+
ifeq ($(CONFIG_ARM64_PTR_AUTH),y)
branch-prot-flags-$(CONFIG_CC_HAS_SIGN_RETURN_ADDRESS) := -msign-return-address=all
branch-prot-flags-$(CONFIG_CC_HAS_BRANCH_PROT_PAC_RET) := -mbranch-protection=pac-ret+leaf
@@ -73,9 +77,10 @@ branch-prot-flags-$(CONFIG_CC_HAS_BRANCH_PROT_PAC_RET) := -mbranch-protection=pa
# we pass it only to the assembler. This option is utilized only in case of non
# integrated assemblers.
branch-prot-flags-$(CONFIG_AS_HAS_PAC) += -Wa,-march=armv8.3-a
-KBUILD_CFLAGS += $(branch-prot-flags-y)
endif
+KBUILD_CFLAGS += $(branch-prot-flags-y)
+
ifeq ($(CONFIG_CPU_BIG_ENDIAN), y)
KBUILD_CPPFLAGS += -mbig-endian
CHECKFLAGS += -D__AARCH64EB__
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index 2be67b232499..a1871bb32bb1 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -120,22 +120,12 @@
/*
* Alignment of kernel segments (e.g. .text, .data).
- */
-#if defined(CONFIG_DEBUG_ALIGN_RODATA)
-/*
- * 4 KB granule: 1 level 2 entry
- * 16 KB granule: 128 level 3 entries, with contiguous bit
- * 64 KB granule: 32 level 3 entries, with contiguous bit
- */
-#define SEGMENT_ALIGN SZ_2M
-#else
-/*
+ *
* 4 KB granule: 16 level 3 entries, with contiguous bit
* 16 KB granule: 4 level 3 entries, without contiguous bit
* 64 KB granule: 1 level 3 entry
*/
#define SEGMENT_ALIGN SZ_64K
-#endif
/*
* Memory types available.
diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h
index 75d6cd23a679..c01b52add377 100644
--- a/arch/arm64/include/asm/page.h
+++ b/arch/arm64/include/asm/page.h
@@ -36,9 +36,7 @@ extern int pfn_valid(unsigned long);
#endif /* !__ASSEMBLY__ */
-#define VM_DATA_DEFAULT_FLAGS \
- (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) | \
- VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC
#include <asm-generic/getorder.h>
diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c
index 4cc581af2d96..c19aa81ddc8c 100644
--- a/arch/arm64/kernel/armv8_deprecated.c
+++ b/arch/arm64/kernel/armv8_deprecated.c
@@ -601,7 +601,7 @@ static struct undef_hook setend_hooks[] = {
},
{
/* Thumb mode */
- .instr_mask = 0x0000fff7,
+ .instr_mask = 0xfffffff7,
.instr_val = 0x0000b650,
.pstate_mask = (PSR_AA32_T_BIT | PSR_AA32_MODE_MASK),
.pstate_val = (PSR_AA32_T_BIT | PSR_AA32_MODE_USR),
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index a475c68cbfec..449386d76441 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -64,6 +64,4 @@ config KVM_ARM_PMU
config KVM_INDIRECT_VECTORS
def_bool KVM && (HARDEN_BRANCH_PREDICTOR || HARDEN_EL2_VECTORS)
-source "drivers/vhost/Kconfig"
-
endif # VIRTUALIZATION
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 1027851d469a..c9cedc0432d2 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -445,7 +445,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
const struct fault_info *inf;
struct mm_struct *mm = current->mm;
vm_fault_t fault, major = 0;
- unsigned long vm_flags = VM_READ | VM_WRITE | VM_EXEC;
+ unsigned long vm_flags = VM_ACCESS_FLAGS;
unsigned int mm_flags = FAULT_FLAG_DEFAULT;
if (kprobe_page_fault(regs, esr))
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index b65dffdfb201..e42727e3568e 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -29,6 +29,7 @@
#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/crash_dump.h>
+#include <linux/hugetlb.h>
#include <asm/boot.h>
#include <asm/fixmap.h>
@@ -457,6 +458,11 @@ void __init arm64_memblock_init(void)
high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
dma_contiguous_reserve(arm64_dma32_phys_limit);
+
+#ifdef CONFIG_ARM64_4K_PAGES
+ hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT);
+#endif
+
}
void __init bootmem_init(void)
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 9b08f7c7e6f0..a374e4f51a62 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1374,7 +1374,7 @@ static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
}
int arch_add_memory(int nid, u64 start, u64 size,
- struct mhp_restrictions *restrictions)
+ struct mhp_params *params)
{
int ret, flags = 0;
@@ -1382,12 +1382,13 @@ int arch_add_memory(int nid, u64 start, u64 size,
flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
- size, PAGE_KERNEL, __pgd_pgtable_alloc, flags);
+ size, params->pgprot, __pgd_pgtable_alloc,
+ flags);
memblock_clear_nomap(start, size);
ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
- restrictions);
+ params);
if (ret)
__remove_pgd_mapping(swapper_pg_dir,
__phys_to_virt(start), size);
diff --git a/arch/c6x/include/asm/page.h b/arch/c6x/include/asm/page.h
index 70db1e7632bc..40079899084d 100644
--- a/arch/c6x/include/asm/page.h
+++ b/arch/c6x/include/asm/page.h
@@ -2,10 +2,7 @@
#ifndef _ASM_C6X_PAGE_H
#define _ASM_C6X_PAGE_H
-#define VM_DATA_DEFAULT_FLAGS \
- (VM_READ | VM_WRITE | \
- ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) | \
- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC
#include <asm-generic/page.h>
diff --git a/arch/csky/include/asm/page.h b/arch/csky/include/asm/page.h
index 9738eacefdc7..9b98bf31d57c 100644
--- a/arch/csky/include/asm/page.h
+++ b/arch/csky/include/asm/page.h
@@ -85,9 +85,6 @@ extern unsigned long va_pa_offset;
PHYS_OFFSET_OFFSET)
#define virt_to_page(x) (mem_map + MAP_NR(x))
-#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \
- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
-
#define pfn_to_kaddr(x) __va(PFN_PHYS(x))
#include <asm-generic/memory_model.h>
diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h
index 9b7764cb7645..9ab4a445ad99 100644
--- a/arch/csky/include/asm/pgtable.h
+++ b/arch/csky/include/asm/pgtable.h
@@ -110,9 +110,6 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
extern void load_pgd(unsigned long pg_dir);
extern pte_t invalid_pte_table[PTRS_PER_PTE];
-static inline int pte_special(pte_t pte) { return 0; }
-static inline pte_t pte_mkspecial(pte_t pte) { return pte; }
-
static inline void set_pte(pte_t *p, pte_t pte)
{
*p = pte;
diff --git a/arch/csky/mm/fault.c b/arch/csky/mm/fault.c
index d3c61b83e195..4e6dc68f3258 100644
--- a/arch/csky/mm/fault.c
+++ b/arch/csky/mm/fault.c
@@ -141,7 +141,7 @@ good_area:
if (!(vma->vm_flags & VM_WRITE))
goto bad_area;
} else {
- if (!(vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)))
+ if (unlikely(!vma_is_accessible(vma)))
goto bad_area;
}
diff --git a/arch/h8300/include/asm/page.h b/arch/h8300/include/asm/page.h
index 8da5124ad344..53e037544239 100644
--- a/arch/h8300/include/asm/page.h
+++ b/arch/h8300/include/asm/page.h
@@ -6,8 +6,6 @@
#include <linux/types.h>
#define MAP_NR(addr) (((uintptr_t)(addr)-PAGE_OFFSET) >> PAGE_SHIFT)
-#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \
- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
#ifndef __ASSEMBLY__
extern unsigned long rom_length;
diff --git a/arch/h8300/include/uapi/asm/bitsperlong.h b/arch/h8300/include/uapi/asm/bitsperlong.h
deleted file mode 100644
index a33e358f1c1b..000000000000
--- a/arch/h8300/include/uapi/asm/bitsperlong.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _UAPI__ASM_H8300_BITS_PER_LONG
-#define _UAPI__ASM_H8300_BITS_PER_LONG
-
-#include <asm-generic/bitsperlong.h>
-
-#if !defined(__ASSEMBLY__)
-/* h8300-unknown-linux required long */
-#define __kernel_size_t __kernel_size_t
-typedef unsigned long __kernel_size_t;
-typedef long __kernel_ssize_t;
-typedef long __kernel_ptrdiff_t;
-#endif
-
-#endif /* _UAPI__ASM_H8300_BITS_PER_LONG */
diff --git a/arch/h8300/include/uapi/asm/posix_types.h b/arch/h8300/include/uapi/asm/posix_types.h
new file mode 100644
index 000000000000..3efc9dd59476
--- /dev/null
+++ b/arch/h8300/include/uapi/asm/posix_types.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_POSIX_TYPES_H
+#define _UAPI_ASM_POSIX_TYPES_H
+
+/* h8300-unknown-linux required long */
+#define __kernel_size_t __kernel_size_t
+typedef unsigned long __kernel_size_t;
+typedef long __kernel_ssize_t;
+typedef long __kernel_ptrdiff_t;
+
+#include <asm-generic/posix_types.h>
+
+#endif /* _UAPI_ASM_POSIX_TYPES_H */
diff --git a/arch/hexagon/include/asm/page.h b/arch/hexagon/include/asm/page.h
index ee31f36f48f3..7cbf719c578e 100644
--- a/arch/hexagon/include/asm/page.h
+++ b/arch/hexagon/include/asm/page.h
@@ -93,8 +93,7 @@ struct page;
#define virt_to_page(kaddr) pfn_to_page(PFN_DOWN(__pa(kaddr)))
/* Default vm area behavior is non-executable. */
-#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | \
- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC
#define pfn_valid(pfn) ((pfn) < max_mapnr)
#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h
index 2fec20ad939e..d383e8bea5b2 100644
--- a/arch/hexagon/include/asm/pgtable.h
+++ b/arch/hexagon/include/asm/pgtable.h
@@ -158,8 +158,6 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; /* located in head.S */
/* Seems to be zero even in architectures where the zero page is firewalled? */
#define FIRST_USER_ADDRESS 0UL
-#define pte_special(pte) 0
-#define pte_mkspecial(pte) (pte)
/* HUGETLB not working currently */
#ifdef CONFIG_HUGETLB_PAGE
diff --git a/arch/ia64/include/asm/page.h b/arch/ia64/include/asm/page.h
index 5798bd2b462c..b69a5499d75b 100644
--- a/arch/ia64/include/asm/page.h
+++ b/arch/ia64/include/asm/page.h
@@ -218,10 +218,7 @@ get_order (unsigned long size)
#define PAGE_OFFSET RGN_BASE(RGN_KERNEL)
-#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | \
- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | \
- (((current->personality & READ_IMPLIES_EXEC) != 0) \
- ? VM_EXEC : 0))
+#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC
#define GATE_ADDR RGN_BASE(RGN_GATE)
diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h
index d602e7c622db..0e7b645b76c6 100644
--- a/arch/ia64/include/asm/pgtable.h
+++ b/arch/ia64/include/asm/pgtable.h
@@ -298,7 +298,6 @@ extern unsigned long VMALLOC_END;
#define pte_exec(pte) ((pte_val(pte) & _PAGE_AR_RX) != 0)
#define pte_dirty(pte) ((pte_val(pte) & _PAGE_D) != 0)
#define pte_young(pte) ((pte_val(pte) & _PAGE_A) != 0)
-#define pte_special(pte) 0
/*
* Note: we convert AR_RWX to AR_RX and AR_RW to AR_R by clearing the 2nd bit in the
@@ -311,7 +310,6 @@ extern unsigned long VMALLOC_END;
#define pte_mkclean(pte) (__pte(pte_val(pte) & ~_PAGE_D))
#define pte_mkdirty(pte) (__pte(pte_val(pte) | _PAGE_D))
#define pte_mkhuge(pte) (__pte(pte_val(pte)))
-#define pte_mkspecial(pte) (pte)
/*
* Because ia64's Icache and Dcache is not coherent (on a cpu), we need to
diff --git a/arch/ia64/kernel/syscalls/syscallhdr.sh b/arch/ia64/kernel/syscalls/syscallhdr.sh
index 0c2d2c748565..f407b6e53283 100644
--- a/arch/ia64/kernel/syscalls/syscallhdr.sh
+++ b/arch/ia64/kernel/syscalls/syscallhdr.sh
@@ -32,5 +32,5 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
printf "#define __NR_syscalls\t%s\n" "${nxt}"
printf "#endif\n"
printf "\n"
- printf "#endif /* %s */" "${fileguard}"
+ printf "#endif /* %s */\n" "${fileguard}"
) > "$out"
diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S
index 1ec6b703c5b4..6b5652ee76f9 100644
--- a/arch/ia64/kernel/vmlinux.lds.S
+++ b/arch/ia64/kernel/vmlinux.lds.S
@@ -54,6 +54,8 @@ SECTIONS {
CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
+ IRQENTRY_TEXT
+ SOFTIRQENTRY_TEXT
*(.gnu.linkonce.t*)
}
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index b01d68a2d5d9..d637b4ea3147 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -670,13 +670,16 @@ mem_init (void)
#ifdef CONFIG_MEMORY_HOTPLUG
int arch_add_memory(int nid, u64 start, u64 size,
- struct mhp_restrictions *restrictions)
+ struct mhp_params *params)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
- ret = __add_pages(nid, start_pfn, nr_pages, restrictions);
+ if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot))
+ return -EINVAL;
+
+ ret = __add_pages(nid, start_pfn, nr_pages, params);
if (ret)
printk("%s: Problem encountered in __add_pages() as ret=%d\n",
__func__, ret);
diff --git a/arch/m68k/68000/timers.c b/arch/m68k/68000/timers.c
index 71ddb4c98726..1c8e8a83c325 100644
--- a/arch/m68k/68000/timers.c
+++ b/arch/m68k/68000/timers.c
@@ -68,14 +68,6 @@ static irqreturn_t hw_tick(int irq, void *dummy)
/***************************************************************************/
-static struct irqaction m68328_timer_irq = {
- .name = "timer",
- .flags = IRQF_TIMER,
- .handler = hw_tick,
-};
-
-/***************************************************************************/
-
static u64 m68328_read_clk(struct clocksource *cs)
{
unsigned long flags;
@@ -102,11 +94,17 @@ static struct clocksource m68328_clk = {
void hw_timer_init(irq_handler_t handler)
{
+ int ret;
+
/* disable timer 1 */
TCTL = 0;
/* set ISR */
- setup_irq(TMR_IRQ_NUM, &m68328_timer_irq);
+ ret = request_irq(TMR_IRQ_NUM, hw_tick, IRQF_TIMER, "timer", NULL);
+ if (ret) {
+ pr_err("Failed to request irq %d (timer): %pe\n", TMR_IRQ_NUM,
+ ERR_PTR(ret));
+ }
/* Restart mode, Enable int, Set clock source */
TCTL = TCTL_OM | TCTL_IRQEN | CLOCK_SOURCE;
diff --git a/arch/m68k/coldfire/pit.c b/arch/m68k/coldfire/pit.c
index eb6f16b0e2e6..fd1d9c915daa 100644
--- a/arch/m68k/coldfire/pit.c
+++ b/arch/m68k/coldfire/pit.c
@@ -111,14 +111,6 @@ static irqreturn_t pit_tick(int irq, void *dummy)
/***************************************************************************/
-static struct irqaction pit_irq = {
- .name = "timer",
- .flags = IRQF_TIMER,
- .handler = pit_tick,
-};
-
-/***************************************************************************/
-
static u64 pit_read_clk(struct clocksource *cs)
{
unsigned long flags;
@@ -146,6 +138,8 @@ static struct clocksource pit_clk = {
void hw_timer_init(irq_handler_t handler)
{
+ int ret;
+
cf_pit_clockevent.cpumask = cpumask_of(smp_processor_id());
cf_pit_clockevent.mult = div_sc(FREQ, NSEC_PER_SEC, 32);
cf_pit_clockevent.max_delta_ns =
@@ -156,7 +150,11 @@ void hw_timer_init(irq_handler_t handler)
cf_pit_clockevent.min_delta_ticks = 0x3f;
clockevents_register_device(&cf_pit_clockevent);
- setup_irq(MCF_IRQ_PIT1, &pit_irq);
+ ret = request_irq(MCF_IRQ_PIT1, pit_tick, IRQF_TIMER, "timer", NULL);
+ if (ret) {
+ pr_err("Failed to request irq %d (timer): %pe\n", MCF_IRQ_PIT1,
+ ERR_PTR(ret));
+ }
clocksource_register_hz(&pit_clk, FREQ);
}
diff --git a/arch/m68k/coldfire/sltimers.c b/arch/m68k/coldfire/sltimers.c
index 1b11e7bacab3..5ab81c9c552d 100644
--- a/arch/m68k/coldfire/sltimers.c
+++ b/arch/m68k/coldfire/sltimers.c
@@ -50,18 +50,19 @@ irqreturn_t mcfslt_profile_tick(int irq, void *dummy)
return IRQ_HANDLED;
}
-static struct irqaction mcfslt_profile_irq = {
- .name = "profile timer",
- .flags = IRQF_TIMER,
- .handler = mcfslt_profile_tick,
-};
-
void mcfslt_profile_init(void)
{
+ int ret;
+
printk(KERN_INFO "PROFILE: lodging TIMER 1 @ %dHz as profile timer\n",
PROFILEHZ);
- setup_irq(MCF_IRQ_PROFILER, &mcfslt_profile_irq);
+ ret = request_irq(MCF_IRQ_PROFILER, mcfslt_profile_tick, IRQF_TIMER,
+ "profile timer", NULL);
+ if (ret) {
+ pr_err("Failed to request irq %d (profile timer): %pe\n",
+ MCF_IRQ_PROFILER, ERR_PTR(ret));
+ }
/* Set up TIMER 2 as high speed profile clock */
__raw_writel(MCF_BUSCLK / PROFILEHZ - 1, PA(MCFSLT_STCNT));
@@ -92,12 +93,6 @@ static irqreturn_t mcfslt_tick(int irq, void *dummy)
return timer_interrupt(irq, dummy);
}
-static struct irqaction mcfslt_timer_irq = {
- .name = "timer",
- .flags = IRQF_TIMER,
- .handler = mcfslt_tick,
-};
-
static u64 mcfslt_read_clk(struct clocksource *cs)
{
unsigned long flags;
@@ -126,6 +121,8 @@ static struct clocksource mcfslt_clk = {
void hw_timer_init(irq_handler_t handler)
{
+ int r;
+
mcfslt_cycles_per_jiffy = MCF_BUSCLK / HZ;
/*
* The coldfire slice timer (SLT) runs from STCNT to 0 included,
@@ -140,7 +137,11 @@ void hw_timer_init(irq_handler_t handler)
mcfslt_cnt = mcfslt_cycles_per_jiffy;
timer_interrupt = handler;
- setup_irq(MCF_IRQ_TIMER, &mcfslt_timer_irq);
+ r = request_irq(MCF_IRQ_TIMER, mcfslt_tick, IRQF_TIMER, "timer", NULL);
+ if (r) {
+ pr_err("Failed to request irq %d (timer): %pe\n", MCF_IRQ_TIMER,
+ ERR_PTR(r));
+ }
clocksource_register_hz(&mcfslt_clk, MCF_BUSCLK);
diff --git a/arch/m68k/coldfire/timers.c b/arch/m68k/coldfire/timers.c
index 227aa5d13709..b8301fddf901 100644
--- a/arch/m68k/coldfire/timers.c
+++ b/arch/m68k/coldfire/timers.c
@@ -82,14 +82,6 @@ static irqreturn_t mcftmr_tick(int irq, void *dummy)
/***************************************************************************/
-static struct irqaction mcftmr_timer_irq = {
- .name = "timer",
- .flags = IRQF_TIMER,
- .handler = mcftmr_tick,
-};
-
-/***************************************************************************/
-
static u64 mcftmr_read_clk(struct clocksource *cs)
{
unsigned long flags;
@@ -118,6 +110,8 @@ static struct clocksource mcftmr_clk = {
void hw_timer_init(irq_handler_t handler)
{
+ int r;
+
__raw_writew(MCFTIMER_TMR_DISABLE, TA(MCFTIMER_TMR));
mcftmr_cycles_per_jiffy = FREQ / HZ;
/*
@@ -134,7 +128,11 @@ void hw_timer_init(irq_handler_t handler)
timer_interrupt = handler;
init_timer_irq();
- setup_irq(MCF_IRQ_TIMER, &mcftmr_timer_irq);
+ r = request_irq(MCF_IRQ_TIMER, mcftmr_tick, IRQF_TIMER, "timer", NULL);
+ if (r) {
+ pr_err("Failed to request irq %d (timer): %pe\n", MCF_IRQ_TIMER,
+ ERR_PTR(r));
+ }
#ifdef CONFIG_HIGHPROFILE
coldfire_profile_init();
@@ -170,14 +168,10 @@ irqreturn_t coldfire_profile_tick(int irq, void *dummy)
/***************************************************************************/
-static struct irqaction coldfire_profile_irq = {
- .name = "profile timer",
- .flags = IRQF_TIMER,
- .handler = coldfire_profile_tick,
-};
-
void coldfire_profile_init(void)
{
+ int ret;
+
printk(KERN_INFO "PROFILE: lodging TIMER2 @ %dHz as profile timer\n",
PROFILEHZ);
@@ -188,7 +182,12 @@ void coldfire_profile_init(void)
__raw_writew(MCFTIMER_TMR_ENORI | MCFTIMER_TMR_CLK16 |
MCFTIMER_TMR_RESTART | MCFTIMER_TMR_ENABLE, PA(MCFTIMER_TMR));
- setup_irq(MCF_IRQ_PROFILER, &coldfire_profile_irq);
+ ret = request_irq(MCF_IRQ_PROFILER, coldfire_profile_tick, IRQF_TIMER,
+ "profile timer", NULL);
+ if (ret) {
+ pr_err("Failed to request irq %d (profile timer): %pe\n",
+ MCF_IRQ_PROFILER, ERR_PTR(ret));
+ }
}
/***************************************************************************/
diff --git a/arch/m68k/include/asm/mcf_pgtable.h b/arch/m68k/include/asm/mcf_pgtable.h
index b9f45aeded25..0031cd387b75 100644
--- a/arch/m68k/include/asm/mcf_pgtable.h
+++ b/arch/m68k/include/asm/mcf_pgtable.h
@@ -235,11 +235,6 @@ static inline int pte_young(pte_t pte)
return pte_val(pte) & CF_PAGE_ACCESSED;
}
-static inline int pte_special(pte_t pte)
-{
- return 0;
-}
-
static inline pte_t pte_wrprotect(pte_t pte)
{
pte_val(pte) &= ~CF_PAGE_WRITABLE;
@@ -312,11 +307,6 @@ static inline pte_t pte_mkcache(pte_t pte)
return pte;
}
-static inline pte_t pte_mkspecial(pte_t pte)
-{
- return pte;
-}
-
#define swapper_pg_dir kernel_pg_dir
extern pgd_t kernel_pg_dir[PTRS_PER_PGD];
diff --git a/arch/m68k/include/asm/motorola_pgtable.h b/arch/m68k/include/asm/motorola_pgtable.h
index 4b91a470ad58..48f19f0ab1e7 100644
--- a/arch/m68k/include/asm/motorola_pgtable.h
+++ b/arch/m68k/include/asm/motorola_pgtable.h
@@ -174,7 +174,6 @@ static inline void pud_set(pud_t *pudp, pmd_t *pmdp)
static inline int pte_write(pte_t pte) { return !(pte_val(pte) & _PAGE_RONLY); }
static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; }
static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; }
-static inline int pte_special(pte_t pte) { return 0; }
static inline pte_t pte_wrprotect(pte_t pte) { pte_val(pte) |= _PAGE_RONLY; return pte; }
static inline pte_t pte_mkclean(pte_t pte) { pte_val(pte) &= ~_PAGE_DIRTY; return pte; }
@@ -192,7 +191,6 @@ static inline pte_t pte_mkcache(pte_t pte)
pte_val(pte) = (pte_val(pte) & _CACHEMASK040) | m68k_supervisor_cachemode;
return pte;
}
-static inline pte_t pte_mkspecial(pte_t pte) { return pte; }
#define PAGE_DIR_OFFSET(tsk,address) pgd_offset((tsk),(address))
diff --git a/arch/m68k/include/asm/page.h b/arch/m68k/include/asm/page.h
index da546487e177..2614a1206f2f 100644
--- a/arch/m68k/include/asm/page.h
+++ b/arch/m68k/include/asm/page.h
@@ -65,9 +65,6 @@ extern unsigned long _ramend;
#define __phys_to_pfn(paddr) ((unsigned long)((paddr) >> PAGE_SHIFT))
#define __pfn_to_phys(pfn) PFN_PHYS(pfn)
-#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \
- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
-
#include <asm-generic/getorder.h>
#endif /* _M68K_PAGE_H */
diff --git a/arch/m68k/include/asm/sun3_pgtable.h b/arch/m68k/include/asm/sun3_pgtable.h
index bc4155264810..0caa18a08437 100644
--- a/arch/m68k/include/asm/sun3_pgtable.h
+++ b/arch/m68k/include/asm/sun3_pgtable.h
@@ -155,7 +155,6 @@ static inline void pmd_clear (pmd_t *pmdp) { pmd_val (*pmdp) = 0; }
static inline int pte_write(pte_t pte) { return pte_val(pte) & SUN3_PAGE_WRITEABLE; }
static inline int pte_dirty(pte_t pte) { return pte_val(pte) & SUN3_PAGE_MODIFIED; }
static inline int pte_young(pte_t pte) { return pte_val(pte) & SUN3_PAGE_ACCESSED; }
-static inline int pte_special(pte_t pte) { return 0; }
static inline pte_t pte_wrprotect(pte_t pte) { pte_val(pte) &= ~SUN3_PAGE_WRITEABLE; return pte; }
static inline pte_t pte_mkclean(pte_t pte) { pte_val(pte) &= ~SUN3_PAGE_MODIFIED; return pte; }
@@ -168,7 +167,6 @@ static inline pte_t pte_mknocache(pte_t pte) { pte_val(pte) |= SUN3_PAGE_NOCACHE
//static inline pte_t pte_mkcache(pte_t pte) { pte_val(pte) &= SUN3_PAGE_NOCACHE; return pte; }
// until then, use:
static inline pte_t pte_mkcache(pte_t pte) { return pte; }
-static inline pte_t pte_mkspecial(pte_t pte) { return pte; }
extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
extern pgd_t kernel_pg_dir[PTRS_PER_PGD];
diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c
index f7afb9897966..3bfb5c8ac3c7 100644
--- a/arch/m68k/mm/fault.c
+++ b/arch/m68k/mm/fault.c
@@ -125,7 +125,7 @@ good_area:
case 1: /* read, present */
goto acc_err;
case 0: /* read, not present */
- if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+ if (unlikely(!vma_is_accessible(vma)))
goto acc_err;
}
diff --git a/arch/microblaze/include/asm/page.h b/arch/microblaze/include/asm/page.h
index ae7215c94706..b13463d39b38 100644
--- a/arch/microblaze/include/asm/page.h
+++ b/arch/microblaze/include/asm/page.h
@@ -194,8 +194,6 @@ extern int page_is_ram(unsigned long pfn);
#ifdef CONFIG_MMU
-#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \
- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
#endif /* CONFIG_MMU */
#endif /* __KERNEL__ */
diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h
index 45b30878fc17..6b056f6545d8 100644
--- a/arch/microblaze/include/asm/pgtable.h
+++ b/arch/microblaze/include/asm/pgtable.h
@@ -77,10 +77,6 @@ extern pte_t *va_to_pte(unsigned long address);
* Undefined behaviour if not..
*/
-static inline int pte_special(pte_t pte) { return 0; }
-
-static inline pte_t pte_mkspecial(pte_t pte) { return pte; }
-
/* Start and end of the vmalloc area. */
/* Make sure to map the vmalloc area above the pinned kernel memory area
of 32Mb. */
diff --git a/arch/microblaze/kernel/syscalls/syscallhdr.sh b/arch/microblaze/kernel/syscalls/syscallhdr.sh
index 2e9062a926a3..a914854f8d9f 100644
--- a/arch/microblaze/kernel/syscalls/syscallhdr.sh
+++ b/arch/microblaze/kernel/syscalls/syscallhdr.sh
@@ -32,5 +32,5 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
printf "#define __NR_syscalls\t%s\n" "${nxt}"
printf "#endif\n"
printf "\n"
- printf "#endif /* %s */" "${fileguard}"
+ printf "#endif /* %s */\n" "${fileguard}"
) > "$out"
diff --git a/arch/mips/fw/arc/memory.c b/arch/mips/fw/arc/memory.c
index dbbcddc82823..89fa6e62a3b3 100644
--- a/arch/mips/fw/arc/memory.c
+++ b/arch/mips/fw/arc/memory.c
@@ -117,7 +117,7 @@ static int __init prom_memtype_classify(union linux_memtypes type)
return memtype_classify_arc(type);
}
-void __init prom_meminit(void)
+void __weak __init prom_meminit(void)
{
struct linux_mdesc *p;
@@ -162,7 +162,7 @@ void __weak __init prom_cleanup(void)
{
}
-void __init prom_free_prom_memory(void)
+void __weak __init prom_free_prom_memory(void)
{
int i;
diff --git a/arch/mips/include/asm/page.h b/arch/mips/include/asm/page.h
index 0ba4ce6e2bf3..e2f503fc7a84 100644
--- a/arch/mips/include/asm/page.h
+++ b/arch/mips/include/asm/page.h
@@ -253,10 +253,7 @@ extern bool __virt_addr_valid(const volatile void *kaddr);
#define virt_addr_valid(kaddr) \
__virt_addr_valid((const volatile void *) (kaddr))
-#define VM_DATA_DEFAULT_FLAGS \
- (VM_READ | VM_WRITE | \
- ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) | \
- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC
#include <asm-generic/memory_model.h>
#include <asm-generic/getorder.h>
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index aef5378f909c..f1801e7a4b15 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -270,6 +270,36 @@ cache_sync_done:
extern pgd_t swapper_pg_dir[];
/*
+ * Platform specific pte_special() and pte_mkspecial() definitions
+ * are required only when ARCH_HAS_PTE_SPECIAL is enabled.
+ */
+#if defined(CONFIG_ARCH_HAS_PTE_SPECIAL)
+#if defined(CONFIG_PHYS_ADDR_T_64BIT) && defined(CONFIG_CPU_MIPS32)
+static inline int pte_special(pte_t pte)
+{
+ return pte.pte_low & _PAGE_SPECIAL;
+}
+
+static inline pte_t pte_mkspecial(pte_t pte)
+{
+ pte.pte_low |= _PAGE_SPECIAL;
+ return pte;
+}
+#else
+static inline int pte_special(pte_t pte)
+{
+ return pte_val(pte) & _PAGE_SPECIAL;
+}
+
+static inline pte_t pte_mkspecial(pte_t pte)
+{
+ pte_val(pte) |= _PAGE_SPECIAL;
+ return pte;
+}
+#endif
+#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
+
+/*
* The following only work if pte_present() is true.
* Undefined behaviour if not..
*/
@@ -277,7 +307,6 @@ extern pgd_t swapper_pg_dir[];
static inline int pte_write(pte_t pte) { return pte.pte_low & _PAGE_WRITE; }
static inline int pte_dirty(pte_t pte) { return pte.pte_low & _PAGE_MODIFIED; }
static inline int pte_young(pte_t pte) { return pte.pte_low & _PAGE_ACCESSED; }
-static inline int pte_special(pte_t pte) { return pte.pte_low & _PAGE_SPECIAL; }
static inline pte_t pte_wrprotect(pte_t pte)
{
@@ -338,17 +367,10 @@ static inline pte_t pte_mkyoung(pte_t pte)
}
return pte;
}
-
-static inline pte_t pte_mkspecial(pte_t pte)
-{
- pte.pte_low |= _PAGE_SPECIAL;
- return pte;
-}
#else
static inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_WRITE; }
static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_MODIFIED; }
static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; }
-static inline int pte_special(pte_t pte) { return pte_val(pte) & _PAGE_SPECIAL; }
static inline pte_t pte_wrprotect(pte_t pte)
{
@@ -392,12 +414,6 @@ static inline pte_t pte_mkyoung(pte_t pte)
return pte;
}
-static inline pte_t pte_mkspecial(pte_t pte)
-{
- pte_val(pte) |= _PAGE_SPECIAL;
- return pte;
-}
-
#ifdef CONFIG_MIPS_HUGE_TLB_SUPPORT
static inline int pte_huge(pte_t pte) { return pte_val(pte) & _PAGE_HUGE; }
diff --git a/arch/mips/kernel/syscalls/syscallhdr.sh b/arch/mips/kernel/syscalls/syscallhdr.sh
index d2bcfa8f4d1a..2e241e713a7d 100644
--- a/arch/mips/kernel/syscalls/syscallhdr.sh
+++ b/arch/mips/kernel/syscalls/syscallhdr.sh
@@ -32,6 +32,5 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
printf "#define __NR_syscalls\t%s\n" "${nxt}"
printf "#endif\n"
printf "\n"
- printf "#endif /* %s */" "${fileguard}"
- printf "\n"
+ printf "#endif /* %s */\n" "${fileguard}"
) > "$out"
diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig
index eac25aef21e0..b91d145aa2d5 100644
--- a/arch/mips/kvm/Kconfig
+++ b/arch/mips/kvm/Kconfig
@@ -72,6 +72,4 @@ config KVM_MIPS_DEBUG_COP0_COUNTERS
If unsure, say N.
-source "drivers/vhost/Kconfig"
-
endif # VIRTUALIZATION
diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c
index 4a0eafe3d932..f8d62cd83b36 100644
--- a/arch/mips/mm/fault.c
+++ b/arch/mips/mm/fault.c
@@ -142,7 +142,7 @@ good_area:
goto bad_area;
}
} else {
- if (!(vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)))
+ if (unlikely(!vma_is_accessible(vma)))
goto bad_area;
}
}
diff --git a/arch/nds32/include/asm/page.h b/arch/nds32/include/asm/page.h
index 86b32014c5f9..add33a7f02c8 100644
--- a/arch/nds32/include/asm/page.h
+++ b/arch/nds32/include/asm/page.h
@@ -59,9 +59,6 @@ typedef struct page *pgtable_t;
#endif /* !__ASSEMBLY__ */
-#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \
- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
-
#endif /* __KERNEL__ */
#endif
diff --git a/arch/nds32/include/asm/pgtable.h b/arch/nds32/include/asm/pgtable.h
index 6abc58ac406d..476cc4dd1709 100644
--- a/arch/nds32/include/asm/pgtable.h
+++ b/arch/nds32/include/asm/pgtable.h
@@ -286,15 +286,6 @@ PTE_BIT_FUNC(mkclean, &=~_PAGE_D);
PTE_BIT_FUNC(mkdirty, |=_PAGE_D);
PTE_BIT_FUNC(mkold, &=~_PAGE_YOUNG);
PTE_BIT_FUNC(mkyoung, |=_PAGE_YOUNG);
-static inline int pte_special(pte_t pte)
-{
- return 0;
-}
-
-static inline pte_t pte_mkspecial(pte_t pte)
-{
- return pte;
-}
/*
* Mark the prot value as uncacheable and unbufferable.
diff --git a/arch/nds32/kernel/vmlinux.lds.S b/arch/nds32/kernel/vmlinux.lds.S
index f679d3397436..7a6c1cefe3fe 100644
--- a/arch/nds32/kernel/vmlinux.lds.S
+++ b/arch/nds32/kernel/vmlinux.lds.S
@@ -47,6 +47,7 @@ SECTIONS
LOCK_TEXT
KPROBES_TEXT
IRQENTRY_TEXT
+ SOFTIRQENTRY_TEXT
*(.fixup)
}
diff --git a/arch/nds32/mm/fault.c b/arch/nds32/mm/fault.c
index 0cf0c08c7da2..f331e533edc2 100644
--- a/arch/nds32/mm/fault.c
+++ b/arch/nds32/mm/fault.c
@@ -79,7 +79,7 @@ void do_page_fault(unsigned long entry, unsigned long addr,
struct vm_area_struct *vma;
int si_code;
vm_fault_t fault;
- unsigned int mask = VM_READ | VM_WRITE | VM_EXEC;
+ unsigned int mask = VM_ACCESS_FLAGS;
unsigned int flags = FAULT_FLAG_DEFAULT;
error_code = error_code & (ITYPE_mskINST | ITYPE_mskETYPE);
diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig
index 2fc4ed210b5f..c6645141bb2a 100644
--- a/arch/nios2/Kconfig
+++ b/arch/nios2/Kconfig
@@ -7,6 +7,7 @@ config NIOS2
select ARCH_HAS_SYNC_DMA_FOR_DEVICE
select ARCH_HAS_DMA_SET_UNCACHED
select ARCH_NO_SWAP
+ select COMMON_CLK
select TIMER_OF
select GENERIC_ATOMIC64
select GENERIC_CLOCKEVENTS
diff --git a/arch/nios2/boot/dts/10m50_devboard.dts b/arch/nios2/boot/dts/10m50_devboard.dts
index 5e4ab032c1e8..56339bef3247 100644
--- a/arch/nios2/boot/dts/10m50_devboard.dts
+++ b/arch/nios2/boot/dts/10m50_devboard.dts
@@ -179,8 +179,7 @@
led_pio: gpio@180014d0 {
compatible = "altr,pio-1.0";
reg = <0x180014d0 0x00000010>;
- altr,gpio-bank-width = <4>;
- resetvalue = <15>;
+ altr,ngpio = <4>;
#gpio-cells = <2>;
gpio-controller;
};
@@ -190,11 +189,10 @@
reg = <0x180014c0 0x00000010>;
interrupt-parent = <&cpu>;
interrupts = <6>;
- altr,gpio-bank-width = <3>;
+ altr,ngpio = <3>;
altr,interrupt-type = <2>;
edge_type = <1>;
level_trigger = <0>;
- resetvalue = <0>;
#gpio-cells = <2>;
gpio-controller;
};
diff --git a/arch/nios2/include/asm/page.h b/arch/nios2/include/asm/page.h
index 79fcac61f6ef..6a989819a7c1 100644
--- a/arch/nios2/include/asm/page.h
+++ b/arch/nios2/include/asm/page.h
@@ -98,8 +98,7 @@ static inline bool pfn_valid(unsigned long pfn)
# define virt_to_page(vaddr) pfn_to_page(PFN_DOWN(virt_to_phys(vaddr)))
# define virt_addr_valid(vaddr) pfn_valid(PFN_DOWN(virt_to_phys(vaddr)))
-# define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | \
- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+# define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC
#include <asm-generic/memory_model.h>
diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h
index 99985d8b7166..f98b7f4519ba 100644
--- a/arch/nios2/include/asm/pgtable.h
+++ b/arch/nios2/include/asm/pgtable.h
@@ -113,7 +113,6 @@ static inline int pte_dirty(pte_t pte) \
{ return pte_val(pte) & _PAGE_DIRTY; }
static inline int pte_young(pte_t pte) \
{ return pte_val(pte) & _PAGE_ACCESSED; }
-static inline int pte_special(pte_t pte) { return 0; }
#define pgprot_noncached pgprot_noncached
@@ -168,8 +167,6 @@ static inline pte_t pte_mkdirty(pte_t pte)
return pte;
}
-static inline pte_t pte_mkspecial(pte_t pte) { return pte; }
-
static inline pte_t pte_mkyoung(pte_t pte)
{
pte_val(pte) |= _PAGE_ACCESSED;
diff --git a/arch/nios2/platform/platform.c b/arch/nios2/platform/platform.c
index 2a35154ca153..9737a87121fa 100644
--- a/arch/nios2/platform/platform.c
+++ b/arch/nios2/platform/platform.c
@@ -15,6 +15,12 @@
#include <linux/slab.h>
#include <linux/sys_soc.h>
#include <linux/io.h>
+#include <linux/clk-provider.h>
+
+static const struct of_device_id clk_match[] __initconst = {
+ { .compatible = "fixed-clock", .data = of_fixed_clk_setup, },
+ {}
+};
static int __init nios2_soc_device_init(void)
{
@@ -38,6 +44,8 @@ static int __init nios2_soc_device_init(void)
}
}
+ of_clk_init(clk_match);
+
return 0;
}
diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig
index 7e94fe37cb2f..8588996165ae 100644
--- a/arch/openrisc/Kconfig
+++ b/arch/openrisc/Kconfig
@@ -16,6 +16,7 @@ config OPENRISC
select HANDLE_DOMAIN_IRQ
select GPIOLIB
select HAVE_ARCH_TRACEHOOK
+ select HAVE_COPY_THREAD_TLS
select SPARSE_IRQ
select GENERIC_IRQ_CHIP
select GENERIC_IRQ_PROBE
diff --git a/arch/openrisc/configs/or1ksim_defconfig b/arch/openrisc/configs/or1ksim_defconfig
index d8ff4f8ffb88..75f2da324d0e 100644
--- a/arch/openrisc/configs/or1ksim_defconfig
+++ b/arch/openrisc/configs/or1ksim_defconfig
@@ -1,4 +1,3 @@
-CONFIG_CROSS_COMPILE="or1k-linux-"
CONFIG_NO_HZ=y
CONFIG_LOG_BUF_SHIFT=14
CONFIG_BLK_DEV_INITRD=y
diff --git a/arch/openrisc/configs/simple_smp_defconfig b/arch/openrisc/configs/simple_smp_defconfig
index 64278992df9c..ff49d868e040 100644
--- a/arch/openrisc/configs/simple_smp_defconfig
+++ b/arch/openrisc/configs/simple_smp_defconfig
@@ -1,4 +1,3 @@
-CONFIG_CROSS_COMPILE="or1k-linux-"
CONFIG_LOCALVERSION="-simple-smp"
CONFIG_NO_HZ=y
CONFIG_LOG_BUF_SHIFT=14
diff --git a/arch/openrisc/include/asm/page.h b/arch/openrisc/include/asm/page.h
index 01069db59454..aab6e64d6db4 100644
--- a/arch/openrisc/include/asm/page.h
+++ b/arch/openrisc/include/asm/page.h
@@ -86,11 +86,6 @@ typedef struct page *pgtable_t;
#endif /* __ASSEMBLY__ */
-
-#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \
- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
-
-
#include <asm-generic/memory_model.h>
#include <asm-generic/getorder.h>
diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h
index 248d22d8faa7..7f3fb9ceb083 100644
--- a/arch/openrisc/include/asm/pgtable.h
+++ b/arch/openrisc/include/asm/pgtable.h
@@ -236,8 +236,6 @@ static inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_WRITE; }
static inline int pte_exec(pte_t pte) { return pte_val(pte) & _PAGE_EXEC; }
static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; }
static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; }
-static inline int pte_special(pte_t pte) { return 0; }
-static inline pte_t pte_mkspecial(pte_t pte) { return pte; }
static inline pte_t pte_wrprotect(pte_t pte)
{
diff --git a/arch/openrisc/include/uapi/asm/unistd.h b/arch/openrisc/include/uapi/asm/unistd.h
index 566f8c4f8047..fae34c60fa88 100644
--- a/arch/openrisc/include/uapi/asm/unistd.h
+++ b/arch/openrisc/include/uapi/asm/unistd.h
@@ -24,6 +24,7 @@
#define __ARCH_WANT_SET_GET_RLIMIT
#define __ARCH_WANT_SYS_FORK
#define __ARCH_WANT_SYS_CLONE
+#define __ARCH_WANT_SYS_CLONE3
#define __ARCH_WANT_TIME32_SYSCALLS
#include <asm-generic/unistd.h>
diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c
index b06f84f6676f..6bcdca424e11 100644
--- a/arch/openrisc/kernel/process.c
+++ b/arch/openrisc/kernel/process.c
@@ -117,12 +117,12 @@ void release_thread(struct task_struct *dead_task)
extern asmlinkage void ret_from_fork(void);
/*
- * copy_thread
+ * copy_thread_tls
* @clone_flags: flags
* @usp: user stack pointer or fn for kernel thread
* @arg: arg to fn for kernel thread; always NULL for userspace thread
* @p: the newly created task
- * @regs: CPU context to copy for userspace thread; always NULL for kthread
+ * @tls: the Thread Local Storage pointer for the new process
*
* At the top of a newly initialized kernel stack are two stacked pt_reg
* structures. The first (topmost) is the userspace context of the thread.
@@ -148,8 +148,8 @@ extern asmlinkage void ret_from_fork(void);
*/
int
-copy_thread(unsigned long clone_flags, unsigned long usp,
- unsigned long arg, struct task_struct *p)
+copy_thread_tls(unsigned long clone_flags, unsigned long usp,
+ unsigned long arg, struct task_struct *p, unsigned long tls)
{
struct pt_regs *userregs;
struct pt_regs *kregs;
@@ -179,16 +179,10 @@ copy_thread(unsigned long clone_flags, unsigned long usp,
userregs->sp = usp;
/*
- * For CLONE_SETTLS set "tp" (r10) to the TLS pointer passed to sys_clone.
- *
- * The kernel entry is:
- * int clone (long flags, void *child_stack, int *parent_tid,
- * int *child_tid, struct void *tls)
- *
- * This makes the source r7 in the kernel registers.
+ * For CLONE_SETTLS set "tp" (r10) to the TLS pointer.
*/
if (clone_flags & CLONE_SETTLS)
- userregs->gpr[10] = userregs->gpr[7];
+ userregs->gpr[10] = tls;
userregs->gpr[11] = 0; /* Result from fork() */
diff --git a/arch/openrisc/kernel/smp.c b/arch/openrisc/kernel/smp.c
index 7d518ee8bddc..bd1e660bbc89 100644
--- a/arch/openrisc/kernel/smp.c
+++ b/arch/openrisc/kernel/smp.c
@@ -14,6 +14,7 @@
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/irq.h>
#include <asm/cpuinfo.h>
#include <asm/mmu_context.h>
@@ -113,7 +114,7 @@ asmlinkage __init void secondary_start_kernel(void)
* All kernel threads share the same mm context; grab a
* reference and switch to it.
*/
- atomic_inc(&mm->mm_count);
+ mmgrab(mm);
current->active_mm = mm;
cpumask_set_cpu(cpu, mm_cpumask(mm));
diff --git a/arch/openrisc/kernel/traps.c b/arch/openrisc/kernel/traps.c
index 932a8ec2b520..c11aa2e17ce0 100644
--- a/arch/openrisc/kernel/traps.c
+++ b/arch/openrisc/kernel/traps.c
@@ -55,13 +55,6 @@ void show_stack(struct task_struct *task, unsigned long *esp)
unwind_stack(NULL, esp, print_trace);
}
-void show_trace_task(struct task_struct *tsk)
-{
- /*
- * TODO: SysRq-T trace dump...
- */
-}
-
void show_registers(struct pt_regs *regs)
{
int i;
diff --git a/arch/parisc/include/asm/page.h b/arch/parisc/include/asm/page.h
index 796ae29e9b9a..6b3f6740a6a6 100644
--- a/arch/parisc/include/asm/page.h
+++ b/arch/parisc/include/asm/page.h
@@ -180,9 +180,6 @@ extern int npmem_ranges;
#define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT)
#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
-#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \
- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
-
#include <asm-generic/memory_model.h>
#include <asm-generic/getorder.h>
#include <asm/pdc.h>
diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h
index f0a365950536..9832c73a7021 100644
--- a/arch/parisc/include/asm/pgtable.h
+++ b/arch/parisc/include/asm/pgtable.h
@@ -377,7 +377,6 @@ static inline void pud_clear(pud_t *pud) {
static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; }
static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; }
static inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_WRITE; }
-static inline int pte_special(pte_t pte) { return 0; }
static inline pte_t pte_mkclean(pte_t pte) { pte_val(pte) &= ~_PAGE_DIRTY; return pte; }
static inline pte_t pte_mkold(pte_t pte) { pte_val(pte) &= ~_PAGE_ACCESSED; return pte; }
@@ -385,7 +384,6 @@ static inline pte_t pte_wrprotect(pte_t pte) { pte_val(pte) &= ~_PAGE_WRITE; ret
static inline pte_t pte_mkdirty(pte_t pte) { pte_val(pte) |= _PAGE_DIRTY; return pte; }
static inline pte_t pte_mkyoung(pte_t pte) { pte_val(pte) |= _PAGE_ACCESSED; return pte; }
static inline pte_t pte_mkwrite(pte_t pte) { pte_val(pte) |= _PAGE_WRITE; return pte; }
-static inline pte_t pte_mkspecial(pte_t pte) { return pte; }
/*
* Huge pte definitions.
diff --git a/arch/parisc/include/asm/spinlock.h b/arch/parisc/include/asm/spinlock.h
index 197d2247e4db..70fecb8dc4e2 100644
--- a/arch/parisc/include/asm/spinlock.h
+++ b/arch/parisc/include/asm/spinlock.h
@@ -10,25 +10,34 @@
static inline int arch_spin_is_locked(arch_spinlock_t *x)
{
volatile unsigned int *a = __ldcw_align(x);
+ smp_mb();
return *a == 0;
}
-#define arch_spin_lock(lock) arch_spin_lock_flags(lock, 0)
+static inline void arch_spin_lock(arch_spinlock_t *x)
+{
+ volatile unsigned int *a;
+
+ a = __ldcw_align(x);
+ while (__ldcw(a) == 0)
+ while (*a == 0)
+ cpu_relax();
+}
static inline void arch_spin_lock_flags(arch_spinlock_t *x,
unsigned long flags)
{
volatile unsigned int *a;
+ unsigned long flags_dis;
a = __ldcw_align(x);
- while (__ldcw(a) == 0)
+ while (__ldcw(a) == 0) {
+ local_save_flags(flags_dis);
+ local_irq_restore(flags);
while (*a == 0)
- if (flags & PSW_SM_I) {
- local_irq_enable();
- cpu_relax();
- local_irq_disable();
- } else
- cpu_relax();
+ cpu_relax();
+ local_irq_restore(flags_dis);
+ }
}
#define arch_spin_lock_flags arch_spin_lock_flags
@@ -58,116 +67,93 @@ static inline int arch_spin_trylock(arch_spinlock_t *x)
/*
* Read-write spinlocks, allowing multiple readers but only one writer.
- * Linux rwlocks are unfair to writers; they can be starved for an indefinite
- * time by readers. With care, they can also be taken in interrupt context.
+ * Unfair locking as Writers could be starved indefinitely by Reader(s)
*
- * In the PA-RISC implementation, we have a spinlock and a counter.
- * Readers use the lock to serialise their access to the counter (which
- * records how many readers currently hold the lock).
- * Writers hold the spinlock, preventing any readers or other writers from
- * grabbing the rwlock.
+ * The spinlock itself is contained in @counter and access to it is
+ * serialized with @lock_mutex.
*/
-/* Note that we have to ensure interrupts are disabled in case we're
- * interrupted by some other code that wants to grab the same read lock */
-static __inline__ void arch_read_lock(arch_rwlock_t *rw)
+/* 1 - lock taken successfully */
+static inline int arch_read_trylock(arch_rwlock_t *rw)
{
+ int ret = 0;
unsigned long flags;
- local_irq_save(flags);
- arch_spin_lock_flags(&rw->lock, flags);
- rw->counter++;
- arch_spin_unlock(&rw->lock);
- local_irq_restore(flags);
-}
-/* Note that we have to ensure interrupts are disabled in case we're
- * interrupted by some other code that wants to grab the same read lock */
-static __inline__ void arch_read_unlock(arch_rwlock_t *rw)
-{
- unsigned long flags;
local_irq_save(flags);
- arch_spin_lock_flags(&rw->lock, flags);
- rw->counter--;
- arch_spin_unlock(&rw->lock);
+ arch_spin_lock(&(rw->lock_mutex));
+
+ /*
+ * zero means writer holds the lock exclusively, deny Reader.
+ * Otherwise grant lock to first/subseq reader
+ */
+ if (rw->counter > 0) {
+ rw->counter--;
+ ret = 1;
+ }
+
+ arch_spin_unlock(&(rw->lock_mutex));
local_irq_restore(flags);
+
+ return ret;
}
-/* Note that we have to ensure interrupts are disabled in case we're
- * interrupted by some other code that wants to grab the same read lock */
-static __inline__ int arch_read_trylock(arch_rwlock_t *rw)
+/* 1 - lock taken successfully */
+static inline int arch_write_trylock(arch_rwlock_t *rw)
{
+ int ret = 0;
unsigned long flags;
- retry:
+
local_irq_save(flags);
- if (arch_spin_trylock(&rw->lock)) {
- rw->counter++;
- arch_spin_unlock(&rw->lock);
- local_irq_restore(flags);
- return 1;
+ arch_spin_lock(&(rw->lock_mutex));
+
+ /*
+ * If reader(s) hold lock (lock < __ARCH_RW_LOCK_UNLOCKED__),
+ * deny writer. Otherwise if unlocked grant to writer
+ * Hence the claim that Linux rwlocks are unfair to writers.
+ * (can be starved for an indefinite time by readers).
+ */
+ if (rw->counter == __ARCH_RW_LOCK_UNLOCKED__) {
+ rw->counter = 0;
+ ret = 1;
}
-
+ arch_spin_unlock(&(rw->lock_mutex));
local_irq_restore(flags);
- /* If write-locked, we fail to acquire the lock */
- if (rw->counter < 0)
- return 0;
- /* Wait until we have a realistic chance at the lock */
- while (arch_spin_is_locked(&rw->lock) && rw->counter >= 0)
+ return ret;
+}
+
+static inline void arch_read_lock(arch_rwlock_t *rw)
+{
+ while (!arch_read_trylock(rw))
cpu_relax();
+}
- goto retry;
+static inline void arch_write_lock(arch_rwlock_t *rw)
+{
+ while (!arch_write_trylock(rw))
+ cpu_relax();
}
-/* Note that we have to ensure interrupts are disabled in case we're
- * interrupted by some other code that wants to read_trylock() this lock */
-static __inline__ void arch_write_lock(arch_rwlock_t *rw)
+static inline void arch_read_unlock(arch_rwlock_t *rw)
{
unsigned long flags;
-retry:
- local_irq_save(flags);
- arch_spin_lock_flags(&rw->lock, flags);
- if (rw->counter != 0) {
- arch_spin_unlock(&rw->lock);
- local_irq_restore(flags);
-
- while (rw->counter != 0)
- cpu_relax();
-
- goto retry;
- }
-
- rw->counter = -1; /* mark as write-locked */
- mb();
+ local_irq_save(flags);
+ arch_spin_lock(&(rw->lock_mutex));
+ rw->counter++;
+ arch_spin_unlock(&(rw->lock_mutex));
local_irq_restore(flags);
}
-static __inline__ void arch_write_unlock(arch_rwlock_t *rw)
-{
- rw->counter = 0;
- arch_spin_unlock(&rw->lock);
-}
-
-/* Note that we have to ensure interrupts are disabled in case we're
- * interrupted by some other code that wants to read_trylock() this lock */
-static __inline__ int arch_write_trylock(arch_rwlock_t *rw)
+static inline void arch_write_unlock(arch_rwlock_t *rw)
{
unsigned long flags;
- int result = 0;
local_irq_save(flags);
- if (arch_spin_trylock(&rw->lock)) {
- if (rw->counter == 0) {
- rw->counter = -1;
- result = 1;
- } else {
- /* Read-locked. Oh well. */
- arch_spin_unlock(&rw->lock);
- }
- }
+ arch_spin_lock(&(rw->lock_mutex));
+ rw->counter = __ARCH_RW_LOCK_UNLOCKED__;
+ arch_spin_unlock(&(rw->lock_mutex));
local_irq_restore(flags);
-
- return result;
}
#endif /* __ASM_SPINLOCK_H */
diff --git a/arch/parisc/include/asm/spinlock_types.h b/arch/parisc/include/asm/spinlock_types.h
index 42979c5704dc..ca39ee350c3f 100644
--- a/arch/parisc/include/asm/spinlock_types.h
+++ b/arch/parisc/include/asm/spinlock_types.h
@@ -12,11 +12,19 @@ typedef struct {
#endif
} arch_spinlock_t;
+
+/* counter:
+ * Unlocked : 0x0100_0000
+ * Read lock(s) : 0x00FF_FFFF to 0x01 (Multiple Readers decrement it)
+ * Write lock : 0x0, but only if prior value is "unlocked" 0x0100_0000
+ */
typedef struct {
- arch_spinlock_t lock;
- volatile int counter;
+ arch_spinlock_t lock_mutex;
+ volatile unsigned int counter;
} arch_rwlock_t;
-#define __ARCH_RW_LOCK_UNLOCKED { __ARCH_SPIN_LOCK_UNLOCKED, 0 }
+#define __ARCH_RW_LOCK_UNLOCKED__ 0x01000000
+#define __ARCH_RW_LOCK_UNLOCKED { .lock_mutex = __ARCH_SPIN_LOCK_UNLOCKED, \
+ .counter = __ARCH_RW_LOCK_UNLOCKED__ }
#endif
diff --git a/arch/parisc/kernel/alternative.c b/arch/parisc/kernel/alternative.c
index 3c66d5c4d90d..fa28c4c9f972 100644
--- a/arch/parisc/kernel/alternative.c
+++ b/arch/parisc/kernel/alternative.c
@@ -25,6 +25,22 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
struct alt_instr *entry;
int index = 0, applied = 0;
int num_cpus = num_online_cpus();
+ u32 cond_check;
+
+ cond_check = ALT_COND_ALWAYS |
+ ((num_cpus == 1) ? ALT_COND_NO_SMP : 0) |
+ ((cache_info.dc_size == 0) ? ALT_COND_NO_DCACHE : 0) |
+ ((cache_info.ic_size == 0) ? ALT_COND_NO_ICACHE : 0) |
+ (running_on_qemu ? ALT_COND_RUN_ON_QEMU : 0) |
+ ((split_tlb == 0) ? ALT_COND_NO_SPLIT_TLB : 0) |
+ /*
+ * If the PDC_MODEL capabilities has Non-coherent IO-PDIR bit
+ * set (bit #61, big endian), we have to flush and sync every
+ * time IO-PDIR is changed in Ike/Astro.
+ */
+ (((boot_cpu_data.cpu_type > pcxw_) &&
+ ((boot_cpu_data.pdc.capabilities & PDC_MODEL_IOPDIR_FDC) == 0))
+ ? ALT_COND_NO_IOC_FDC : 0);
for (entry = start; entry < end; entry++, index++) {
@@ -38,29 +54,14 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
WARN_ON(!cond);
- if (cond != ALT_COND_ALWAYS && no_alternatives)
+ if ((cond & ALT_COND_ALWAYS) == 0 && no_alternatives)
continue;
pr_debug("Check %d: Cond 0x%x, Replace %02d instructions @ 0x%px with 0x%08x\n",
index, cond, len, from, replacement);
- if ((cond & ALT_COND_NO_SMP) && (num_cpus != 1))
- continue;
- if ((cond & ALT_COND_NO_DCACHE) && (cache_info.dc_size != 0))
- continue;
- if ((cond & ALT_COND_NO_ICACHE) && (cache_info.ic_size != 0))
- continue;
- if ((cond & ALT_COND_RUN_ON_QEMU) && !running_on_qemu)
- continue;
-
- /*
- * If the PDC_MODEL capabilities has Non-coherent IO-PDIR bit
- * set (bit #61, big endian), we have to flush and sync every
- * time IO-PDIR is changed in Ike/Astro.
- */
- if ((cond & ALT_COND_NO_IOC_FDC) &&
- ((boot_cpu_data.cpu_type <= pcxw_) ||
- (boot_cpu_data.pdc.capabilities & PDC_MODEL_IOPDIR_FDC)))
+ /* Bounce out if none of the conditions are true. */
+ if ((cond & cond_check) == 0)
continue;
/* Want to replace pdtlb by a pdtlb,l instruction? */
diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c
index e5fcfb70cc7c..e76c86619949 100644
--- a/arch/parisc/kernel/irq.c
+++ b/arch/parisc/kernel/irq.c
@@ -560,33 +560,23 @@ void do_cpu_irq_mask(struct pt_regs *regs)
goto out;
}
-static struct irqaction timer_action = {
- .handler = timer_interrupt,
- .name = "timer",
- .flags = IRQF_TIMER | IRQF_PERCPU | IRQF_IRQPOLL,
-};
-
-#ifdef CONFIG_SMP
-static struct irqaction ipi_action = {
- .handler = ipi_interrupt,
- .name = "IPI",
- .flags = IRQF_PERCPU,
-};
-#endif
-
static void claim_cpu_irqs(void)
{
+ unsigned long flags = IRQF_TIMER | IRQF_PERCPU | IRQF_IRQPOLL;
int i;
+
for (i = CPU_IRQ_BASE; i <= CPU_IRQ_MAX; i++) {
irq_set_chip_and_handler(i, &cpu_interrupt_type,
handle_percpu_irq);
}
irq_set_handler(TIMER_IRQ, handle_percpu_irq);
- setup_irq(TIMER_IRQ, &timer_action);
+ if (request_irq(TIMER_IRQ, timer_interrupt, flags, "timer", NULL))
+ pr_err("Failed to register timer interrupt\n");
#ifdef CONFIG_SMP
irq_set_handler(IPI_IRQ, handle_percpu_irq);
- setup_irq(IPI_IRQ, &ipi_action);
+ if (request_irq(IPI_IRQ, ipi_interrupt, IRQF_PERCPU, "IPI", NULL))
+ pr_err("Failed to register IPI interrupt\n");
#endif
}
diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S
index 97ac707c6bff..f05c9d5b6b9e 100644
--- a/arch/parisc/kernel/syscall.S
+++ b/arch/parisc/kernel/syscall.S
@@ -935,7 +935,7 @@ ENTRY(lws_table)
END(lws_table)
/* End of lws table */
-#define __SYSCALL(nr, entry, nargs) ASM_ULONG_INSN entry
+#define __SYSCALL(nr, entry) ASM_ULONG_INSN entry
.align 8
ENTRY(sys_call_table)
.export sys_call_table,data
diff --git a/arch/parisc/kernel/syscalls/syscallhdr.sh b/arch/parisc/kernel/syscalls/syscallhdr.sh
index 50242b747d7c..730db288fe54 100644
--- a/arch/parisc/kernel/syscalls/syscallhdr.sh
+++ b/arch/parisc/kernel/syscalls/syscallhdr.sh
@@ -32,5 +32,5 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
printf "#define __NR_syscalls\t%s\n" "${nxt}"
printf "#endif\n"
printf "\n"
- printf "#endif /* %s */" "${fileguard}"
+ printf "#endif /* %s */\n" "${fileguard}"
) > "$out"
diff --git a/arch/parisc/kernel/syscalls/syscalltbl.sh b/arch/parisc/kernel/syscalls/syscalltbl.sh
index 45b5bae26240..f7393a7b18aa 100644
--- a/arch/parisc/kernel/syscalls/syscalltbl.sh
+++ b/arch/parisc/kernel/syscalls/syscalltbl.sh
@@ -13,10 +13,10 @@ emit() {
t_entry="$3"
while [ $t_nxt -lt $t_nr ]; do
- printf "__SYSCALL(%s, sys_ni_syscall, )\n" "${t_nxt}"
+ printf "__SYSCALL(%s,sys_ni_syscall)\n" "${t_nxt}"
t_nxt=$((t_nxt+1))
done
- printf "__SYSCALL(%s, %s, )\n" "${t_nxt}" "${t_entry}"
+ printf "__SYSCALL(%s,%s)\n" "${t_nxt}" "${t_entry}"
}
grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 5fc45364e86e..924c541a9260 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -122,6 +122,7 @@ config PPC
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_KCOV
select ARCH_HAS_HUGEPD if HUGETLB_PAGE
+ select ARCH_HAS_MEMREMAP_COMPAT_ALIGN
select ARCH_HAS_MMIOWB if PPC64
select ARCH_HAS_PHYS_TO_DMA
select ARCH_HAS_PMEM_API
@@ -265,8 +266,9 @@ config PANIC_TIMEOUT
default 180
config COMPAT
- bool
- default y if PPC64
+ bool "Enable support for 32bit binaries"
+ depends on PPC64
+ default y if !CPU_LITTLE_ENDIAN
select COMPAT_BINFMT_ELF
select ARCH_WANT_OLD_COMPAT_IPC
select COMPAT_OLD_SIGACTION
diff --git a/arch/powerpc/configs/ps3_defconfig b/arch/powerpc/configs/ps3_defconfig
index 4db51719342a..81b55c880fc3 100644
--- a/arch/powerpc/configs/ps3_defconfig
+++ b/arch/powerpc/configs/ps3_defconfig
@@ -60,6 +60,8 @@ CONFIG_CFG80211=m
CONFIG_CFG80211_WEXT=y
CONFIG_MAC80211=m
# CONFIG_MAC80211_RC_MINSTREL is not set
+CONFIG_UEVENT_HELPER=y
+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_RAM=y
CONFIG_BLK_DEV_RAM_SIZE=65535
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index 2781ebf6add4..6fc4520092c7 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -251,7 +251,8 @@ extern int __meminit hash__vmemmap_create_mapping(unsigned long start,
extern void hash__vmemmap_remove_mapping(unsigned long start,
unsigned long page_size);
-int hash__create_section_mapping(unsigned long start, unsigned long end, int nid);
+int hash__create_section_mapping(unsigned long start, unsigned long end,
+ int nid, pgprot_t prot);
int hash__remove_section_mapping(unsigned long start, unsigned long end);
#endif /* !__ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index a1c60d5b50af..08c222d5b764 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -294,7 +294,8 @@ static inline unsigned long radix__get_tree_size(void)
}
#ifdef CONFIG_MEMORY_HOTPLUG
-int radix__create_section_mapping(unsigned long start, unsigned long end, int nid);
+int radix__create_section_mapping(unsigned long start, unsigned long end,
+ int nid, pgprot_t prot);
int radix__remove_section_mapping(unsigned long start, unsigned long end);
#endif /* CONFIG_MEMORY_HOTPLUG */
#endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index 080a0bf8e54b..3ee8df0f66e0 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -240,13 +240,8 @@ static inline bool pfn_valid(unsigned long pfn)
* and needs to be executable. This means the whole heap ends
* up being executable.
*/
-#define VM_DATA_DEFAULT_FLAGS32 \
- (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) | \
- VM_READ | VM_WRITE | \
- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
-
-#define VM_DATA_DEFAULT_FLAGS64 (VM_READ | VM_WRITE | \
- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+#define VM_DATA_DEFAULT_FLAGS32 VM_DATA_FLAGS_TSK_EXEC
+#define VM_DATA_DEFAULT_FLAGS64 VM_DATA_FLAGS_NON_EXEC
#ifdef __powerpc64__
#include <asm/page_64.h>
diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h
index 5962797f784a..79a9b7c6a132 100644
--- a/arch/powerpc/include/asm/page_64.h
+++ b/arch/powerpc/include/asm/page_64.h
@@ -94,11 +94,8 @@ extern u64 ppc64_pft_size;
* stack by default, so in the absence of a PT_GNU_STACK program header
* we turn execute permission off.
*/
-#define VM_STACK_DEFAULT_FLAGS32 (VM_READ | VM_WRITE | VM_EXEC | \
- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
-
-#define VM_STACK_DEFAULT_FLAGS64 (VM_READ | VM_WRITE | \
- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+#define VM_STACK_DEFAULT_FLAGS32 VM_DATA_FLAGS_EXEC
+#define VM_STACK_DEFAULT_FLAGS64 VM_DATA_FLAGS_NON_EXEC
#define VM_STACK_DEFAULT_FLAGS \
(is_32bit_task() ? \
diff --git a/arch/powerpc/include/asm/sparsemem.h b/arch/powerpc/include/asm/sparsemem.h
index 3192d454a733..c89b32443cff 100644
--- a/arch/powerpc/include/asm/sparsemem.h
+++ b/arch/powerpc/include/asm/sparsemem.h
@@ -13,7 +13,8 @@
#endif /* CONFIG_SPARSEMEM */
#ifdef CONFIG_MEMORY_HOTPLUG
-extern int create_section_mapping(unsigned long start, unsigned long end, int nid);
+extern int create_section_mapping(unsigned long start, unsigned long end,
+ int nid, pgprot_t prot);
extern int remove_section_mapping(unsigned long start, unsigned long end);
#ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
index a2270749b282..ca6c97025704 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -162,10 +162,10 @@ static inline bool test_thread_local_flags(unsigned int flags)
return (ti->local_flags & flags) != 0;
}
-#ifdef CONFIG_PPC64
+#ifdef CONFIG_COMPAT
#define is_32bit_task() (test_thread_flag(TIF_32BIT))
#else
-#define is_32bit_task() (1)
+#define is_32bit_task() (IS_ENABLED(CONFIG_PPC32))
#endif
#if defined(CONFIG_PPC64)
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
index b0720c7c3fcf..700fcdac2e3c 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -31,6 +31,7 @@
#define __ARCH_WANT_SYS_SOCKETCALL
#define __ARCH_WANT_SYS_FADVISE64
#define __ARCH_WANT_SYS_GETPGRP
+#define __ARCH_WANT_SYS_LLSEEK
#define __ARCH_WANT_SYS_NICE
#define __ARCH_WANT_SYS_OLD_GETRLIMIT
#define __ARCH_WANT_SYS_OLD_UNAME
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 570660efbb3d..1c4385852d3d 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -40,16 +40,17 @@ CFLAGS_btext.o += -DDISABLE_BRANCH_PROFILING
endif
obj-y := cputable.o syscalls.o \
- irq.o align.o signal_32.o pmc.o vdso.o \
+ irq.o align.o signal_$(BITS).o pmc.o vdso.o \
process.o systbl.o idle.o \
signal.o sysfs.o cacheinfo.o time.o \
prom.o traps.o setup-common.o \
udbg.o misc.o io.o misc_$(BITS).o \
of_platform.o prom_parse.o
obj-y += ptrace/
-obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o signal_64.o \
+obj-$(CONFIG_PPC64) += setup_64.o \
paca.o nvram_64.o firmware.o note.o \
syscall_64.o
+obj-$(CONFIG_COMPAT) += sys_ppc32.o signal_32.o
obj-$(CONFIG_VDSO32) += vdso32/
obj-$(CONFIG_PPC_WATCHDOG) += watchdog.o
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 63f0a4414618..9a1e5d636dea 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -52,8 +52,10 @@
SYS_CALL_TABLE:
.tc sys_call_table[TC],sys_call_table
+#ifdef CONFIG_COMPAT
COMPAT_SYS_CALL_TABLE:
.tc compat_sys_call_table[TC],compat_sys_call_table
+#endif
/* This value is used to mark exception frames on the stack. */
exception_marker:
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 18bbce143084..728ccb0f560c 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -3121,22 +3121,3 @@ handle_dabr_fault:
li r5,SIGSEGV
bl bad_page_fault
b interrupt_return
-
-/*
- * When doorbell is triggered from system reset wakeup, the message is
- * not cleared, so it would fire again when EE is enabled.
- *
- * When coming from local_irq_enable, there may be the same problem if
- * we were hard disabled.
- *
- * Execute msgclr to clear pending exceptions before handling it.
- */
-h_doorbell_common_msgclr:
- LOAD_REG_IMMEDIATE(r3, PPC_DBELL_MSGTYPE << (63-36))
- PPC_MSGCLR(3)
- b h_doorbell_common_virt
-
-doorbell_super_common_msgclr:
- LOAD_REG_IMMEDIATE(r3, PPC_DBELL_MSGTYPE << (63-36))
- PPC_MSGCLRP(3)
- b doorbell_super_common_virt
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index a25ed47087ee..1f1169856dc8 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -527,6 +527,19 @@ void irq_set_pending_from_srr1(unsigned long srr1)
return;
}
+ if (reason == PACA_IRQ_DBELL) {
+ /*
+ * When doorbell triggers a system reset wakeup, the message
+ * is not cleared, so if the doorbell interrupt is replayed
+ * and the IPI handled, the doorbell interrupt would still
+ * fire when EE is enabled.
+ *
+ * To avoid taking the superfluous doorbell interrupt,
+ * execute a msgclr here before the interrupt is replayed.
+ */
+ ppc_msgclr(PPC_DBELL_MSGTYPE);
+ }
+
/*
* The 0 index (SRR1[42:45]=b0000) must always evaluate to 0,
* so this can be called unconditionally with the SRR1 wake
diff --git a/arch/powerpc/kernel/ppc_save_regs.S b/arch/powerpc/kernel/ppc_save_regs.S
index f3bd0bbf2ae8..2d4d21bb46a9 100644
--- a/arch/powerpc/kernel/ppc_save_regs.S
+++ b/arch/powerpc/kernel/ppc_save_regs.S
@@ -55,14 +55,17 @@ _GLOBAL(ppc_save_regs)
PPC_STL r29,29*SZL(r3)
PPC_STL r30,30*SZL(r3)
PPC_STL r31,31*SZL(r3)
+ lbz r0,PACAIRQSOFTMASK(r13)
+ PPC_STL r0,SOFTE-STACK_FRAME_OVERHEAD(r3)
#endif
/* go up one stack frame for SP */
PPC_LL r4,0(r1)
PPC_STL r4,1*SZL(r3)
/* get caller's LR */
PPC_LL r0,LRSAVE(r4)
- PPC_STL r0,_NIP-STACK_FRAME_OVERHEAD(r3)
PPC_STL r0,_LINK-STACK_FRAME_OVERHEAD(r3)
+ mflr r0
+ PPC_STL r0,_NIP-STACK_FRAME_OVERHEAD(r3)
mfmsr r0
PPC_STL r0,_MSR-STACK_FRAME_OVERHEAD(r3)
mfctr r0
@@ -73,4 +76,5 @@ _GLOBAL(ppc_save_regs)
PPC_STL r0,_CCR-STACK_FRAME_OVERHEAD(r3)
li r0,0
PPC_STL r0,_TRAP-STACK_FRAME_OVERHEAD(r3)
+ PPC_STL r0,ORIG_GPR3-STACK_FRAME_OVERHEAD(r3)
blr
diff --git a/arch/powerpc/kernel/ptrace/Makefile b/arch/powerpc/kernel/ptrace/Makefile
index e9d97c2d063e..c2f2402ebc8c 100644
--- a/arch/powerpc/kernel/ptrace/Makefile
+++ b/arch/powerpc/kernel/ptrace/Makefile
@@ -6,7 +6,7 @@
CFLAGS_ptrace-view.o += -DUTS_MACHINE='"$(UTS_MACHINE)"'
obj-y += ptrace.o ptrace-view.o
-obj-$(CONFIG_PPC64) += ptrace32.o
+obj-$(CONFIG_COMPAT) += ptrace32.o
obj-$(CONFIG_VSX) += ptrace-vsx.o
ifneq ($(CONFIG_VSX),y)
obj-y += ptrace-novsx.o
diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index d215f9554553..a264989626fd 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -18,12 +18,153 @@
#include <linux/syscalls.h>
#include <asm/hw_breakpoint.h>
#include <linux/uaccess.h>
+#include <asm/switch_to.h>
#include <asm/unistd.h>
#include <asm/debug.h>
#include <asm/tm.h>
#include "signal.h"
+#ifdef CONFIG_VSX
+unsigned long copy_fpr_to_user(void __user *to,
+ struct task_struct *task)
+{
+ u64 buf[ELF_NFPREG];
+ int i;
+
+ /* save FPR copy to local buffer then write to the thread_struct */
+ for (i = 0; i < (ELF_NFPREG - 1) ; i++)
+ buf[i] = task->thread.TS_FPR(i);
+ buf[i] = task->thread.fp_state.fpscr;
+ return __copy_to_user(to, buf, ELF_NFPREG * sizeof(double));
+}
+
+unsigned long copy_fpr_from_user(struct task_struct *task,
+ void __user *from)
+{
+ u64 buf[ELF_NFPREG];
+ int i;
+
+ if (__copy_from_user(buf, from, ELF_NFPREG * sizeof(double)))
+ return 1;
+ for (i = 0; i < (ELF_NFPREG - 1) ; i++)
+ task->thread.TS_FPR(i) = buf[i];
+ task->thread.fp_state.fpscr = buf[i];
+
+ return 0;
+}
+
+unsigned long copy_vsx_to_user(void __user *to,
+ struct task_struct *task)
+{
+ u64 buf[ELF_NVSRHALFREG];
+ int i;
+
+ /* save FPR copy to local buffer then write to the thread_struct */
+ for (i = 0; i < ELF_NVSRHALFREG; i++)
+ buf[i] = task->thread.fp_state.fpr[i][TS_VSRLOWOFFSET];
+ return __copy_to_user(to, buf, ELF_NVSRHALFREG * sizeof(double));
+}
+
+unsigned long copy_vsx_from_user(struct task_struct *task,
+ void __user *from)
+{
+ u64 buf[ELF_NVSRHALFREG];
+ int i;
+
+ if (__copy_from_user(buf, from, ELF_NVSRHALFREG * sizeof(double)))
+ return 1;
+ for (i = 0; i < ELF_NVSRHALFREG ; i++)
+ task->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i];
+ return 0;
+}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+unsigned long copy_ckfpr_to_user(void __user *to,
+ struct task_struct *task)
+{
+ u64 buf[ELF_NFPREG];
+ int i;
+
+ /* save FPR copy to local buffer then write to the thread_struct */
+ for (i = 0; i < (ELF_NFPREG - 1) ; i++)
+ buf[i] = task->thread.TS_CKFPR(i);
+ buf[i] = task->thread.ckfp_state.fpscr;
+ return __copy_to_user(to, buf, ELF_NFPREG * sizeof(double));
+}
+
+unsigned long copy_ckfpr_from_user(struct task_struct *task,
+ void __user *from)
+{
+ u64 buf[ELF_NFPREG];
+ int i;
+
+ if (__copy_from_user(buf, from, ELF_NFPREG * sizeof(double)))
+ return 1;
+ for (i = 0; i < (ELF_NFPREG - 1) ; i++)
+ task->thread.TS_CKFPR(i) = buf[i];
+ task->thread.ckfp_state.fpscr = buf[i];
+
+ return 0;
+}
+
+unsigned long copy_ckvsx_to_user(void __user *to,
+ struct task_struct *task)
+{
+ u64 buf[ELF_NVSRHALFREG];
+ int i;
+
+ /* save FPR copy to local buffer then write to the thread_struct */
+ for (i = 0; i < ELF_NVSRHALFREG; i++)
+ buf[i] = task->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET];
+ return __copy_to_user(to, buf, ELF_NVSRHALFREG * sizeof(double));
+}
+
+unsigned long copy_ckvsx_from_user(struct task_struct *task,
+ void __user *from)
+{
+ u64 buf[ELF_NVSRHALFREG];
+ int i;
+
+ if (__copy_from_user(buf, from, ELF_NVSRHALFREG * sizeof(double)))
+ return 1;
+ for (i = 0; i < ELF_NVSRHALFREG ; i++)
+ task->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i];
+ return 0;
+}
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+#else
+inline unsigned long copy_fpr_to_user(void __user *to,
+ struct task_struct *task)
+{
+ return __copy_to_user(to, task->thread.fp_state.fpr,
+ ELF_NFPREG * sizeof(double));
+}
+
+inline unsigned long copy_fpr_from_user(struct task_struct *task,
+ void __user *from)
+{
+ return __copy_from_user(task->thread.fp_state.fpr, from,
+ ELF_NFPREG * sizeof(double));
+}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+inline unsigned long copy_ckfpr_to_user(void __user *to,
+ struct task_struct *task)
+{
+ return __copy_to_user(to, task->thread.ckfp_state.fpr,
+ ELF_NFPREG * sizeof(double));
+}
+
+inline unsigned long copy_ckfpr_from_user(struct task_struct *task,
+ void __user *from)
+{
+ return __copy_from_user(task->thread.ckfp_state.fpr, from,
+ ELF_NFPREG * sizeof(double));
+}
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+#endif
+
/* Log an error when sending an unhandled signal to a process. Controlled
* through debug.exception-trace sysctl.
*/
@@ -106,7 +247,6 @@ static void do_signal(struct task_struct *tsk)
sigset_t *oldset = sigmask_to_save();
struct ksignal ksig = { .sig = 0 };
int ret;
- int is32 = is_32bit_task();
BUG_ON(tsk != current);
@@ -136,7 +276,7 @@ static void do_signal(struct task_struct *tsk)
rseq_signal_deliver(&ksig, tsk->thread.regs);
- if (is32) {
+ if (is_32bit_task()) {
if (ksig.ka.sa.sa_flags & SA_SIGINFO)
ret = handle_rt_signal32(&ksig, oldset, tsk);
else
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 1b090a76b444..4f96d29a22bf 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -235,146 +235,6 @@ struct rt_sigframe {
int abigap[56];
};
-#ifdef CONFIG_VSX
-unsigned long copy_fpr_to_user(void __user *to,
- struct task_struct *task)
-{
- u64 buf[ELF_NFPREG];
- int i;
-
- /* save FPR copy to local buffer then write to the thread_struct */
- for (i = 0; i < (ELF_NFPREG - 1) ; i++)
- buf[i] = task->thread.TS_FPR(i);
- buf[i] = task->thread.fp_state.fpscr;
- return __copy_to_user(to, buf, ELF_NFPREG * sizeof(double));
-}
-
-unsigned long copy_fpr_from_user(struct task_struct *task,
- void __user *from)
-{
- u64 buf[ELF_NFPREG];
- int i;
-
- if (__copy_from_user(buf, from, ELF_NFPREG * sizeof(double)))
- return 1;
- for (i = 0; i < (ELF_NFPREG - 1) ; i++)
- task->thread.TS_FPR(i) = buf[i];
- task->thread.fp_state.fpscr = buf[i];
-
- return 0;
-}
-
-unsigned long copy_vsx_to_user(void __user *to,
- struct task_struct *task)
-{
- u64 buf[ELF_NVSRHALFREG];
- int i;
-
- /* save FPR copy to local buffer then write to the thread_struct */
- for (i = 0; i < ELF_NVSRHALFREG; i++)
- buf[i] = task->thread.fp_state.fpr[i][TS_VSRLOWOFFSET];
- return __copy_to_user(to, buf, ELF_NVSRHALFREG * sizeof(double));
-}
-
-unsigned long copy_vsx_from_user(struct task_struct *task,
- void __user *from)
-{
- u64 buf[ELF_NVSRHALFREG];
- int i;
-
- if (__copy_from_user(buf, from, ELF_NVSRHALFREG * sizeof(double)))
- return 1;
- for (i = 0; i < ELF_NVSRHALFREG ; i++)
- task->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i];
- return 0;
-}
-
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-unsigned long copy_ckfpr_to_user(void __user *to,
- struct task_struct *task)
-{
- u64 buf[ELF_NFPREG];
- int i;
-
- /* save FPR copy to local buffer then write to the thread_struct */
- for (i = 0; i < (ELF_NFPREG - 1) ; i++)
- buf[i] = task->thread.TS_CKFPR(i);
- buf[i] = task->thread.ckfp_state.fpscr;
- return __copy_to_user(to, buf, ELF_NFPREG * sizeof(double));
-}
-
-unsigned long copy_ckfpr_from_user(struct task_struct *task,
- void __user *from)
-{
- u64 buf[ELF_NFPREG];
- int i;
-
- if (__copy_from_user(buf, from, ELF_NFPREG * sizeof(double)))
- return 1;
- for (i = 0; i < (ELF_NFPREG - 1) ; i++)
- task->thread.TS_CKFPR(i) = buf[i];
- task->thread.ckfp_state.fpscr = buf[i];
-
- return 0;
-}
-
-unsigned long copy_ckvsx_to_user(void __user *to,
- struct task_struct *task)
-{
- u64 buf[ELF_NVSRHALFREG];
- int i;
-
- /* save FPR copy to local buffer then write to the thread_struct */
- for (i = 0; i < ELF_NVSRHALFREG; i++)
- buf[i] = task->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET];
- return __copy_to_user(to, buf, ELF_NVSRHALFREG * sizeof(double));
-}
-
-unsigned long copy_ckvsx_from_user(struct task_struct *task,
- void __user *from)
-{
- u64 buf[ELF_NVSRHALFREG];
- int i;
-
- if (__copy_from_user(buf, from, ELF_NVSRHALFREG * sizeof(double)))
- return 1;
- for (i = 0; i < ELF_NVSRHALFREG ; i++)
- task->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i];
- return 0;
-}
-#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
-#else
-inline unsigned long copy_fpr_to_user(void __user *to,
- struct task_struct *task)
-{
- return __copy_to_user(to, task->thread.fp_state.fpr,
- ELF_NFPREG * sizeof(double));
-}
-
-inline unsigned long copy_fpr_from_user(struct task_struct *task,
- void __user *from)
-{
- return __copy_from_user(task->thread.fp_state.fpr, from,
- ELF_NFPREG * sizeof(double));
-}
-
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-inline unsigned long copy_ckfpr_to_user(void __user *to,
- struct task_struct *task)
-{
- return __copy_to_user(to, task->thread.ckfp_state.fpr,
- ELF_NFPREG * sizeof(double));
-}
-
-inline unsigned long copy_ckfpr_from_user(struct task_struct *task,
- void __user *from)
-{
- return __copy_from_user(task->thread.ckfp_state.fpr, from,
- ELF_NFPREG * sizeof(double));
-}
-#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
-#endif
-
/*
* Save the current user registers on the user stack.
* We only save the altivec/spe registers if the process has used
diff --git a/arch/powerpc/kernel/syscall_64.c b/arch/powerpc/kernel/syscall_64.c
index cf06eb443a80..c74295a7765b 100644
--- a/arch/powerpc/kernel/syscall_64.c
+++ b/arch/powerpc/kernel/syscall_64.c
@@ -22,7 +22,6 @@ notrace long system_call_exception(long r3, long r4, long r5,
long r6, long r7, long r8,
unsigned long r0, struct pt_regs *regs)
{
- unsigned long ti_flags;
syscall_fn f;
if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
@@ -60,8 +59,7 @@ notrace long system_call_exception(long r3, long r4, long r5,
local_irq_enable();
- ti_flags = current_thread_info()->flags;
- if (unlikely(ti_flags & _TIF_SYSCALL_DOTRACE)) {
+ if (unlikely(current_thread_info()->flags & _TIF_SYSCALL_DOTRACE)) {
/*
* We use the return value of do_syscall_trace_enter() as the
* syscall number. If the syscall was rejected for any reason
@@ -86,7 +84,7 @@ notrace long system_call_exception(long r3, long r4, long r5,
/* May be faster to do array_index_nospec? */
barrier_nospec();
- if (unlikely(ti_flags & _TIF_32BIT)) {
+ if (unlikely(is_32bit_task())) {
f = (void *)compat_sys_call_table[r0];
r3 &= 0x00000000ffffffffULL;
diff --git a/arch/powerpc/kernel/syscalls/syscallhdr.sh b/arch/powerpc/kernel/syscalls/syscallhdr.sh
index c0a9a32937f1..02d6751f3be3 100644
--- a/arch/powerpc/kernel/syscalls/syscallhdr.sh
+++ b/arch/powerpc/kernel/syscalls/syscallhdr.sh
@@ -32,6 +32,5 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
printf "#define __NR_syscalls\t%s\n" "${nxt}"
printf "#endif\n"
printf "\n"
- printf "#endif /* %s */" "${fileguard}"
- printf "\n"
+ printf "#endif /* %s */\n" "${fileguard}"
) > "$out"
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index bda9cb4a0a5f..6fcae436ae51 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -50,7 +50,7 @@
#include <linux/irq.h>
#include <linux/delay.h>
#include <linux/irq_work.h>
-#include <linux/clk-provider.h>
+#include <linux/of_clk.h>
#include <linux/suspend.h>
#include <linux/sched/cputime.h>
#include <linux/processor.h>
@@ -522,35 +522,6 @@ static inline void clear_irq_work_pending(void)
"i" (offsetof(struct paca_struct, irq_work_pending)));
}
-void arch_irq_work_raise(void)
-{
- preempt_disable();
- set_irq_work_pending_flag();
- /*
- * Non-nmi code running with interrupts disabled will replay
- * irq_happened before it re-enables interrupts, so setthe
- * decrementer there instead of causing a hardware exception
- * which would immediately hit the masked interrupt handler
- * and have the net effect of setting the decrementer in
- * irq_happened.
- *
- * NMI interrupts can not check this when they return, so the
- * decrementer hardware exception is raised, which will fire
- * when interrupts are next enabled.
- *
- * BookE does not support this yet, it must audit all NMI
- * interrupt handlers to ensure they call nmi_enter() so this
- * check would be correct.
- */
- if (IS_ENABLED(CONFIG_BOOKE) || !irqs_disabled() || in_nmi()) {
- set_dec(1);
- } else {
- hard_irq_disable();
- local_paca->irq_happened |= PACA_IRQ_DEC;
- }
- preempt_enable();
-}
-
#else /* 32-bit */
DEFINE_PER_CPU(u8, irq_work_pending);
@@ -559,16 +530,27 @@ DEFINE_PER_CPU(u8, irq_work_pending);
#define test_irq_work_pending() __this_cpu_read(irq_work_pending)
#define clear_irq_work_pending() __this_cpu_write(irq_work_pending, 0)
+#endif /* 32 vs 64 bit */
+
void arch_irq_work_raise(void)
{
+ /*
+ * 64-bit code that uses irq soft-mask can just cause an immediate
+ * interrupt here that gets soft masked, if this is called under
+ * local_irq_disable(). It might be possible to prevent that happening
+ * by noticing interrupts are disabled and setting decrementer pending
+ * to be replayed when irqs are enabled. The problem there is that
+ * tracing can call irq_work_raise, including in code that does low
+ * level manipulations of irq soft-mask state (e.g., trace_hardirqs_on)
+ * which could get tangled up if we're messing with the same state
+ * here.
+ */
preempt_disable();
set_irq_work_pending_flag();
set_dec(1);
preempt_enable();
}
-#endif /* 32 vs 64 bit */
-
#else /* CONFIG_IRQ_WORK */
#define test_irq_work_pending() 0
@@ -1149,9 +1131,7 @@ void __init time_init(void)
init_decrementer_clockevent();
tick_setup_hrtimer_broadcast();
-#ifdef CONFIG_COMMON_CLK
of_clk_init(NULL);
-#endif
}
/*
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index d3b77c15f9ce..f38f26e844b6 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -651,7 +651,8 @@ static void __init vdso_setup_syscall_map(void)
if (sys_call_table[i] != sys_ni_syscall)
vdso_data->syscall_map_64[i >> 5] |=
0x80000000UL >> (i & 0x1f);
- if (compat_sys_call_table[i] != sys_ni_syscall)
+ if (IS_ENABLED(CONFIG_COMPAT) &&
+ compat_sys_call_table[i] != sys_ni_syscall)
vdso_data->syscall_map_32[i >> 5] |=
0x80000000UL >> (i & 0x1f);
#else /* CONFIG_PPC64 */
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 711fca9bc6f0..12885eda324e 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -204,6 +204,4 @@ config KVM_XIVE
default y
depends on KVM_XICS && PPC_XIVE_NATIVE && KVM_BOOK3S_HV_POSSIBLE
-source "drivers/vhost/Kconfig"
-
endif # VIRTUALIZATION
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index 425d13806645..df9989cf7ba3 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -422,7 +422,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
break;
}
} else if (vma && hva >= vma->vm_start &&
- (vma->vm_flags & VM_HUGETLB)) {
+ is_vm_hugetlb_page(vma)) {
unsigned long psize = vma_kernel_pagesize(vma);
tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >>
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index 7e5714a69a58..8ed2411c3f39 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -809,7 +809,8 @@ int resize_hpt_for_hotplug(unsigned long new_mem_size)
return 0;
}
-int hash__create_section_mapping(unsigned long start, unsigned long end, int nid)
+int hash__create_section_mapping(unsigned long start, unsigned long end,
+ int nid, pgprot_t prot)
{
int rc;
@@ -819,7 +820,7 @@ int hash__create_section_mapping(unsigned long start, unsigned long end, int nid
}
rc = htab_bolt_mapping(start, end, __pa(start),
- pgprot_val(PAGE_KERNEL), mmu_linear_psize,
+ pgprot_val(prot), mmu_linear_psize,
mmu_kernel_ssize);
if (rc < 0) {
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index 2bf7e1b4fd82..e0bb69c616e4 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -171,12 +171,13 @@ void mmu_cleanup_all(void)
}
#ifdef CONFIG_MEMORY_HOTPLUG
-int __meminit create_section_mapping(unsigned long start, unsigned long end, int nid)
+int __meminit create_section_mapping(unsigned long start, unsigned long end,
+ int nid, pgprot_t prot)
{
if (radix_enabled())
- return radix__create_section_mapping(start, end, nid);
+ return radix__create_section_mapping(start, end, nid, prot);
- return hash__create_section_mapping(start, end, nid);
+ return hash__create_section_mapping(start, end, nid, prot);
}
int __meminit remove_section_mapping(unsigned long start, unsigned long end)
diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c
index 07527f1ed108..1199fc2bfaec 100644
--- a/arch/powerpc/mm/book3s64/pkeys.c
+++ b/arch/powerpc/mm/book3s64/pkeys.c
@@ -315,7 +315,7 @@ int __execute_only_pkey(struct mm_struct *mm)
static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma)
{
/* Do this check first since the vm_flags should be hot */
- if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC)
+ if ((vma->vm_flags & VM_ACCESS_FLAGS) != VM_EXEC)
return false;
return (vma_pkey(vma) == vma->vm_mm->context.execute_only_pkey);
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 2a9a0cd79490..8f9edf07063a 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -254,7 +254,7 @@ static unsigned long next_boundary(unsigned long addr, unsigned long end)
static int __meminit create_physical_mapping(unsigned long start,
unsigned long end,
- int nid)
+ int nid, pgprot_t _prot)
{
unsigned long vaddr, addr, mapping_size = 0;
bool prev_exec, exec = false;
@@ -290,7 +290,7 @@ static int __meminit create_physical_mapping(unsigned long start,
prot = PAGE_KERNEL_X;
exec = true;
} else {
- prot = PAGE_KERNEL;
+ prot = _prot;
exec = false;
}
@@ -334,7 +334,7 @@ static void __init radix_init_pgtable(void)
WARN_ON(create_physical_mapping(reg->base,
reg->base + reg->size,
- -1));
+ -1, PAGE_KERNEL));
}
/* Find out how many PID bits are supported */
@@ -713,8 +713,10 @@ static int __meminit stop_machine_change_mapping(void *data)
spin_unlock(&init_mm.page_table_lock);
pte_clear(&init_mm, params->aligned_start, params->pte);
- create_physical_mapping(__pa(params->aligned_start), __pa(params->start), -1);
- create_physical_mapping(__pa(params->end), __pa(params->aligned_end), -1);
+ create_physical_mapping(__pa(params->aligned_start),
+ __pa(params->start), -1, PAGE_KERNEL);
+ create_physical_mapping(__pa(params->end), __pa(params->aligned_end),
+ -1, PAGE_KERNEL);
spin_lock(&init_mm.page_table_lock);
return 0;
}
@@ -871,14 +873,16 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end)
radix__flush_tlb_kernel_range(start, end);
}
-int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid)
+int __meminit radix__create_section_mapping(unsigned long start,
+ unsigned long end, int nid,
+ pgprot_t prot)
{
if (end >= RADIX_VMALLOC_START) {
pr_warn("Outside the supported range\n");
return -1;
}
- return create_physical_mapping(__pa(start), __pa(end), nid);
+ return create_physical_mapping(__pa(start), __pa(end), nid, prot);
}
int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index d15f0f0ee806..84af6c8eecf7 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -314,7 +314,7 @@ static bool access_error(bool is_write, bool is_exec,
return false;
}
- if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
+ if (unlikely(!vma_is_accessible(vma)))
return true;
/*
* We should ideally do the vma pkey access check here. But in the
diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c
index fc669643ce6a..b1a0aebe8c48 100644
--- a/arch/powerpc/mm/ioremap.c
+++ b/arch/powerpc/mm/ioremap.c
@@ -2,6 +2,7 @@
#include <linux/io.h>
#include <linux/slab.h>
+#include <linux/mmzone.h>
#include <linux/vmalloc.h>
#include <asm/io-workarounds.h>
@@ -97,3 +98,23 @@ void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size,
return NULL;
}
+
+#ifdef CONFIG_ZONE_DEVICE
+/*
+ * Override the generic version in mm/memremap.c.
+ *
+ * With hash translation, the direct-map range is mapped with just one
+ * page size selected by htab_init_page_sizes(). Consult
+ * mmu_psize_defs[] to determine the minimum page size alignment.
+*/
+unsigned long memremap_compat_align(void)
+{
+ unsigned int shift = mmu_psize_defs[mmu_linear_psize].shift;
+
+ if (radix_enabled())
+ return SUBSECTION_SIZE;
+ return max(SUBSECTION_SIZE, 1UL << shift);
+
+}
+EXPORT_SYMBOL_GPL(memremap_compat_align);
+#endif
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 9b4f5fb719e0..041ed7cfd341 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -90,7 +90,8 @@ int memory_add_physaddr_to_nid(u64 start)
}
#endif
-int __weak create_section_mapping(unsigned long start, unsigned long end, int nid)
+int __weak create_section_mapping(unsigned long start, unsigned long end,
+ int nid, pgprot_t prot)
{
return -ENODEV;
}
@@ -122,7 +123,7 @@ static void flush_dcache_range_chunked(unsigned long start, unsigned long stop,
}
int __ref arch_add_memory(int nid, u64 start, u64 size,
- struct mhp_restrictions *restrictions)
+ struct mhp_params *params)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
@@ -131,14 +132,15 @@ int __ref arch_add_memory(int nid, u64 start, u64 size,
resize_hpt_for_hotplug(memblock_phys_mem_size());
start = (unsigned long)__va(start);
- rc = create_section_mapping(start, start + size, nid);
+ rc = create_section_mapping(start, start + size, nid,
+ params->pgprot);
if (rc) {
pr_warn("Unable to create mapping for hot added memory 0x%llx..0x%llx: %d\n",
start, start + size, rc);
return -EFAULT;
}
- return __add_pages(nid, start_pfn, nr_pages, restrictions);
+ return __add_pages(nid, start_pfn, nr_pages, params);
}
void __ref arch_remove_memory(int nid, u64 start, u64 size,
diff --git a/arch/powerpc/perf/Makefile b/arch/powerpc/perf/Makefile
index c155dcbb8691..53d614e98537 100644
--- a/arch/powerpc/perf/Makefile
+++ b/arch/powerpc/perf/Makefile
@@ -1,6 +1,9 @@
# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_PERF_EVENTS) += callchain.o perf_regs.o
+obj-$(CONFIG_PERF_EVENTS) += callchain.o callchain_$(BITS).o perf_regs.o
+ifdef CONFIG_COMPAT
+obj-$(CONFIG_PERF_EVENTS) += callchain_32.o
+endif
obj-$(CONFIG_PPC_PERF_CTRS) += core-book3s.o bhrb.o
obj64-$(CONFIG_PPC_PERF_CTRS) += ppc970-pmu.o power5-pmu.o \
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index cbc251981209..dd5051015008 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -15,11 +15,9 @@
#include <asm/sigcontext.h>
#include <asm/ucontext.h>
#include <asm/vdso.h>
-#ifdef CONFIG_PPC64
-#include "../kernel/ppc32.h"
-#endif
#include <asm/pte-walk.h>
+#include "callchain.h"
/*
* Is sp valid as the address of the next kernel stack frame after prev_sp?
@@ -102,358 +100,6 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re
}
}
-#ifdef CONFIG_PPC64
-/*
- * On 64-bit we don't want to invoke hash_page on user addresses from
- * interrupt context, so if the access faults, we read the page tables
- * to find which page (if any) is mapped and access it directly.
- */
-static int read_user_stack_slow(void __user *ptr, void *buf, int nb)
-{
- int ret = -EFAULT;
- pgd_t *pgdir;
- pte_t *ptep, pte;
- unsigned shift;
- unsigned long addr = (unsigned long) ptr;
- unsigned long offset;
- unsigned long pfn, flags;
- void *kaddr;
-
- pgdir = current->mm->pgd;
- if (!pgdir)
- return -EFAULT;
-
- local_irq_save(flags);
- ptep = find_current_mm_pte(pgdir, addr, NULL, &shift);
- if (!ptep)
- goto err_out;
- if (!shift)
- shift = PAGE_SHIFT;
-
- /* align address to page boundary */
- offset = addr & ((1UL << shift) - 1);
-
- pte = READ_ONCE(*ptep);
- if (!pte_present(pte) || !pte_user(pte))
- goto err_out;
- pfn = pte_pfn(pte);
- if (!page_is_ram(pfn))
- goto err_out;
-
- /* no highmem to worry about here */
- kaddr = pfn_to_kaddr(pfn);
- memcpy(buf, kaddr + offset, nb);
- ret = 0;
-err_out:
- local_irq_restore(flags);
- return ret;
-}
-
-static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret)
-{
- if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned long) ||
- ((unsigned long)ptr & 7))
- return -EFAULT;
-
- if (!probe_user_read(ret, ptr, sizeof(*ret)))
- return 0;
-
- return read_user_stack_slow(ptr, ret, 8);
-}
-
-static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret)
-{
- if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) ||
- ((unsigned long)ptr & 3))
- return -EFAULT;
-
- if (!probe_user_read(ret, ptr, sizeof(*ret)))
- return 0;
-
- return read_user_stack_slow(ptr, ret, 4);
-}
-
-static inline int valid_user_sp(unsigned long sp, int is_64)
-{
- if (!sp || (sp & 7) || sp > (is_64 ? TASK_SIZE : 0x100000000UL) - 32)
- return 0;
- return 1;
-}
-
-/*
- * 64-bit user processes use the same stack frame for RT and non-RT signals.
- */
-struct signal_frame_64 {
- char dummy[__SIGNAL_FRAMESIZE];
- struct ucontext uc;
- unsigned long unused[2];
- unsigned int tramp[6];
- struct siginfo *pinfo;
- void *puc;
- struct siginfo info;
- char abigap[288];
-};
-
-static int is_sigreturn_64_address(unsigned long nip, unsigned long fp)
-{
- if (nip == fp + offsetof(struct signal_frame_64, tramp))
- return 1;
- if (vdso64_rt_sigtramp && current->mm->context.vdso_base &&
- nip == current->mm->context.vdso_base + vdso64_rt_sigtramp)
- return 1;
- return 0;
-}
-
-/*
- * Do some sanity checking on the signal frame pointed to by sp.
- * We check the pinfo and puc pointers in the frame.
- */
-static int sane_signal_64_frame(unsigned long sp)
-{
- struct signal_frame_64 __user *sf;
- unsigned long pinfo, puc;
-
- sf = (struct signal_frame_64 __user *) sp;
- if (read_user_stack_64((unsigned long __user *) &sf->pinfo, &pinfo) ||
- read_user_stack_64((unsigned long __user *) &sf->puc, &puc))
- return 0;
- return pinfo == (unsigned long) &sf->info &&
- puc == (unsigned long) &sf->uc;
-}
-
-static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry,
- struct pt_regs *regs)
-{
- unsigned long sp, next_sp;
- unsigned long next_ip;
- unsigned long lr;
- long level = 0;
- struct signal_frame_64 __user *sigframe;
- unsigned long __user *fp, *uregs;
-
- next_ip = perf_instruction_pointer(regs);
- lr = regs->link;
- sp = regs->gpr[1];
- perf_callchain_store(entry, next_ip);
-
- while (entry->nr < entry->max_stack) {
- fp = (unsigned long __user *) sp;
- if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp))
- return;
- if (level > 0 && read_user_stack_64(&fp[2], &next_ip))
- return;
-
- /*
- * Note: the next_sp - sp >= signal frame size check
- * is true when next_sp < sp, which can happen when
- * transitioning from an alternate signal stack to the
- * normal stack.
- */
- if (next_sp - sp >= sizeof(struct signal_frame_64) &&
- (is_sigreturn_64_address(next_ip, sp) ||
- (level <= 1 && is_sigreturn_64_address(lr, sp))) &&
- sane_signal_64_frame(sp)) {
- /*
- * This looks like an signal frame
- */
- sigframe = (struct signal_frame_64 __user *) sp;
- uregs = sigframe->uc.uc_mcontext.gp_regs;
- if (read_user_stack_64(&uregs[PT_NIP], &next_ip) ||
- read_user_stack_64(&uregs[PT_LNK], &lr) ||
- read_user_stack_64(&uregs[PT_R1], &sp))
- return;
- level = 0;
- perf_callchain_store_context(entry, PERF_CONTEXT_USER);
- perf_callchain_store(entry, next_ip);
- continue;
- }
-
- if (level == 0)
- next_ip = lr;
- perf_callchain_store(entry, next_ip);
- ++level;
- sp = next_sp;
- }
-}
-
-#else /* CONFIG_PPC64 */
-/*
- * On 32-bit we just access the address and let hash_page create a
- * HPTE if necessary, so there is no need to fall back to reading
- * the page tables. Since this is called at interrupt level,
- * do_page_fault() won't treat a DSI as a page fault.
- */
-static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret)
-{
- if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) ||
- ((unsigned long)ptr & 3))
- return -EFAULT;
-
- return probe_user_read(ret, ptr, sizeof(*ret));
-}
-
-static inline void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry,
- struct pt_regs *regs)
-{
-}
-
-static inline int valid_user_sp(unsigned long sp, int is_64)
-{
- if (!sp || (sp & 7) || sp > TASK_SIZE - 32)
- return 0;
- return 1;
-}
-
-#define __SIGNAL_FRAMESIZE32 __SIGNAL_FRAMESIZE
-#define sigcontext32 sigcontext
-#define mcontext32 mcontext
-#define ucontext32 ucontext
-#define compat_siginfo_t struct siginfo
-
-#endif /* CONFIG_PPC64 */
-
-/*
- * Layout for non-RT signal frames
- */
-struct signal_frame_32 {
- char dummy[__SIGNAL_FRAMESIZE32];
- struct sigcontext32 sctx;
- struct mcontext32 mctx;
- int abigap[56];
-};
-
-/*
- * Layout for RT signal frames
- */
-struct rt_signal_frame_32 {
- char dummy[__SIGNAL_FRAMESIZE32 + 16];
- compat_siginfo_t info;
- struct ucontext32 uc;
- int abigap[56];
-};
-
-static int is_sigreturn_32_address(unsigned int nip, unsigned int fp)
-{
- if (nip == fp + offsetof(struct signal_frame_32, mctx.mc_pad))
- return 1;
- if (vdso32_sigtramp && current->mm->context.vdso_base &&
- nip == current->mm->context.vdso_base + vdso32_sigtramp)
- return 1;
- return 0;
-}
-
-static int is_rt_sigreturn_32_address(unsigned int nip, unsigned int fp)
-{
- if (nip == fp + offsetof(struct rt_signal_frame_32,
- uc.uc_mcontext.mc_pad))
- return 1;
- if (vdso32_rt_sigtramp && current->mm->context.vdso_base &&
- nip == current->mm->context.vdso_base + vdso32_rt_sigtramp)
- return 1;
- return 0;
-}
-
-static int sane_signal_32_frame(unsigned int sp)
-{
- struct signal_frame_32 __user *sf;
- unsigned int regs;
-
- sf = (struct signal_frame_32 __user *) (unsigned long) sp;
- if (read_user_stack_32((unsigned int __user *) &sf->sctx.regs, &regs))
- return 0;
- return regs == (unsigned long) &sf->mctx;
-}
-
-static int sane_rt_signal_32_frame(unsigned int sp)
-{
- struct rt_signal_frame_32 __user *sf;
- unsigned int regs;
-
- sf = (struct rt_signal_frame_32 __user *) (unsigned long) sp;
- if (read_user_stack_32((unsigned int __user *) &sf->uc.uc_regs, &regs))
- return 0;
- return regs == (unsigned long) &sf->uc.uc_mcontext;
-}
-
-static unsigned int __user *signal_frame_32_regs(unsigned int sp,
- unsigned int next_sp, unsigned int next_ip)
-{
- struct mcontext32 __user *mctx = NULL;
- struct signal_frame_32 __user *sf;
- struct rt_signal_frame_32 __user *rt_sf;
-
- /*
- * Note: the next_sp - sp >= signal frame size check
- * is true when next_sp < sp, for example, when
- * transitioning from an alternate signal stack to the
- * normal stack.
- */
- if (next_sp - sp >= sizeof(struct signal_frame_32) &&
- is_sigreturn_32_address(next_ip, sp) &&
- sane_signal_32_frame(sp)) {
- sf = (struct signal_frame_32 __user *) (unsigned long) sp;
- mctx = &sf->mctx;
- }
-
- if (!mctx && next_sp - sp >= sizeof(struct rt_signal_frame_32) &&
- is_rt_sigreturn_32_address(next_ip, sp) &&
- sane_rt_signal_32_frame(sp)) {
- rt_sf = (struct rt_signal_frame_32 __user *) (unsigned long) sp;
- mctx = &rt_sf->uc.uc_mcontext;
- }
-
- if (!mctx)
- return NULL;
- return mctx->mc_gregs;
-}
-
-static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry,
- struct pt_regs *regs)
-{
- unsigned int sp, next_sp;
- unsigned int next_ip;
- unsigned int lr;
- long level = 0;
- unsigned int __user *fp, *uregs;
-
- next_ip = perf_instruction_pointer(regs);
- lr = regs->link;
- sp = regs->gpr[1];
- perf_callchain_store(entry, next_ip);
-
- while (entry->nr < entry->max_stack) {
- fp = (unsigned int __user *) (unsigned long) sp;
- if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp))
- return;
- if (level > 0 && read_user_stack_32(&fp[1], &next_ip))
- return;
-
- uregs = signal_frame_32_regs(sp, next_sp, next_ip);
- if (!uregs && level <= 1)
- uregs = signal_frame_32_regs(sp, next_sp, lr);
- if (uregs) {
- /*
- * This looks like an signal frame, so restart
- * the stack trace with the values in it.
- */
- if (read_user_stack_32(&uregs[PT_NIP], &next_ip) ||
- read_user_stack_32(&uregs[PT_LNK], &lr) ||
- read_user_stack_32(&uregs[PT_R1], &sp))
- return;
- level = 0;
- perf_callchain_store_context(entry, PERF_CONTEXT_USER);
- perf_callchain_store(entry, next_ip);
- continue;
- }
-
- if (level == 0)
- next_ip = lr;
- perf_callchain_store(entry, next_ip);
- ++level;
- sp = next_sp;
- }
-}
-
void
perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
{
diff --git a/arch/powerpc/perf/callchain.h b/arch/powerpc/perf/callchain.h
new file mode 100644
index 000000000000..7a2cb9e1181a
--- /dev/null
+++ b/arch/powerpc/perf/callchain.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _POWERPC_PERF_CALLCHAIN_H
+#define _POWERPC_PERF_CALLCHAIN_H
+
+int read_user_stack_slow(void __user *ptr, void *buf, int nb);
+void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry,
+ struct pt_regs *regs);
+void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry,
+ struct pt_regs *regs);
+
+static inline bool invalid_user_sp(unsigned long sp)
+{
+ unsigned long mask = is_32bit_task() ? 3 : 7;
+ unsigned long top = STACK_TOP - (is_32bit_task() ? 16 : 32);
+
+ return (!sp || (sp & mask) || (sp > top));
+}
+
+#endif /* _POWERPC_PERF_CALLCHAIN_H */
diff --git a/arch/powerpc/perf/callchain_32.c b/arch/powerpc/perf/callchain_32.c
new file mode 100644
index 000000000000..8aa951003141
--- /dev/null
+++ b/arch/powerpc/perf/callchain_32.c
@@ -0,0 +1,196 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Performance counter callchain support - powerpc architecture code
+ *
+ * Copyright © 2009 Paul Mackerras, IBM Corporation.
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/perf_event.h>
+#include <linux/percpu.h>
+#include <linux/uaccess.h>
+#include <linux/mm.h>
+#include <asm/ptrace.h>
+#include <asm/pgtable.h>
+#include <asm/sigcontext.h>
+#include <asm/ucontext.h>
+#include <asm/vdso.h>
+#include <asm/pte-walk.h>
+
+#include "callchain.h"
+
+#ifdef CONFIG_PPC64
+#include "../kernel/ppc32.h"
+#else /* CONFIG_PPC64 */
+
+#define __SIGNAL_FRAMESIZE32 __SIGNAL_FRAMESIZE
+#define sigcontext32 sigcontext
+#define mcontext32 mcontext
+#define ucontext32 ucontext
+#define compat_siginfo_t struct siginfo
+
+#endif /* CONFIG_PPC64 */
+
+/*
+ * On 32-bit we just access the address and let hash_page create a
+ * HPTE if necessary, so there is no need to fall back to reading
+ * the page tables. Since this is called at interrupt level,
+ * do_page_fault() won't treat a DSI as a page fault.
+ */
+static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret)
+{
+ int rc;
+
+ if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) ||
+ ((unsigned long)ptr & 3))
+ return -EFAULT;
+
+ rc = probe_user_read(ret, ptr, sizeof(*ret));
+
+ if (IS_ENABLED(CONFIG_PPC64) && rc)
+ return read_user_stack_slow(ptr, ret, 4);
+
+ return rc;
+}
+
+/*
+ * Layout for non-RT signal frames
+ */
+struct signal_frame_32 {
+ char dummy[__SIGNAL_FRAMESIZE32];
+ struct sigcontext32 sctx;
+ struct mcontext32 mctx;
+ int abigap[56];
+};
+
+/*
+ * Layout for RT signal frames
+ */
+struct rt_signal_frame_32 {
+ char dummy[__SIGNAL_FRAMESIZE32 + 16];
+ compat_siginfo_t info;
+ struct ucontext32 uc;
+ int abigap[56];
+};
+
+static int is_sigreturn_32_address(unsigned int nip, unsigned int fp)
+{
+ if (nip == fp + offsetof(struct signal_frame_32, mctx.mc_pad))
+ return 1;
+ if (vdso32_sigtramp && current->mm->context.vdso_base &&
+ nip == current->mm->context.vdso_base + vdso32_sigtramp)
+ return 1;
+ return 0;
+}
+
+static int is_rt_sigreturn_32_address(unsigned int nip, unsigned int fp)
+{
+ if (nip == fp + offsetof(struct rt_signal_frame_32,
+ uc.uc_mcontext.mc_pad))
+ return 1;
+ if (vdso32_rt_sigtramp && current->mm->context.vdso_base &&
+ nip == current->mm->context.vdso_base + vdso32_rt_sigtramp)
+ return 1;
+ return 0;
+}
+
+static int sane_signal_32_frame(unsigned int sp)
+{
+ struct signal_frame_32 __user *sf;
+ unsigned int regs;
+
+ sf = (struct signal_frame_32 __user *) (unsigned long) sp;
+ if (read_user_stack_32((unsigned int __user *) &sf->sctx.regs, &regs))
+ return 0;
+ return regs == (unsigned long) &sf->mctx;
+}
+
+static int sane_rt_signal_32_frame(unsigned int sp)
+{
+ struct rt_signal_frame_32 __user *sf;
+ unsigned int regs;
+
+ sf = (struct rt_signal_frame_32 __user *) (unsigned long) sp;
+ if (read_user_stack_32((unsigned int __user *) &sf->uc.uc_regs, &regs))
+ return 0;
+ return regs == (unsigned long) &sf->uc.uc_mcontext;
+}
+
+static unsigned int __user *signal_frame_32_regs(unsigned int sp,
+ unsigned int next_sp, unsigned int next_ip)
+{
+ struct mcontext32 __user *mctx = NULL;
+ struct signal_frame_32 __user *sf;
+ struct rt_signal_frame_32 __user *rt_sf;
+
+ /*
+ * Note: the next_sp - sp >= signal frame size check
+ * is true when next_sp < sp, for example, when
+ * transitioning from an alternate signal stack to the
+ * normal stack.
+ */
+ if (next_sp - sp >= sizeof(struct signal_frame_32) &&
+ is_sigreturn_32_address(next_ip, sp) &&
+ sane_signal_32_frame(sp)) {
+ sf = (struct signal_frame_32 __user *) (unsigned long) sp;
+ mctx = &sf->mctx;
+ }
+
+ if (!mctx && next_sp - sp >= sizeof(struct rt_signal_frame_32) &&
+ is_rt_sigreturn_32_address(next_ip, sp) &&
+ sane_rt_signal_32_frame(sp)) {
+ rt_sf = (struct rt_signal_frame_32 __user *) (unsigned long) sp;
+ mctx = &rt_sf->uc.uc_mcontext;
+ }
+
+ if (!mctx)
+ return NULL;
+ return mctx->mc_gregs;
+}
+
+void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry,
+ struct pt_regs *regs)
+{
+ unsigned int sp, next_sp;
+ unsigned int next_ip;
+ unsigned int lr;
+ long level = 0;
+ unsigned int __user *fp, *uregs;
+
+ next_ip = perf_instruction_pointer(regs);
+ lr = regs->link;
+ sp = regs->gpr[1];
+ perf_callchain_store(entry, next_ip);
+
+ while (entry->nr < entry->max_stack) {
+ fp = (unsigned int __user *) (unsigned long) sp;
+ if (invalid_user_sp(sp) || read_user_stack_32(fp, &next_sp))
+ return;
+ if (level > 0 && read_user_stack_32(&fp[1], &next_ip))
+ return;
+
+ uregs = signal_frame_32_regs(sp, next_sp, next_ip);
+ if (!uregs && level <= 1)
+ uregs = signal_frame_32_regs(sp, next_sp, lr);
+ if (uregs) {
+ /*
+ * This looks like an signal frame, so restart
+ * the stack trace with the values in it.
+ */
+ if (read_user_stack_32(&uregs[PT_NIP], &next_ip) ||
+ read_user_stack_32(&uregs[PT_LNK], &lr) ||
+ read_user_stack_32(&uregs[PT_R1], &sp))
+ return;
+ level = 0;
+ perf_callchain_store_context(entry, PERF_CONTEXT_USER);
+ perf_callchain_store(entry, next_ip);
+ continue;
+ }
+
+ if (level == 0)
+ next_ip = lr;
+ perf_callchain_store(entry, next_ip);
+ ++level;
+ sp = next_sp;
+ }
+}
diff --git a/arch/powerpc/perf/callchain_64.c b/arch/powerpc/perf/callchain_64.c
new file mode 100644
index 000000000000..df1ffd8b20f2
--- /dev/null
+++ b/arch/powerpc/perf/callchain_64.c
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Performance counter callchain support - powerpc architecture code
+ *
+ * Copyright © 2009 Paul Mackerras, IBM Corporation.
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/perf_event.h>
+#include <linux/percpu.h>
+#include <linux/uaccess.h>
+#include <linux/mm.h>
+#include <asm/ptrace.h>
+#include <asm/pgtable.h>
+#include <asm/sigcontext.h>
+#include <asm/ucontext.h>
+#include <asm/vdso.h>
+#include <asm/pte-walk.h>
+
+#include "callchain.h"
+
+/*
+ * On 64-bit we don't want to invoke hash_page on user addresses from
+ * interrupt context, so if the access faults, we read the page tables
+ * to find which page (if any) is mapped and access it directly.
+ */
+int read_user_stack_slow(void __user *ptr, void *buf, int nb)
+{
+ int ret = -EFAULT;
+ pgd_t *pgdir;
+ pte_t *ptep, pte;
+ unsigned int shift;
+ unsigned long addr = (unsigned long) ptr;
+ unsigned long offset;
+ unsigned long pfn, flags;
+ void *kaddr;
+
+ pgdir = current->mm->pgd;
+ if (!pgdir)
+ return -EFAULT;
+
+ local_irq_save(flags);
+ ptep = find_current_mm_pte(pgdir, addr, NULL, &shift);
+ if (!ptep)
+ goto err_out;
+ if (!shift)
+ shift = PAGE_SHIFT;
+
+ /* align address to page boundary */
+ offset = addr & ((1UL << shift) - 1);
+
+ pte = READ_ONCE(*ptep);
+ if (!pte_present(pte) || !pte_user(pte))
+ goto err_out;
+ pfn = pte_pfn(pte);
+ if (!page_is_ram(pfn))
+ goto err_out;
+
+ /* no highmem to worry about here */
+ kaddr = pfn_to_kaddr(pfn);
+ memcpy(buf, kaddr + offset, nb);
+ ret = 0;
+err_out:
+ local_irq_restore(flags);
+ return ret;
+}
+
+static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret)
+{
+ if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned long) ||
+ ((unsigned long)ptr & 7))
+ return -EFAULT;
+
+ if (!probe_user_read(ret, ptr, sizeof(*ret)))
+ return 0;
+
+ return read_user_stack_slow(ptr, ret, 8);
+}
+
+/*
+ * 64-bit user processes use the same stack frame for RT and non-RT signals.
+ */
+struct signal_frame_64 {
+ char dummy[__SIGNAL_FRAMESIZE];
+ struct ucontext uc;
+ unsigned long unused[2];
+ unsigned int tramp[6];
+ struct siginfo *pinfo;
+ void *puc;
+ struct siginfo info;
+ char abigap[288];
+};
+
+static int is_sigreturn_64_address(unsigned long nip, unsigned long fp)
+{
+ if (nip == fp + offsetof(struct signal_frame_64, tramp))
+ return 1;
+ if (vdso64_rt_sigtramp && current->mm->context.vdso_base &&
+ nip == current->mm->context.vdso_base + vdso64_rt_sigtramp)
+ return 1;
+ return 0;
+}
+
+/*
+ * Do some sanity checking on the signal frame pointed to by sp.
+ * We check the pinfo and puc pointers in the frame.
+ */
+static int sane_signal_64_frame(unsigned long sp)
+{
+ struct signal_frame_64 __user *sf;
+ unsigned long pinfo, puc;
+
+ sf = (struct signal_frame_64 __user *) sp;
+ if (read_user_stack_64((unsigned long __user *) &sf->pinfo, &pinfo) ||
+ read_user_stack_64((unsigned long __user *) &sf->puc, &puc))
+ return 0;
+ return pinfo == (unsigned long) &sf->info &&
+ puc == (unsigned long) &sf->uc;
+}
+
+void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry,
+ struct pt_regs *regs)
+{
+ unsigned long sp, next_sp;
+ unsigned long next_ip;
+ unsigned long lr;
+ long level = 0;
+ struct signal_frame_64 __user *sigframe;
+ unsigned long __user *fp, *uregs;
+
+ next_ip = perf_instruction_pointer(regs);
+ lr = regs->link;
+ sp = regs->gpr[1];
+ perf_callchain_store(entry, next_ip);
+
+ while (entry->nr < entry->max_stack) {
+ fp = (unsigned long __user *) sp;
+ if (invalid_user_sp(sp) || read_user_stack_64(fp, &next_sp))
+ return;
+ if (level > 0 && read_user_stack_64(&fp[2], &next_ip))
+ return;
+
+ /*
+ * Note: the next_sp - sp >= signal frame size check
+ * is true when next_sp < sp, which can happen when
+ * transitioning from an alternate signal stack to the
+ * normal stack.
+ */
+ if (next_sp - sp >= sizeof(struct signal_frame_64) &&
+ (is_sigreturn_64_address(next_ip, sp) ||
+ (level <= 1 && is_sigreturn_64_address(lr, sp))) &&
+ sane_signal_64_frame(sp)) {
+ /*
+ * This looks like an signal frame
+ */
+ sigframe = (struct signal_frame_64 __user *) sp;
+ uregs = sigframe->uc.uc_mcontext.gp_regs;
+ if (read_user_stack_64(&uregs[PT_NIP], &next_ip) ||
+ read_user_stack_64(&uregs[PT_LNK], &lr) ||
+ read_user_stack_64(&uregs[PT_R1], &sp))
+ return;
+ level = 0;
+ perf_callchain_store_context(entry, PERF_CONTEXT_USER);
+ perf_callchain_store(entry, next_ip);
+ continue;
+ }
+
+ if (level == 0)
+ next_ip = lr;
+ perf_callchain_store(entry, next_ip);
+ ++level;
+ sp = next_sp;
+ }
+}
diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index cb50a9e1fd2d..eb82dda884e5 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -44,6 +44,16 @@ static DEFINE_PER_CPU(u64 *, trace_imc_mem);
static struct imc_pmu_ref *trace_imc_refc;
static int trace_imc_mem_size;
+/*
+ * Global data structure used to avoid races between thread,
+ * core and trace-imc
+ */
+static struct imc_pmu_ref imc_global_refc = {
+ .lock = __MUTEX_INITIALIZER(imc_global_refc.lock),
+ .id = 0,
+ .refc = 0,
+};
+
static struct imc_pmu *imc_event_to_pmu(struct perf_event *event)
{
return container_of(event->pmu, struct imc_pmu, pmu);
@@ -698,6 +708,16 @@ static int ppc_core_imc_cpu_offline(unsigned int cpu)
return -EINVAL;
ref->refc = 0;
+ /*
+ * Reduce the global reference count, if this is the
+ * last cpu in this core and core-imc event running
+ * in this cpu.
+ */
+ mutex_lock(&imc_global_refc.lock);
+ if (imc_global_refc.id == IMC_DOMAIN_CORE)
+ imc_global_refc.refc--;
+
+ mutex_unlock(&imc_global_refc.lock);
}
return 0;
}
@@ -710,6 +730,23 @@ static int core_imc_pmu_cpumask_init(void)
ppc_core_imc_cpu_offline);
}
+static void reset_global_refc(struct perf_event *event)
+{
+ mutex_lock(&imc_global_refc.lock);
+ imc_global_refc.refc--;
+
+ /*
+ * If no other thread is running any
+ * event for this domain(thread/core/trace),
+ * set the global id to zero.
+ */
+ if (imc_global_refc.refc <= 0) {
+ imc_global_refc.refc = 0;
+ imc_global_refc.id = 0;
+ }
+ mutex_unlock(&imc_global_refc.lock);
+}
+
static void core_imc_counters_release(struct perf_event *event)
{
int rc, core_id;
@@ -759,6 +796,8 @@ static void core_imc_counters_release(struct perf_event *event)
ref->refc = 0;
}
mutex_unlock(&ref->lock);
+
+ reset_global_refc(event);
}
static int core_imc_event_init(struct perf_event *event)
@@ -819,6 +858,29 @@ static int core_imc_event_init(struct perf_event *event)
++ref->refc;
mutex_unlock(&ref->lock);
+ /*
+ * Since the system can run either in accumulation or trace-mode
+ * of IMC at a time, core-imc events are allowed only if no other
+ * trace/thread imc events are enabled/monitored.
+ *
+ * Take the global lock, and check the refc.id
+ * to know whether any other trace/thread imc
+ * events are running.
+ */
+ mutex_lock(&imc_global_refc.lock);
+ if (imc_global_refc.id == 0 || imc_global_refc.id == IMC_DOMAIN_CORE) {
+ /*
+ * No other trace/thread imc events are running in
+ * the system, so set the refc.id to core-imc.
+ */
+ imc_global_refc.id = IMC_DOMAIN_CORE;
+ imc_global_refc.refc++;
+ } else {
+ mutex_unlock(&imc_global_refc.lock);
+ return -EBUSY;
+ }
+ mutex_unlock(&imc_global_refc.lock);
+
event->hw.event_base = (u64)pcmi->vbase + (config & IMC_EVENT_OFFSET_MASK);
event->destroy = core_imc_counters_release;
return 0;
@@ -877,7 +939,23 @@ static int ppc_thread_imc_cpu_online(unsigned int cpu)
static int ppc_thread_imc_cpu_offline(unsigned int cpu)
{
- mtspr(SPRN_LDBAR, 0);
+ /*
+ * Set the bit 0 of LDBAR to zero.
+ *
+ * If bit 0 of LDBAR is unset, it will stop posting
+ * the counter data to memory.
+ * For thread-imc, bit 0 of LDBAR will be set to 1 in the
+ * event_add function. So reset this bit here, to stop the updates
+ * to memory in the cpu_offline path.
+ */
+ mtspr(SPRN_LDBAR, (mfspr(SPRN_LDBAR) & (~(1UL << 63))));
+
+ /* Reduce the refc if thread-imc event running on this cpu */
+ mutex_lock(&imc_global_refc.lock);
+ if (imc_global_refc.id == IMC_DOMAIN_THREAD)
+ imc_global_refc.refc--;
+ mutex_unlock(&imc_global_refc.lock);
+
return 0;
}
@@ -916,7 +994,22 @@ static int thread_imc_event_init(struct perf_event *event)
if (!target)
return -EINVAL;
+ mutex_lock(&imc_global_refc.lock);
+ /*
+ * Check if any other trace/core imc events are running in the
+ * system, if not set the global id to thread-imc.
+ */
+ if (imc_global_refc.id == 0 || imc_global_refc.id == IMC_DOMAIN_THREAD) {
+ imc_global_refc.id = IMC_DOMAIN_THREAD;
+ imc_global_refc.refc++;
+ } else {
+ mutex_unlock(&imc_global_refc.lock);
+ return -EBUSY;
+ }
+ mutex_unlock(&imc_global_refc.lock);
+
event->pmu->task_ctx_nr = perf_sw_context;
+ event->destroy = reset_global_refc;
return 0;
}
@@ -1063,10 +1156,12 @@ static void thread_imc_event_del(struct perf_event *event, int flags)
int core_id;
struct imc_pmu_ref *ref;
- mtspr(SPRN_LDBAR, 0);
-
core_id = smp_processor_id() / threads_per_core;
ref = &core_imc_refc[core_id];
+ if (!ref) {
+ pr_debug("imc: Failed to get event reference count\n");
+ return;
+ }
mutex_lock(&ref->lock);
ref->refc--;
@@ -1082,6 +1177,10 @@ static void thread_imc_event_del(struct perf_event *event, int flags)
ref->refc = 0;
}
mutex_unlock(&ref->lock);
+
+ /* Set bit 0 of LDBAR to zero, to stop posting updates to memory */
+ mtspr(SPRN_LDBAR, (mfspr(SPRN_LDBAR) & (~(1UL << 63))));
+
/*
* Take a snapshot and calculate the delta and update
* the event counter values.
@@ -1133,7 +1232,18 @@ static int ppc_trace_imc_cpu_online(unsigned int cpu)
static int ppc_trace_imc_cpu_offline(unsigned int cpu)
{
- mtspr(SPRN_LDBAR, 0);
+ /*
+ * No need to set bit 0 of LDBAR to zero, as
+ * it is set to zero for imc trace-mode
+ *
+ * Reduce the refc if any trace-imc event running
+ * on this cpu.
+ */
+ mutex_lock(&imc_global_refc.lock);
+ if (imc_global_refc.id == IMC_DOMAIN_TRACE)
+ imc_global_refc.refc--;
+ mutex_unlock(&imc_global_refc.lock);
+
return 0;
}
@@ -1226,15 +1336,14 @@ static int trace_imc_event_add(struct perf_event *event, int flags)
local_mem = get_trace_imc_event_base_addr();
ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | TRACE_IMC_ENABLE;
- if (core_imc_refc)
- ref = &core_imc_refc[core_id];
+ /* trace-imc reference count */
+ if (trace_imc_refc)
+ ref = &trace_imc_refc[core_id];
if (!ref) {
- /* If core-imc is not enabled, use trace-imc reference count */
- if (trace_imc_refc)
- ref = &trace_imc_refc[core_id];
- if (!ref)
- return -EINVAL;
+ pr_debug("imc: Failed to get the event reference count\n");
+ return -EINVAL;
}
+
mtspr(SPRN_LDBAR, ldbar_value);
mutex_lock(&ref->lock);
if (ref->refc == 0) {
@@ -1242,13 +1351,11 @@ static int trace_imc_event_add(struct perf_event *event, int flags)
get_hard_smp_processor_id(smp_processor_id()))) {
mutex_unlock(&ref->lock);
pr_err("trace-imc: Unable to start the counters for core %d\n", core_id);
- mtspr(SPRN_LDBAR, 0);
return -EINVAL;
}
}
++ref->refc;
mutex_unlock(&ref->lock);
-
return 0;
}
@@ -1274,16 +1381,13 @@ static void trace_imc_event_del(struct perf_event *event, int flags)
int core_id = smp_processor_id() / threads_per_core;
struct imc_pmu_ref *ref = NULL;
- if (core_imc_refc)
- ref = &core_imc_refc[core_id];
+ if (trace_imc_refc)
+ ref = &trace_imc_refc[core_id];
if (!ref) {
- /* If core-imc is not enabled, use trace-imc reference count */
- if (trace_imc_refc)
- ref = &trace_imc_refc[core_id];
- if (!ref)
- return;
+ pr_debug("imc: Failed to get event reference count\n");
+ return;
}
- mtspr(SPRN_LDBAR, 0);
+
mutex_lock(&ref->lock);
ref->refc--;
if (ref->refc == 0) {
@@ -1297,6 +1401,7 @@ static void trace_imc_event_del(struct perf_event *event, int flags)
ref->refc = 0;
}
mutex_unlock(&ref->lock);
+
trace_imc_event_stop(event, flags);
}
@@ -1314,10 +1419,30 @@ static int trace_imc_event_init(struct perf_event *event)
if (event->attr.sample_period == 0)
return -ENOENT;
+ /*
+ * Take the global lock, and make sure
+ * no other thread is running any core/thread imc
+ * events
+ */
+ mutex_lock(&imc_global_refc.lock);
+ if (imc_global_refc.id == 0 || imc_global_refc.id == IMC_DOMAIN_TRACE) {
+ /*
+ * No core/thread imc events are running in the
+ * system, so set the refc.id to trace-imc.
+ */
+ imc_global_refc.id = IMC_DOMAIN_TRACE;
+ imc_global_refc.refc++;
+ } else {
+ mutex_unlock(&imc_global_refc.lock);
+ return -EBUSY;
+ }
+ mutex_unlock(&imc_global_refc.lock);
+
event->hw.idx = -1;
target = event->hw.target;
event->pmu->task_ctx_nr = perf_hw_context;
+ event->destroy = reset_global_refc;
return 0;
}
@@ -1429,10 +1554,10 @@ static void cleanup_all_core_imc_memory(void)
static void thread_imc_ldbar_disable(void *dummy)
{
/*
- * By Zeroing LDBAR, we disable thread-imc
- * updates.
+ * By setting 0th bit of LDBAR to zero, we disable thread-imc
+ * updates to memory.
*/
- mtspr(SPRN_LDBAR, 0);
+ mtspr(SPRN_LDBAR, (mfspr(SPRN_LDBAR) & (~(1UL << 63))));
}
void thread_imc_disable(void)
diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c
index d6d64f8718e6..13b369d2cc45 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -231,16 +231,10 @@ static int memtrace_online(void)
continue;
}
- /*
- * If kernel isn't compiled with the auto online option
- * we need to online the memory ourselves.
- */
- if (!memhp_auto_online) {
- lock_device_hotplug();
- walk_memory_blocks(ent->start, ent->size, NULL,
- online_mem_block);
- unlock_device_hotplug();
- }
+ lock_device_hotplug();
+ walk_memory_blocks(ent->start, ent->size, NULL,
+ online_mem_block);
+ unlock_device_hotplug();
/*
* Memory was added successfully so clean up references to it
diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c
index 968b9a4d1cd9..7824cc364bc4 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -268,14 +268,7 @@ static int opal_imc_counters_probe(struct platform_device *pdev)
domain = IMC_DOMAIN_THREAD;
break;
case IMC_TYPE_TRACE:
- /*
- * FIXME. Using trace_imc events to monitor application
- * or KVM thread performance can cause a checkstop
- * (system crash).
- * Disable it for now.
- */
- pr_info_once("IMC: disabling trace_imc PMU\n");
- domain = -1;
+ domain = IMC_DOMAIN_TRACE;
break;
default:
pr_warn("IMC Unknown Device type \n");
diff --git a/arch/powerpc/platforms/ps3/os-area.c b/arch/powerpc/platforms/ps3/os-area.c
index cbddd63caf2d..e8530371aed6 100644
--- a/arch/powerpc/platforms/ps3/os-area.c
+++ b/arch/powerpc/platforms/ps3/os-area.c
@@ -613,10 +613,8 @@ static int update_flash_db(void)
/* Read in header and db from flash. */
header = kmalloc(buf_len, GFP_KERNEL);
- if (!header) {
- pr_debug("%s: kmalloc failed\n", __func__);
+ if (!header)
return -ENOMEM;
- }
count = os_area_flash_read(header, buf_len, 0);
if (count < 0) {
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 2e0a8eab5588..6d47b4a3ce39 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -945,6 +945,15 @@ static phys_addr_t ddw_memory_hotplug_max(void)
phys_addr_t max_addr = memory_hotplug_max();
struct device_node *memory;
+ /*
+ * The "ibm,pmemory" can appear anywhere in the address space.
+ * Assuming it is still backed by page structs, set the upper limit
+ * for the huge DMA window as MAX_PHYSMEM_BITS.
+ */
+ if (of_find_node_by_type(NULL, "ibm,pmemory"))
+ return (sizeof(phys_addr_t) * 8 <= MAX_PHYSMEM_BITS) ?
+ (phys_addr_t) -1 : (1ULL << MAX_PHYSMEM_BITS);
+
for_each_node_by_type(memory, "memory") {
unsigned long start, size;
int n_mem_addr_cells, n_mem_size_cells, len;
diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
index e4606100e286..f35592423380 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -286,25 +286,6 @@ static int papr_scm_ndctl(struct nvdimm_bus_descriptor *nd_desc,
return 0;
}
-static inline int papr_scm_node(int node)
-{
- int min_dist = INT_MAX, dist;
- int nid, min_node;
-
- if ((node == NUMA_NO_NODE) || node_online(node))
- return node;
-
- min_node = first_online_node;
- for_each_online_node(nid) {
- dist = node_distance(node, nid);
- if (dist < min_dist) {
- min_dist = dist;
- min_node = nid;
- }
- }
- return min_node;
-}
-
static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
{
struct device *dev = &p->pdev->dev;
@@ -329,7 +310,7 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
}
dimm_flags = 0;
- set_bit(NDD_ALIASING, &dimm_flags);
+ set_bit(NDD_LABELING, &dimm_flags);
p->nvdimm = nvdimm_create(p->bus, p, NULL, dimm_flags,
PAPR_SCM_DIMM_CMD_MASK, 0, NULL);
@@ -350,7 +331,7 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
memset(&ndr_desc, 0, sizeof(ndr_desc));
target_nid = dev_to_node(&p->pdev->dev);
- online_nid = papr_scm_node(target_nid);
+ online_nid = numa_map_to_online_node(target_nid);
ndr_desc.numa_node = online_nid;
ndr_desc.target_node = target_nid;
ndr_desc.res = &p->res;
@@ -362,8 +343,10 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
if (p->is_volatile)
p->region = nvdimm_volatile_region_create(p->bus, &ndr_desc);
- else
+ else {
+ set_bit(ND_REGION_PERSIST_MEMCTRL, &ndr_desc.flags);
p->region = nvdimm_pmem_region_create(p->bus, &ndr_desc);
+ }
if (!p->region) {
dev_err(dev, "Error registering region %pR from %pOF\n",
ndr_desc.res, p->dn);
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index aa6208c8d4f0..1d1da639b8b7 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -686,6 +686,17 @@ static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
#endif
out:
+ /*
+ * Enable translation as we will be accessing per-cpu variables
+ * in save_mce_event() which may fall outside RMO region, also
+ * leave it enabled because subsequently we will be queuing work
+ * to workqueues where again per-cpu variables accessed, besides
+ * fwnmi_release_errinfo() crashes when called in realmode on
+ * pseries.
+ * Note: All the realmode handling like flushing SLB entries for
+ * SLB multihit is done by now.
+ */
+ mtmsr(mfmsr() | MSR_IR | MSR_DR);
save_mce_event(regs, disposition == RTAS_DISP_FULLY_RECOVERED,
&mce_err, regs->nip, eaddr, paddr);
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 8672e77a5b7a..a197258595ef 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -20,7 +20,6 @@ config RISCV
select CLONE_BACKWARDS
select COMMON_CLK
select GENERIC_CLOCKEVENTS
- select GENERIC_CPU_DEVICES
select GENERIC_IRQ_SHOW
select GENERIC_PCI_IOMAP
select GENERIC_SCHED_CLOCK
@@ -29,6 +28,7 @@ config RISCV
select GENERIC_SMP_IDLE_THREAD
select GENERIC_ATOMIC64 if !64BIT
select GENERIC_IOREMAP
+ select GENERIC_PTDUMP if MMU
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ASM_MODVERSIONS
@@ -58,6 +58,9 @@ config RISCV
select HAVE_EBPF_JIT
select EDAC_SUPPORT
select ARCH_HAS_GIGANTIC_PAGE
+ select ARCH_HAS_SET_DIRECT_MAP
+ select ARCH_HAS_SET_MEMORY
+ select ARCH_HAS_STRICT_KERNEL_RWX
select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
select SPARSEMEM_STATIC if 32BIT
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
@@ -129,6 +132,9 @@ config ARCH_SELECT_MEMORY_MODEL
config ARCH_WANT_GENERAL_HUGETLB
def_bool y
+config ARCH_SUPPORTS_DEBUG_PAGEALLOC
+ def_bool y
+
config SYS_SUPPORTS_HUGETLBFS
def_bool y
@@ -247,6 +253,17 @@ config NR_CPUS
depends on SMP
default "8"
+config HOTPLUG_CPU
+ bool "Support for hot-pluggable CPUs"
+ depends on SMP
+ select GENERIC_IRQ_MIGRATION
+ help
+
+ Say Y here to experiment with turning CPUs off and on. CPUs
+ can be controlled through /sys/devices/system/cpu.
+
+ Say N if you want to disable CPU hotplug.
+
choice
prompt "CPU Tuning"
default TUNE_GENERIC
@@ -307,6 +324,13 @@ config SECCOMP
and the task is only allowed to execute a few safe syscalls
defined by each seccomp mode.
+config RISCV_SBI_V01
+ bool "SBI v0.1 support"
+ default y
+ depends on RISCV_SBI
+ help
+ This config allows kernel to use SBI v0.1 APIs. This will be
+ deprecated in future once legacy M-mode software are no longer in use.
endmenu
menu "Boot options"
diff --git a/arch/riscv/Kconfig.socs b/arch/riscv/Kconfig.socs
index a131174a0a77..216286db81c9 100644
--- a/arch/riscv/Kconfig.socs
+++ b/arch/riscv/Kconfig.socs
@@ -20,4 +20,14 @@ config SOC_VIRT
help
This enables support for QEMU Virt Machine.
+config SOC_KENDRYTE
+ bool "Kendryte K210 SoC"
+ depends on !MMU
+ select BUILTIN_DTB
+ select SERIAL_SIFIVE if TTY
+ select SERIAL_SIFIVE_CONSOLE if TTY
+ select SIFIVE_PLIC
+ help
+ This enables support for Kendryte K210 SoC platform hardware.
+
endmenu
diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile
index 259cb53d7f20..fb6e37db836d 100644
--- a/arch/riscv/Makefile
+++ b/arch/riscv/Makefile
@@ -85,12 +85,12 @@ PHONY += vdso_install
vdso_install:
$(Q)$(MAKE) $(build)=arch/riscv/kernel/vdso $@
-ifeq ($(CONFIG_RISCV_M_MODE),y)
-KBUILD_IMAGE := $(boot)/loader
+ifeq ($(CONFIG_RISCV_M_MODE)$(CONFIG_SOC_KENDRYTE),yy)
+KBUILD_IMAGE := $(boot)/loader.bin
else
KBUILD_IMAGE := $(boot)/Image.gz
endif
-BOOT_TARGETS := Image Image.gz loader
+BOOT_TARGETS := Image Image.gz loader loader.bin
all: $(notdir $(KBUILD_IMAGE))
diff --git a/arch/riscv/boot/Makefile b/arch/riscv/boot/Makefile
index 36db8145f9f4..3530c59b3ea7 100644
--- a/arch/riscv/boot/Makefile
+++ b/arch/riscv/boot/Makefile
@@ -41,6 +41,9 @@ $(obj)/Image.lzma: $(obj)/Image FORCE
$(obj)/Image.lzo: $(obj)/Image FORCE
$(call if_changed,lzo)
+$(obj)/loader.bin: $(obj)/loader FORCE
+ $(call if_changed,objcopy)
+
install:
$(CONFIG_SHELL) $(srctree)/$(src)/install.sh $(KERNELRELEASE) \
$(obj)/Image System.map "$(INSTALL_PATH)"
diff --git a/arch/riscv/boot/dts/Makefile b/arch/riscv/boot/dts/Makefile
index dcc3ada78455..557f0b519c8e 100644
--- a/arch/riscv/boot/dts/Makefile
+++ b/arch/riscv/boot/dts/Makefile
@@ -1,2 +1,3 @@
# SPDX-License-Identifier: GPL-2.0
subdir-y += sifive
+subdir-y += kendryte
diff --git a/arch/riscv/boot/dts/kendryte/Makefile b/arch/riscv/boot/dts/kendryte/Makefile
new file mode 100644
index 000000000000..815444e69e89
--- /dev/null
+++ b/arch/riscv/boot/dts/kendryte/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0
+dtb-$(CONFIG_SOC_KENDRYTE) += k210.dtb
diff --git a/arch/riscv/boot/dts/kendryte/k210.dts b/arch/riscv/boot/dts/kendryte/k210.dts
new file mode 100644
index 000000000000..0d1f28fce6b2
--- /dev/null
+++ b/arch/riscv/boot/dts/kendryte/k210.dts
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2020 Western Digital Corporation or its affiliates.
+ */
+
+/dts-v1/;
+
+#include "k210.dtsi"
+
+/ {
+ model = "Kendryte K210 generic";
+ compatible = "kendryte,k210";
+
+ chosen {
+ bootargs = "earlycon console=ttySIF0";
+ stdout-path = "serial0";
+ };
+};
+
+&uarths0 {
+ status = "okay";
+};
+
diff --git a/arch/riscv/boot/dts/kendryte/k210.dtsi b/arch/riscv/boot/dts/kendryte/k210.dtsi
new file mode 100644
index 000000000000..c1df56ccb8d5
--- /dev/null
+++ b/arch/riscv/boot/dts/kendryte/k210.dtsi
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2019 Sean Anderson <seanga2@gmail.com>
+ * Copyright (C) 2020 Western Digital Corporation or its affiliates.
+ */
+#include <dt-bindings/clock/k210-clk.h>
+
+/ {
+ /*
+ * Although the K210 is a 64-bit CPU, the address bus is only 32-bits
+ * wide, and the upper half of all addresses is ignored.
+ */
+ #address-cells = <1>;
+ #size-cells = <1>;
+ compatible = "kendryte,k210";
+
+ aliases {
+ serial0 = &uarths0;
+ };
+
+ /*
+ * The K210 has an sv39 MMU following the priviledge specification v1.9.
+ * Since this is a non-ratified draft specification, the kernel does not
+ * support it and the K210 support enabled only for the !MMU case.
+ * Be consistent with this by setting the CPUs MMU type to "none".
+ */
+ cpus {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ timebase-frequency = <7800000>;
+ cpu0: cpu@0 {
+ device_type = "cpu";
+ reg = <0>;
+ compatible = "kendryte,k210", "sifive,rocket0", "riscv";
+ riscv,isa = "rv64imafdc";
+ mmu-type = "none";
+ i-cache-size = <0x8000>;
+ i-cache-block-size = <64>;
+ d-cache-size = <0x8000>;
+ d-cache-block-size = <64>;
+ clocks = <&sysctl K210_CLK_CPU>;
+ clock-frequency = <390000000>;
+ cpu0_intc: interrupt-controller {
+ #interrupt-cells = <1>;
+ interrupt-controller;
+ compatible = "riscv,cpu-intc";
+ };
+ };
+ cpu1: cpu@1 {
+ device_type = "cpu";
+ reg = <1>;
+ compatible = "kendryte,k210", "sifive,rocket0", "riscv";
+ riscv,isa = "rv64imafdc";
+ mmu-type = "none";
+ i-cache-size = <0x8000>;
+ i-cache-block-size = <64>;
+ d-cache-size = <0x8000>;
+ d-cache-block-size = <64>;
+ clocks = <&sysctl K210_CLK_CPU>;
+ clock-frequency = <390000000>;
+ cpu1_intc: interrupt-controller {
+ #interrupt-cells = <1>;
+ interrupt-controller;
+ compatible = "riscv,cpu-intc";
+ };
+ };
+ };
+
+ sram: memory@80000000 {
+ device_type = "memory";
+ reg = <0x80000000 0x400000>,
+ <0x80400000 0x200000>,
+ <0x80600000 0x200000>;
+ reg-names = "sram0", "sram1", "aisram";
+ };
+
+ clocks {
+ in0: oscillator {
+ compatible = "fixed-clock";
+ #clock-cells = <0>;
+ clock-frequency = <26000000>;
+ };
+ };
+
+ soc {
+ #address-cells = <1>;
+ #size-cells = <1>;
+ compatible = "kendryte,k210-soc", "simple-bus";
+ ranges;
+ interrupt-parent = <&plic0>;
+
+ sysctl: sysctl@50440000 {
+ compatible = "kendryte,k210-sysctl", "simple-mfd";
+ reg = <0x50440000 0x1000>;
+ #clock-cells = <1>;
+ };
+
+ clint0: interrupt-controller@2000000 {
+ compatible = "riscv,clint0";
+ reg = <0x2000000 0xC000>;
+ interrupts-extended = <&cpu0_intc 3>, <&cpu1_intc 3>;
+ clocks = <&sysctl K210_CLK_ACLK>;
+ };
+
+ plic0: interrupt-controller@c000000 {
+ #interrupt-cells = <1>;
+ interrupt-controller;
+ compatible = "kendryte,k210-plic0", "riscv,plic0";
+ reg = <0xC000000 0x4000000>;
+ interrupts-extended = <&cpu0_intc 11>, <&cpu0_intc 0xffffffff>,
+ <&cpu1_intc 11>, <&cpu1_intc 0xffffffff>;
+ riscv,ndev = <65>;
+ riscv,max-priority = <7>;
+ };
+
+ uarths0: serial@38000000 {
+ compatible = "kendryte,k210-uarths", "sifive,uart0";
+ reg = <0x38000000 0x1000>;
+ interrupts = <33>;
+ clocks = <&sysctl K210_CLK_CPU>;
+ };
+ };
+};
diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig
index 2557c5372a25..4da4886246a4 100644
--- a/arch/riscv/configs/defconfig
+++ b/arch/riscv/configs/defconfig
@@ -128,3 +128,4 @@ CONFIG_DEBUG_BLOCK_EXT_DEVT=y
# CONFIG_FTRACE is not set
# CONFIG_RUNTIME_TESTING_MENU is not set
CONFIG_MEMTEST=y
+# CONFIG_SYSFS_SYSCALL is not set
diff --git a/arch/riscv/configs/nommu_k210_defconfig b/arch/riscv/configs/nommu_k210_defconfig
new file mode 100644
index 000000000000..632aa2f95e57
--- /dev/null
+++ b/arch/riscv/configs/nommu_k210_defconfig
@@ -0,0 +1,68 @@
+# CONFIG_CPU_ISOLATION is not set
+CONFIG_LOG_BUF_SHIFT=15
+CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT=12
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_INITRAMFS_SOURCE=""
+CONFIG_INITRAMFS_FORCE=y
+# CONFIG_RD_BZIP2 is not set
+# CONFIG_RD_LZMA is not set
+# CONFIG_RD_XZ is not set
+# CONFIG_RD_LZO is not set
+# CONFIG_RD_LZ4 is not set
+# CONFIG_BOOT_CONFIG is not set
+CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+# CONFIG_SYSFS_SYSCALL is not set
+# CONFIG_FHANDLE is not set
+# CONFIG_BASE_FULL is not set
+# CONFIG_EPOLL is not set
+# CONFIG_SIGNALFD is not set
+# CONFIG_TIMERFD is not set
+# CONFIG_EVENTFD is not set
+# CONFIG_AIO is not set
+# CONFIG_IO_URING is not set
+# CONFIG_ADVISE_SYSCALLS is not set
+# CONFIG_MEMBARRIER is not set
+# CONFIG_KALLSYMS is not set
+CONFIG_EMBEDDED=y
+# CONFIG_VM_EVENT_COUNTERS is not set
+# CONFIG_COMPAT_BRK is not set
+CONFIG_SLOB=y
+# CONFIG_SLAB_MERGE_DEFAULT is not set
+# CONFIG_MMU is not set
+CONFIG_SOC_KENDRYTE=y
+CONFIG_MAXPHYSMEM_2GB=y
+CONFIG_SMP=y
+CONFIG_NR_CPUS=2
+CONFIG_CMDLINE="earlycon console=ttySIF0"
+CONFIG_CMDLINE_FORCE=y
+CONFIG_USE_BUILTIN_DTB=y
+CONFIG_BUILTIN_DTB_SOURCE="kendryte/k210"
+# CONFIG_BLOCK is not set
+CONFIG_BINFMT_FLAT=y
+# CONFIG_COREDUMP is not set
+CONFIG_DEVTMPFS=y
+CONFIG_DEVTMPFS_MOUNT=y
+# CONFIG_FW_LOADER is not set
+# CONFIG_ALLOW_DEV_COREDUMP is not set
+# CONFIG_INPUT_KEYBOARD is not set
+# CONFIG_INPUT_MOUSE is not set
+# CONFIG_SERIO is not set
+# CONFIG_LEGACY_PTYS is not set
+# CONFIG_LDISC_AUTOLOAD is not set
+# CONFIG_DEVMEM is not set
+# CONFIG_HW_RANDOM is not set
+# CONFIG_HWMON is not set
+# CONFIG_VGA_CONSOLE is not set
+# CONFIG_HID is not set
+# CONFIG_USB_SUPPORT is not set
+# CONFIG_VIRTIO_MENU is not set
+# CONFIG_DNOTIFY is not set
+# CONFIG_INOTIFY_USER is not set
+# CONFIG_MISC_FILESYSTEMS is not set
+CONFIG_LSM="[]"
+CONFIG_PRINTK_TIME=y
+# CONFIG_DEBUG_MISC is not set
+# CONFIG_SCHED_DEBUG is not set
+# CONFIG_RCU_TRACE is not set
+# CONFIG_FTRACE is not set
+# CONFIG_RUNTIME_TESTING_MENU is not set
diff --git a/arch/riscv/configs/rv32_defconfig b/arch/riscv/configs/rv32_defconfig
index 0292879a9690..05bbf5240569 100644
--- a/arch/riscv/configs/rv32_defconfig
+++ b/arch/riscv/configs/rv32_defconfig
@@ -124,3 +124,4 @@ CONFIG_DEBUG_BLOCK_EXT_DEVT=y
# CONFIG_FTRACE is not set
# CONFIG_RUNTIME_TESTING_MENU is not set
CONFIG_MEMTEST=y
+# CONFIG_SYSFS_SYSCALL is not set
diff --git a/arch/riscv/include/asm/bug.h b/arch/riscv/include/asm/bug.h
index 75604fec1b1b..d6f1ec08d97b 100644
--- a/arch/riscv/include/asm/bug.h
+++ b/arch/riscv/include/asm/bug.h
@@ -19,6 +19,14 @@
#define __BUG_INSN_32 _UL(0x00100073) /* ebreak */
#define __BUG_INSN_16 _UL(0x9002) /* c.ebreak */
+#define GET_INSN_LENGTH(insn) \
+({ \
+ unsigned long __len; \
+ __len = ((insn & __INSN_LENGTH_MASK) == __INSN_LENGTH_32) ? \
+ 4UL : 2UL; \
+ __len; \
+})
+
typedef u32 bug_insn_t;
#ifdef CONFIG_GENERIC_BUG_RELATIVE_POINTERS
diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h
index 555b20b11dc3..c8677c75f82c 100644
--- a/arch/riscv/include/asm/cacheflush.h
+++ b/arch/riscv/include/asm/cacheflush.h
@@ -85,7 +85,7 @@ static inline void flush_dcache_page(struct page *page)
* so instead we just flush the whole thing.
*/
#define flush_icache_range(start, end) flush_icache_all()
-#define flush_icache_user_range(vma, pg, addr, len) flush_icache_all()
+#define flush_icache_user_range(vma, pg, addr, len) flush_icache_mm(vma->vm_mm, 0)
#ifndef CONFIG_SMP
diff --git a/arch/riscv/include/asm/cpu_ops.h b/arch/riscv/include/asm/cpu_ops.h
new file mode 100644
index 000000000000..a8ec3c5c1bd2
--- /dev/null
+++ b/arch/riscv/include/asm/cpu_ops.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2020 Western Digital Corporation or its affiliates.
+ * Based on arch/arm64/include/asm/cpu_ops.h
+ */
+#ifndef __ASM_CPU_OPS_H
+#define __ASM_CPU_OPS_H
+
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/threads.h>
+
+/**
+ * struct cpu_operations - Callback operations for hotplugging CPUs.
+ *
+ * @name: Name of the boot protocol.
+ * @cpu_prepare: Early one-time preparation step for a cpu. If there
+ * is a mechanism for doing so, tests whether it is
+ * possible to boot the given HART.
+ * @cpu_start: Boots a cpu into the kernel.
+ * @cpu_disable: Prepares a cpu to die. May fail for some
+ * mechanism-specific reason, which will cause the hot
+ * unplug to be aborted. Called from the cpu to be killed.
+ * @cpu_stop: Makes a cpu leave the kernel. Must not fail. Called from
+ * the cpu being stopped.
+ * @cpu_is_stopped: Ensures a cpu has left the kernel. Called from another
+ * cpu.
+ */
+struct cpu_operations {
+ const char *name;
+ int (*cpu_prepare)(unsigned int cpu);
+ int (*cpu_start)(unsigned int cpu,
+ struct task_struct *tidle);
+#ifdef CONFIG_HOTPLUG_CPU
+ int (*cpu_disable)(unsigned int cpu);
+ void (*cpu_stop)(void);
+ int (*cpu_is_stopped)(unsigned int cpu);
+#endif
+};
+
+extern const struct cpu_operations *cpu_ops[NR_CPUS];
+void __init cpu_set_ops(int cpu);
+void cpu_update_secondary_bootdata(unsigned int cpuid,
+ struct task_struct *tidle);
+
+#endif /* ifndef __ASM_CPU_OPS_H */
diff --git a/arch/riscv/include/asm/current.h b/arch/riscv/include/asm/current.h
index dd973efe5d7c..1de233d8e8de 100644
--- a/arch/riscv/include/asm/current.h
+++ b/arch/riscv/include/asm/current.h
@@ -17,6 +17,8 @@
struct task_struct;
+register struct task_struct *riscv_current_is_tp __asm__("tp");
+
/*
* This only works because "struct thread_info" is at offset 0 from "struct
* task_struct". This constraint seems to be necessary on other architectures
@@ -26,8 +28,7 @@ struct task_struct;
*/
static __always_inline struct task_struct *get_current(void)
{
- register struct task_struct *tp __asm__("tp");
- return tp;
+ return riscv_current_is_tp;
}
#define current get_current()
diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h
index 42d2c42f3cc9..2368d49eb4ef 100644
--- a/arch/riscv/include/asm/fixmap.h
+++ b/arch/riscv/include/asm/fixmap.h
@@ -27,6 +27,8 @@ enum fixed_addresses {
FIX_FDT = FIX_FDT_END + FIX_FDT_SIZE / PAGE_SIZE - 1,
FIX_PTE,
FIX_PMD,
+ FIX_TEXT_POKE1,
+ FIX_TEXT_POKE0,
FIX_EARLYCON_MEM_BASE,
__end_of_fixed_addresses
};
diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h
index eee6e6588b12..b47045cb85ce 100644
--- a/arch/riscv/include/asm/kasan.h
+++ b/arch/riscv/include/asm/kasan.h
@@ -13,7 +13,7 @@
#define KASAN_SHADOW_SCALE_SHIFT 3
#define KASAN_SHADOW_SIZE (UL(1) << (38 - KASAN_SHADOW_SCALE_SHIFT))
-#define KASAN_SHADOW_START 0xffffffc000000000 /* 2^64 - 2^38 */
+#define KASAN_SHADOW_START KERN_VIRT_START /* 2^64 - 2^38 */
#define KASAN_SHADOW_END (KASAN_SHADOW_START + KASAN_SHADOW_SIZE)
#define KASAN_SHADOW_OFFSET (KASAN_SHADOW_END - (1ULL << \
diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
index 8ca1930caa44..2d50f76efe48 100644
--- a/arch/riscv/include/asm/page.h
+++ b/arch/riscv/include/asm/page.h
@@ -137,8 +137,7 @@ extern phys_addr_t __phys_addr_symbol(unsigned long x);
#define virt_addr_valid(vaddr) (pfn_valid(virt_to_pfn(vaddr)))
-#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | \
- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC
#include <asm-generic/memory_model.h>
#include <asm-generic/getorder.h>
diff --git a/arch/riscv/include/asm/patch.h b/arch/riscv/include/asm/patch.h
new file mode 100644
index 000000000000..b5918a6e0615
--- /dev/null
+++ b/arch/riscv/include/asm/patch.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2020 SiFive
+ */
+
+#ifndef _ASM_RISCV_PATCH_H
+#define _ASM_RISCV_PATCH_H
+
+int riscv_patch_text_nosync(void *addr, const void *insns, size_t len);
+int riscv_patch_text(void *addr, u32 insn);
+
+#endif /* _ASM_RISCV_PATCH_H */
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 393f2014dfee..9c188ad2e52d 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -449,6 +449,16 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
/*
+ * In the RV64 Linux scheme, we give the user half of the virtual-address space
+ * and give the kernel the other (upper) half.
+ */
+#ifdef CONFIG_64BIT
+#define KERN_VIRT_START (-(BIT(CONFIG_VA_BITS)) + TASK_SIZE)
+#else
+#define KERN_VIRT_START FIXADDR_START
+#endif
+
+/*
* Task size is 0x4000000000 for RV64 or 0x9fc00000 for RV32.
* Note that PGDIR_SIZE must evenly divide TASK_SIZE.
*/
diff --git a/arch/riscv/include/asm/ptdump.h b/arch/riscv/include/asm/ptdump.h
new file mode 100644
index 000000000000..e29af7191909
--- /dev/null
+++ b/arch/riscv/include/asm/ptdump.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2019 SiFive
+ */
+
+#ifndef _ASM_RISCV_PTDUMP_H
+#define _ASM_RISCV_PTDUMP_H
+
+void ptdump_check_wx(void);
+
+#endif /* _ASM_RISCV_PTDUMP_H */
diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h
index 2570c1e683d3..653edb25d495 100644
--- a/arch/riscv/include/asm/sbi.h
+++ b/arch/riscv/include/asm/sbi.h
@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2015 Regents of the University of California
+ * Copyright (c) 2020 Western Digital Corporation or its affiliates.
*/
#ifndef _ASM_RISCV_SBI_H
@@ -9,96 +10,148 @@
#include <linux/types.h>
#ifdef CONFIG_RISCV_SBI
-#define SBI_SET_TIMER 0
-#define SBI_CONSOLE_PUTCHAR 1
-#define SBI_CONSOLE_GETCHAR 2
-#define SBI_CLEAR_IPI 3
-#define SBI_SEND_IPI 4
-#define SBI_REMOTE_FENCE_I 5
-#define SBI_REMOTE_SFENCE_VMA 6
-#define SBI_REMOTE_SFENCE_VMA_ASID 7
-#define SBI_SHUTDOWN 8
-
-#define SBI_CALL(which, arg0, arg1, arg2, arg3) ({ \
- register uintptr_t a0 asm ("a0") = (uintptr_t)(arg0); \
- register uintptr_t a1 asm ("a1") = (uintptr_t)(arg1); \
- register uintptr_t a2 asm ("a2") = (uintptr_t)(arg2); \
- register uintptr_t a3 asm ("a3") = (uintptr_t)(arg3); \
- register uintptr_t a7 asm ("a7") = (uintptr_t)(which); \
- asm volatile ("ecall" \
- : "+r" (a0) \
- : "r" (a1), "r" (a2), "r" (a3), "r" (a7) \
- : "memory"); \
- a0; \
-})
-
-/* Lazy implementations until SBI is finalized */
-#define SBI_CALL_0(which) SBI_CALL(which, 0, 0, 0, 0)
-#define SBI_CALL_1(which, arg0) SBI_CALL(which, arg0, 0, 0, 0)
-#define SBI_CALL_2(which, arg0, arg1) SBI_CALL(which, arg0, arg1, 0, 0)
-#define SBI_CALL_3(which, arg0, arg1, arg2) \
- SBI_CALL(which, arg0, arg1, arg2, 0)
-#define SBI_CALL_4(which, arg0, arg1, arg2, arg3) \
- SBI_CALL(which, arg0, arg1, arg2, arg3)
-
-static inline void sbi_console_putchar(int ch)
-{
- SBI_CALL_1(SBI_CONSOLE_PUTCHAR, ch);
-}
+enum sbi_ext_id {
+#ifdef CONFIG_RISCV_SBI_V01
+ SBI_EXT_0_1_SET_TIMER = 0x0,
+ SBI_EXT_0_1_CONSOLE_PUTCHAR = 0x1,
+ SBI_EXT_0_1_CONSOLE_GETCHAR = 0x2,
+ SBI_EXT_0_1_CLEAR_IPI = 0x3,
+ SBI_EXT_0_1_SEND_IPI = 0x4,
+ SBI_EXT_0_1_REMOTE_FENCE_I = 0x5,
+ SBI_EXT_0_1_REMOTE_SFENCE_VMA = 0x6,
+ SBI_EXT_0_1_REMOTE_SFENCE_VMA_ASID = 0x7,
+ SBI_EXT_0_1_SHUTDOWN = 0x8,
+#endif
+ SBI_EXT_BASE = 0x10,
+ SBI_EXT_TIME = 0x54494D45,
+ SBI_EXT_IPI = 0x735049,
+ SBI_EXT_RFENCE = 0x52464E43,
+ SBI_EXT_HSM = 0x48534D,
+};
-static inline int sbi_console_getchar(void)
-{
- return SBI_CALL_0(SBI_CONSOLE_GETCHAR);
-}
+enum sbi_ext_base_fid {
+ SBI_EXT_BASE_GET_SPEC_VERSION = 0,
+ SBI_EXT_BASE_GET_IMP_ID,
+ SBI_EXT_BASE_GET_IMP_VERSION,
+ SBI_EXT_BASE_PROBE_EXT,
+ SBI_EXT_BASE_GET_MVENDORID,
+ SBI_EXT_BASE_GET_MARCHID,
+ SBI_EXT_BASE_GET_MIMPID,
+};
-static inline void sbi_set_timer(uint64_t stime_value)
-{
-#if __riscv_xlen == 32
- SBI_CALL_2(SBI_SET_TIMER, stime_value, stime_value >> 32);
-#else
- SBI_CALL_1(SBI_SET_TIMER, stime_value);
-#endif
-}
+enum sbi_ext_time_fid {
+ SBI_EXT_TIME_SET_TIMER = 0,
+};
-static inline void sbi_shutdown(void)
-{
- SBI_CALL_0(SBI_SHUTDOWN);
-}
+enum sbi_ext_ipi_fid {
+ SBI_EXT_IPI_SEND_IPI = 0,
+};
-static inline void sbi_clear_ipi(void)
-{
- SBI_CALL_0(SBI_CLEAR_IPI);
-}
+enum sbi_ext_rfence_fid {
+ SBI_EXT_RFENCE_REMOTE_FENCE_I = 0,
+ SBI_EXT_RFENCE_REMOTE_SFENCE_VMA,
+ SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID,
+ SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA,
+ SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA_VMID,
+ SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA,
+ SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA_ASID,
+};
-static inline void sbi_send_ipi(const unsigned long *hart_mask)
-{
- SBI_CALL_1(SBI_SEND_IPI, hart_mask);
-}
+enum sbi_ext_hsm_fid {
+ SBI_EXT_HSM_HART_START = 0,
+ SBI_EXT_HSM_HART_STOP,
+ SBI_EXT_HSM_HART_STATUS,
+};
+
+enum sbi_hsm_hart_status {
+ SBI_HSM_HART_STATUS_STARTED = 0,
+ SBI_HSM_HART_STATUS_STOPPED,
+ SBI_HSM_HART_STATUS_START_PENDING,
+ SBI_HSM_HART_STATUS_STOP_PENDING,
+};
+
+#define SBI_SPEC_VERSION_DEFAULT 0x1
+#define SBI_SPEC_VERSION_MAJOR_SHIFT 24
+#define SBI_SPEC_VERSION_MAJOR_MASK 0x7f
+#define SBI_SPEC_VERSION_MINOR_MASK 0xffffff
+
+/* SBI return error codes */
+#define SBI_SUCCESS 0
+#define SBI_ERR_FAILURE -1
+#define SBI_ERR_NOT_SUPPORTED -2
+#define SBI_ERR_INVALID_PARAM -3
+#define SBI_ERR_DENIED -4
+#define SBI_ERR_INVALID_ADDRESS -5
-static inline void sbi_remote_fence_i(const unsigned long *hart_mask)
+extern unsigned long sbi_spec_version;
+struct sbiret {
+ long error;
+ long value;
+};
+
+int sbi_init(void);
+struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0,
+ unsigned long arg1, unsigned long arg2,
+ unsigned long arg3, unsigned long arg4,
+ unsigned long arg5);
+
+void sbi_console_putchar(int ch);
+int sbi_console_getchar(void);
+void sbi_set_timer(uint64_t stime_value);
+void sbi_shutdown(void);
+void sbi_clear_ipi(void);
+void sbi_send_ipi(const unsigned long *hart_mask);
+void sbi_remote_fence_i(const unsigned long *hart_mask);
+void sbi_remote_sfence_vma(const unsigned long *hart_mask,
+ unsigned long start,
+ unsigned long size);
+
+void sbi_remote_sfence_vma_asid(const unsigned long *hart_mask,
+ unsigned long start,
+ unsigned long size,
+ unsigned long asid);
+int sbi_remote_hfence_gvma(const unsigned long *hart_mask,
+ unsigned long start,
+ unsigned long size);
+int sbi_remote_hfence_gvma_vmid(const unsigned long *hart_mask,
+ unsigned long start,
+ unsigned long size,
+ unsigned long vmid);
+int sbi_remote_hfence_vvma(const unsigned long *hart_mask,
+ unsigned long start,
+ unsigned long size);
+int sbi_remote_hfence_vvma_asid(const unsigned long *hart_mask,
+ unsigned long start,
+ unsigned long size,
+ unsigned long asid);
+int sbi_probe_extension(int ext);
+
+/* Check if current SBI specification version is 0.1 or not */
+static inline int sbi_spec_is_0_1(void)
{
- SBI_CALL_1(SBI_REMOTE_FENCE_I, hart_mask);
+ return (sbi_spec_version == SBI_SPEC_VERSION_DEFAULT) ? 1 : 0;
}
-static inline void sbi_remote_sfence_vma(const unsigned long *hart_mask,
- unsigned long start,
- unsigned long size)
+/* Get the major version of SBI */
+static inline unsigned long sbi_major_version(void)
{
- SBI_CALL_3(SBI_REMOTE_SFENCE_VMA, hart_mask, start, size);
+ return (sbi_spec_version >> SBI_SPEC_VERSION_MAJOR_SHIFT) &
+ SBI_SPEC_VERSION_MAJOR_MASK;
}
-static inline void sbi_remote_sfence_vma_asid(const unsigned long *hart_mask,
- unsigned long start,
- unsigned long size,
- unsigned long asid)
+/* Get the minor version of SBI */
+static inline unsigned long sbi_minor_version(void)
{
- SBI_CALL_4(SBI_REMOTE_SFENCE_VMA_ASID, hart_mask, start, size, asid);
+ return sbi_spec_version & SBI_SPEC_VERSION_MINOR_MASK;
}
+
+int sbi_err_map_linux_errno(int err);
#else /* CONFIG_RISCV_SBI */
/* stubs for code that is only reachable under IS_ENABLED(CONFIG_RISCV_SBI): */
void sbi_set_timer(uint64_t stime_value);
void sbi_clear_ipi(void);
void sbi_send_ipi(const unsigned long *hart_mask);
void sbi_remote_fence_i(const unsigned long *hart_mask);
+void sbi_init(void);
#endif /* CONFIG_RISCV_SBI */
#endif /* _ASM_RISCV_SBI_H */
diff --git a/arch/riscv/include/asm/set_memory.h b/arch/riscv/include/asm/set_memory.h
new file mode 100644
index 000000000000..c38df4771c09
--- /dev/null
+++ b/arch/riscv/include/asm/set_memory.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2019 SiFive
+ */
+
+#ifndef _ASM_RISCV_SET_MEMORY_H
+#define _ASM_RISCV_SET_MEMORY_H
+
+#ifndef __ASSEMBLY__
+/*
+ * Functions to change memory attributes.
+ */
+#ifdef CONFIG_MMU
+int set_memory_ro(unsigned long addr, int numpages);
+int set_memory_rw(unsigned long addr, int numpages);
+int set_memory_x(unsigned long addr, int numpages);
+int set_memory_nx(unsigned long addr, int numpages);
+#else
+static inline int set_memory_ro(unsigned long addr, int numpages) { return 0; }
+static inline int set_memory_rw(unsigned long addr, int numpages) { return 0; }
+static inline int set_memory_x(unsigned long addr, int numpages) { return 0; }
+static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; }
+#endif
+
+#ifdef CONFIG_STRICT_KERNEL_RWX
+void set_kernel_text_ro(void);
+void set_kernel_text_rw(void);
+#else
+static inline void set_kernel_text_ro(void) { }
+static inline void set_kernel_text_rw(void) { }
+#endif
+
+int set_direct_map_invalid_noflush(struct page *page);
+int set_direct_map_default_noflush(struct page *page);
+
+#endif /* __ASSEMBLY__ */
+
+#ifdef CONFIG_ARCH_HAS_STRICT_KERNEL_RWX
+#ifdef CONFIG_64BIT
+#define SECTION_ALIGN (1 << 21)
+#else
+#define SECTION_ALIGN (1 << 22)
+#endif
+#else /* !CONFIG_ARCH_HAS_STRICT_KERNEL_RWX */
+#define SECTION_ALIGN L1_CACHE_BYTES
+#endif /* CONFIG_ARCH_HAS_STRICT_KERNEL_RWX */
+
+#endif /* _ASM_RISCV_SET_MEMORY_H */
diff --git a/arch/riscv/include/asm/smp.h b/arch/riscv/include/asm/smp.h
index a83451d73a4e..f4c7cfda6b7f 100644
--- a/arch/riscv/include/asm/smp.h
+++ b/arch/riscv/include/asm/smp.h
@@ -43,6 +43,13 @@ void riscv_cpuid_to_hartid_mask(const struct cpumask *in, struct cpumask *out);
*/
#define raw_smp_processor_id() (current_thread_info()->cpu)
+#if defined CONFIG_HOTPLUG_CPU
+int __cpu_disable(void);
+void __cpu_die(unsigned int cpu);
+void cpu_stop(void);
+#else
+#endif /* CONFIG_HOTPLUG_CPU */
+
#else
static inline void show_ipi_stats(struct seq_file *p, int prec)
@@ -61,5 +68,22 @@ static inline unsigned long cpuid_to_hartid_map(int cpu)
return boot_cpu_hartid;
}
+static inline void riscv_cpuid_to_hartid_mask(const struct cpumask *in,
+ struct cpumask *out)
+{
+ cpumask_clear(out);
+ cpumask_set_cpu(boot_cpu_hartid, out);
+}
+
#endif /* CONFIG_SMP */
+
+#if defined(CONFIG_HOTPLUG_CPU) && (CONFIG_SMP)
+bool cpu_has_hotplug(unsigned int cpu);
+#else
+static inline bool cpu_has_hotplug(unsigned int cpu)
+{
+ return false;
+}
+#endif
+
#endif /* _ASM_RISCV_SMP_H */
diff --git a/arch/riscv/include/asm/soc.h b/arch/riscv/include/asm/soc.h
new file mode 100644
index 000000000000..7cec1968c8b4
--- /dev/null
+++ b/arch/riscv/include/asm/soc.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2020 Western Digital Corporation or its affiliates.
+ */
+
+#ifndef _ASM_RISCV_SOC_H
+#define _ASM_RISCV_SOC_H
+
+#include <linux/of.h>
+#include <linux/linkage.h>
+#include <linux/types.h>
+
+#define SOC_EARLY_INIT_DECLARE(name, compat, fn) \
+ static const struct of_device_id __soc_early_init__##name \
+ __used __section(__soc_early_init_table) \
+ = { .compatible = compat, .data = fn }
+
+void soc_early_init(void);
+
+extern unsigned long __soc_early_init_table_start;
+extern unsigned long __soc_early_init_table_end;
+
+#endif
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index f40205cb9a22..86c83081044f 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -4,12 +4,14 @@
#
ifdef CONFIG_FTRACE
-CFLAGS_REMOVE_ftrace.o = -pg
+CFLAGS_REMOVE_ftrace.o = -pg
+CFLAGS_REMOVE_patch.o = -pg
endif
extra-y += head.o
extra-y += vmlinux.lds
+obj-y += soc.o
obj-y += cpu.o
obj-y += cpufeature.o
obj-y += entry.o
@@ -26,12 +28,15 @@ obj-y += traps.o
obj-y += riscv_ksyms.o
obj-y += stacktrace.o
obj-y += cacheinfo.o
+obj-y += patch.o
obj-$(CONFIG_MMU) += vdso.o vdso/
-obj-$(CONFIG_RISCV_M_MODE) += clint.o
+obj-$(CONFIG_RISCV_M_MODE) += clint.o traps_misaligned.o
obj-$(CONFIG_FPU) += fpu.o
obj-$(CONFIG_SMP) += smpboot.o
obj-$(CONFIG_SMP) += smp.o
+obj-$(CONFIG_SMP) += cpu_ops.o
+obj-$(CONFIG_SMP) += cpu_ops_spinwait.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_MODULE_SECTIONS) += module-sections.o
@@ -42,5 +47,9 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o
obj-$(CONFIG_PERF_EVENTS) += perf_callchain.o
obj-$(CONFIG_HAVE_PERF_REGS) += perf_regs.o
obj-$(CONFIG_RISCV_SBI) += sbi.o
+ifeq ($(CONFIG_RISCV_SBI), y)
+obj-$(CONFIG_SMP) += cpu_ops_sbi.o
+endif
+obj-$(CONFIG_HOTPLUG_CPU) += cpu-hotplug.o
clean:
diff --git a/arch/riscv/kernel/cpu-hotplug.c b/arch/riscv/kernel/cpu-hotplug.c
new file mode 100644
index 000000000000..df84e0c13db1
--- /dev/null
+++ b/arch/riscv/kernel/cpu-hotplug.c
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2020 Western Digital Corporation or its affiliates.
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/err.h>
+#include <linux/irq.h>
+#include <linux/cpu.h>
+#include <linux/sched/hotplug.h>
+#include <asm/irq.h>
+#include <asm/cpu_ops.h>
+#include <asm/sbi.h>
+
+void cpu_stop(void);
+void arch_cpu_idle_dead(void)
+{
+ cpu_stop();
+}
+
+bool cpu_has_hotplug(unsigned int cpu)
+{
+ if (cpu_ops[cpu]->cpu_stop)
+ return true;
+
+ return false;
+}
+
+/*
+ * __cpu_disable runs on the processor to be shutdown.
+ */
+int __cpu_disable(void)
+{
+ int ret = 0;
+ unsigned int cpu = smp_processor_id();
+
+ if (!cpu_ops[cpu] || !cpu_ops[cpu]->cpu_stop)
+ return -EOPNOTSUPP;
+
+ if (cpu_ops[cpu]->cpu_disable)
+ ret = cpu_ops[cpu]->cpu_disable(cpu);
+
+ if (ret)
+ return ret;
+
+ remove_cpu_topology(cpu);
+ set_cpu_online(cpu, false);
+ irq_migrate_all_off_this_cpu();
+
+ return ret;
+}
+
+/*
+ * Called on the thread which is asking for a CPU to be shutdown.
+ */
+void __cpu_die(unsigned int cpu)
+{
+ int ret = 0;
+
+ if (!cpu_wait_death(cpu, 5)) {
+ pr_err("CPU %u: didn't die\n", cpu);
+ return;
+ }
+ pr_notice("CPU%u: off\n", cpu);
+
+ /* Verify from the firmware if the cpu is really stopped*/
+ if (cpu_ops[cpu]->cpu_is_stopped)
+ ret = cpu_ops[cpu]->cpu_is_stopped(cpu);
+ if (ret)
+ pr_warn("CPU%d may not have stopped: %d\n", cpu, ret);
+}
+
+/*
+ * Called from the idle thread for the CPU which has been shutdown.
+ */
+void cpu_stop(void)
+{
+ idle_task_exit();
+
+ (void)cpu_report_death();
+
+ cpu_ops[smp_processor_id()]->cpu_stop();
+ /* It should never reach here */
+ BUG();
+}
diff --git a/arch/riscv/kernel/cpu_ops.c b/arch/riscv/kernel/cpu_ops.c
new file mode 100644
index 000000000000..c4c33bf02369
--- /dev/null
+++ b/arch/riscv/kernel/cpu_ops.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2020 Western Digital Corporation or its affiliates.
+ */
+
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/of.h>
+#include <linux/string.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <asm/cpu_ops.h>
+#include <asm/sbi.h>
+#include <asm/smp.h>
+
+const struct cpu_operations *cpu_ops[NR_CPUS] __ro_after_init;
+
+void *__cpu_up_stack_pointer[NR_CPUS];
+void *__cpu_up_task_pointer[NR_CPUS];
+
+extern const struct cpu_operations cpu_ops_sbi;
+extern const struct cpu_operations cpu_ops_spinwait;
+
+void cpu_update_secondary_bootdata(unsigned int cpuid,
+ struct task_struct *tidle)
+{
+ int hartid = cpuid_to_hartid_map(cpuid);
+
+ /* Make sure tidle is updated */
+ smp_mb();
+ WRITE_ONCE(__cpu_up_stack_pointer[hartid],
+ task_stack_page(tidle) + THREAD_SIZE);
+ WRITE_ONCE(__cpu_up_task_pointer[hartid], tidle);
+}
+
+void __init cpu_set_ops(int cpuid)
+{
+#if IS_ENABLED(CONFIG_RISCV_SBI)
+ if (sbi_probe_extension(SBI_EXT_HSM) > 0) {
+ if (!cpuid)
+ pr_info("SBI v0.2 HSM extension detected\n");
+ cpu_ops[cpuid] = &cpu_ops_sbi;
+ } else
+#endif
+ cpu_ops[cpuid] = &cpu_ops_spinwait;
+}
diff --git a/arch/riscv/kernel/cpu_ops_sbi.c b/arch/riscv/kernel/cpu_ops_sbi.c
new file mode 100644
index 000000000000..685fae72b7f5
--- /dev/null
+++ b/arch/riscv/kernel/cpu_ops_sbi.c
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * HSM extension and cpu_ops implementation.
+ *
+ * Copyright (c) 2020 Western Digital Corporation or its affiliates.
+ */
+
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <asm/cpu_ops.h>
+#include <asm/sbi.h>
+#include <asm/smp.h>
+
+extern char secondary_start_sbi[];
+const struct cpu_operations cpu_ops_sbi;
+
+static int sbi_hsm_hart_start(unsigned long hartid, unsigned long saddr,
+ unsigned long priv)
+{
+ struct sbiret ret;
+
+ ret = sbi_ecall(SBI_EXT_HSM, SBI_EXT_HSM_HART_START,
+ hartid, saddr, priv, 0, 0, 0);
+ if (ret.error)
+ return sbi_err_map_linux_errno(ret.error);
+ else
+ return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int sbi_hsm_hart_stop(void)
+{
+ struct sbiret ret;
+
+ ret = sbi_ecall(SBI_EXT_HSM, SBI_EXT_HSM_HART_STOP, 0, 0, 0, 0, 0, 0);
+
+ if (ret.error)
+ return sbi_err_map_linux_errno(ret.error);
+ else
+ return 0;
+}
+
+static int sbi_hsm_hart_get_status(unsigned long hartid)
+{
+ struct sbiret ret;
+
+ ret = sbi_ecall(SBI_EXT_HSM, SBI_EXT_HSM_HART_STATUS,
+ hartid, 0, 0, 0, 0, 0);
+ if (ret.error)
+ return sbi_err_map_linux_errno(ret.error);
+ else
+ return ret.value;
+}
+#endif
+
+static int sbi_cpu_start(unsigned int cpuid, struct task_struct *tidle)
+{
+ int rc;
+ unsigned long boot_addr = __pa_symbol(secondary_start_sbi);
+ int hartid = cpuid_to_hartid_map(cpuid);
+
+ cpu_update_secondary_bootdata(cpuid, tidle);
+ rc = sbi_hsm_hart_start(hartid, boot_addr, 0);
+
+ return rc;
+}
+
+static int sbi_cpu_prepare(unsigned int cpuid)
+{
+ if (!cpu_ops_sbi.cpu_start) {
+ pr_err("cpu start method not defined for CPU [%d]\n", cpuid);
+ return -ENODEV;
+ }
+ return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int sbi_cpu_disable(unsigned int cpuid)
+{
+ if (!cpu_ops_sbi.cpu_stop)
+ return -EOPNOTSUPP;
+ return 0;
+}
+
+static void sbi_cpu_stop(void)
+{
+ int ret;
+
+ ret = sbi_hsm_hart_stop();
+ pr_crit("Unable to stop the cpu %u (%d)\n", smp_processor_id(), ret);
+}
+
+static int sbi_cpu_is_stopped(unsigned int cpuid)
+{
+ int rc;
+ int hartid = cpuid_to_hartid_map(cpuid);
+
+ rc = sbi_hsm_hart_get_status(hartid);
+
+ if (rc == SBI_HSM_HART_STATUS_STOPPED)
+ return 0;
+ return rc;
+}
+#endif
+
+const struct cpu_operations cpu_ops_sbi = {
+ .name = "sbi",
+ .cpu_prepare = sbi_cpu_prepare,
+ .cpu_start = sbi_cpu_start,
+#ifdef CONFIG_HOTPLUG_CPU
+ .cpu_disable = sbi_cpu_disable,
+ .cpu_stop = sbi_cpu_stop,
+ .cpu_is_stopped = sbi_cpu_is_stopped,
+#endif
+};
diff --git a/arch/riscv/kernel/cpu_ops_spinwait.c b/arch/riscv/kernel/cpu_ops_spinwait.c
new file mode 100644
index 000000000000..b2c957bb68c1
--- /dev/null
+++ b/arch/riscv/kernel/cpu_ops_spinwait.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2020 Western Digital Corporation or its affiliates.
+ */
+
+#include <linux/errno.h>
+#include <linux/of.h>
+#include <linux/string.h>
+#include <asm/cpu_ops.h>
+#include <asm/sbi.h>
+#include <asm/smp.h>
+
+const struct cpu_operations cpu_ops_spinwait;
+
+static int spinwait_cpu_prepare(unsigned int cpuid)
+{
+ if (!cpu_ops_spinwait.cpu_start) {
+ pr_err("cpu start method not defined for CPU [%d]\n", cpuid);
+ return -ENODEV;
+ }
+ return 0;
+}
+
+static int spinwait_cpu_start(unsigned int cpuid, struct task_struct *tidle)
+{
+ /*
+ * In this protocol, all cpus boot on their own accord. _start
+ * selects the first cpu to boot the kernel and causes the remainder
+ * of the cpus to spin in a loop waiting for their stack pointer to be
+ * setup by that main cpu. Writing to bootdata
+ * (i.e __cpu_up_stack_pointer) signals to the spinning cpus that they
+ * can continue the boot process.
+ */
+ cpu_update_secondary_bootdata(cpuid, tidle);
+
+ return 0;
+}
+
+const struct cpu_operations cpu_ops_spinwait = {
+ .name = "spinwait",
+ .cpu_prepare = spinwait_cpu_prepare,
+ .cpu_start = spinwait_cpu_start,
+};
diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index 208702d8c18e..56d071b2c0a1 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -13,17 +13,11 @@
#include <asm/thread_info.h>
#include <asm/asm-offsets.h>
- .text
- .altmacro
-
-/*
- * Prepares to enter a system call or exception by saving all registers to the
- * stack.
- */
- .macro SAVE_ALL
- LOCAL _restore_kernel_tpsp
- LOCAL _save_context
+#if !IS_ENABLED(CONFIG_PREEMPTION)
+.set resume_kernel, restore_all
+#endif
+ENTRY(handle_exception)
/*
* If coming from userspace, preserve the user thread pointer and load
* the kernel thread pointer. If we came from the kernel, the scratch
@@ -90,77 +84,6 @@ _save_context:
REG_S s3, PT_BADADDR(sp)
REG_S s4, PT_CAUSE(sp)
REG_S s5, PT_TP(sp)
- .endm
-
-/*
- * Prepares to return from a system call or exception by restoring all
- * registers from the stack.
- */
- .macro RESTORE_ALL
- REG_L a0, PT_STATUS(sp)
- /*
- * The current load reservation is effectively part of the processor's
- * state, in the sense that load reservations cannot be shared between
- * different hart contexts. We can't actually save and restore a load
- * reservation, so instead here we clear any existing reservation --
- * it's always legal for implementations to clear load reservations at
- * any point (as long as the forward progress guarantee is kept, but
- * we'll ignore that here).
- *
- * Dangling load reservations can be the result of taking a trap in the
- * middle of an LR/SC sequence, but can also be the result of a taken
- * forward branch around an SC -- which is how we implement CAS. As a
- * result we need to clear reservations between the last CAS and the
- * jump back to the new context. While it is unlikely the store
- * completes, implementations are allowed to expand reservations to be
- * arbitrarily large.
- */
- REG_L a2, PT_EPC(sp)
- REG_SC x0, a2, PT_EPC(sp)
-
- csrw CSR_STATUS, a0
- csrw CSR_EPC, a2
-
- REG_L x1, PT_RA(sp)
- REG_L x3, PT_GP(sp)
- REG_L x4, PT_TP(sp)
- REG_L x5, PT_T0(sp)
- REG_L x6, PT_T1(sp)
- REG_L x7, PT_T2(sp)
- REG_L x8, PT_S0(sp)
- REG_L x9, PT_S1(sp)
- REG_L x10, PT_A0(sp)
- REG_L x11, PT_A1(sp)
- REG_L x12, PT_A2(sp)
- REG_L x13, PT_A3(sp)
- REG_L x14, PT_A4(sp)
- REG_L x15, PT_A5(sp)
- REG_L x16, PT_A6(sp)
- REG_L x17, PT_A7(sp)
- REG_L x18, PT_S2(sp)
- REG_L x19, PT_S3(sp)
- REG_L x20, PT_S4(sp)
- REG_L x21, PT_S5(sp)
- REG_L x22, PT_S6(sp)
- REG_L x23, PT_S7(sp)
- REG_L x24, PT_S8(sp)
- REG_L x25, PT_S9(sp)
- REG_L x26, PT_S10(sp)
- REG_L x27, PT_S11(sp)
- REG_L x28, PT_T3(sp)
- REG_L x29, PT_T4(sp)
- REG_L x30, PT_T5(sp)
- REG_L x31, PT_T6(sp)
-
- REG_L x2, PT_SP(sp)
- .endm
-
-#if !IS_ENABLED(CONFIG_PREEMPTION)
-.set resume_kernel, restore_all
-#endif
-
-ENTRY(handle_exception)
- SAVE_ALL
/*
* Set the scratch register to 0, so that if a recursive exception
@@ -291,7 +214,63 @@ resume_userspace:
csrw CSR_SCRATCH, tp
restore_all:
- RESTORE_ALL
+ REG_L a0, PT_STATUS(sp)
+ /*
+ * The current load reservation is effectively part of the processor's
+ * state, in the sense that load reservations cannot be shared between
+ * different hart contexts. We can't actually save and restore a load
+ * reservation, so instead here we clear any existing reservation --
+ * it's always legal for implementations to clear load reservations at
+ * any point (as long as the forward progress guarantee is kept, but
+ * we'll ignore that here).
+ *
+ * Dangling load reservations can be the result of taking a trap in the
+ * middle of an LR/SC sequence, but can also be the result of a taken
+ * forward branch around an SC -- which is how we implement CAS. As a
+ * result we need to clear reservations between the last CAS and the
+ * jump back to the new context. While it is unlikely the store
+ * completes, implementations are allowed to expand reservations to be
+ * arbitrarily large.
+ */
+ REG_L a2, PT_EPC(sp)
+ REG_SC x0, a2, PT_EPC(sp)
+
+ csrw CSR_STATUS, a0
+ csrw CSR_EPC, a2
+
+ REG_L x1, PT_RA(sp)
+ REG_L x3, PT_GP(sp)
+ REG_L x4, PT_TP(sp)
+ REG_L x5, PT_T0(sp)
+ REG_L x6, PT_T1(sp)
+ REG_L x7, PT_T2(sp)
+ REG_L x8, PT_S0(sp)
+ REG_L x9, PT_S1(sp)
+ REG_L x10, PT_A0(sp)
+ REG_L x11, PT_A1(sp)
+ REG_L x12, PT_A2(sp)
+ REG_L x13, PT_A3(sp)
+ REG_L x14, PT_A4(sp)
+ REG_L x15, PT_A5(sp)
+ REG_L x16, PT_A6(sp)
+ REG_L x17, PT_A7(sp)
+ REG_L x18, PT_S2(sp)
+ REG_L x19, PT_S3(sp)
+ REG_L x20, PT_S4(sp)
+ REG_L x21, PT_S5(sp)
+ REG_L x22, PT_S6(sp)
+ REG_L x23, PT_S7(sp)
+ REG_L x24, PT_S8(sp)
+ REG_L x25, PT_S9(sp)
+ REG_L x26, PT_S10(sp)
+ REG_L x27, PT_S11(sp)
+ REG_L x28, PT_T3(sp)
+ REG_L x29, PT_T4(sp)
+ REG_L x30, PT_T5(sp)
+ REG_L x31, PT_T6(sp)
+
+ REG_L x2, PT_SP(sp)
+
#ifdef CONFIG_RISCV_M_MODE
mret
#else
diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c
index c40fdcdeb950..ce69b34ff55d 100644
--- a/arch/riscv/kernel/ftrace.c
+++ b/arch/riscv/kernel/ftrace.c
@@ -8,6 +8,7 @@
#include <linux/ftrace.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
+#include <asm/patch.h>
#ifdef CONFIG_DYNAMIC_FTRACE
static int ftrace_check_current_call(unsigned long hook_pos,
@@ -46,20 +47,14 @@ static int __ftrace_modify_call(unsigned long hook_pos, unsigned long target,
{
unsigned int call[2];
unsigned int nops[2] = {NOP4, NOP4};
- int ret = 0;
make_call(hook_pos, target, call);
- /* replace the auipc-jalr pair at once */
- ret = probe_kernel_write((void *)hook_pos, enable ? call : nops,
- MCOUNT_INSN_SIZE);
- /* return must be -EPERM on write error */
- if (ret)
+ /* Replace the auipc-jalr pair at once. Return -EPERM on write error. */
+ if (riscv_patch_text_nosync
+ ((void *)hook_pos, enable ? call : nops, MCOUNT_INSN_SIZE))
return -EPERM;
- smp_mb();
- flush_icache_range((void *)hook_pos, (void *)hook_pos + MCOUNT_INSN_SIZE);
-
return 0;
}
diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
index 85f2073e7fe4..98a406474e7d 100644
--- a/arch/riscv/kernel/head.S
+++ b/arch/riscv/kernel/head.S
@@ -14,7 +14,7 @@
#include <asm/hwcap.h>
#include <asm/image.h>
-__INIT
+__HEAD
ENTRY(_start)
/*
* Image header expected by Linux boot-loaders. The image header data
@@ -45,8 +45,111 @@ ENTRY(_start)
.ascii RISCV_IMAGE_MAGIC2
.word 0
-.global _start_kernel
-_start_kernel:
+.align 2
+#ifdef CONFIG_MMU
+relocate:
+ /* Relocate return address */
+ li a1, PAGE_OFFSET
+ la a2, _start
+ sub a1, a1, a2
+ add ra, ra, a1
+
+ /* Point stvec to virtual address of intruction after satp write */
+ la a2, 1f
+ add a2, a2, a1
+ csrw CSR_TVEC, a2
+
+ /* Compute satp for kernel page tables, but don't load it yet */
+ srl a2, a0, PAGE_SHIFT
+ li a1, SATP_MODE
+ or a2, a2, a1
+
+ /*
+ * Load trampoline page directory, which will cause us to trap to
+ * stvec if VA != PA, or simply fall through if VA == PA. We need a
+ * full fence here because setup_vm() just wrote these PTEs and we need
+ * to ensure the new translations are in use.
+ */
+ la a0, trampoline_pg_dir
+ srl a0, a0, PAGE_SHIFT
+ or a0, a0, a1
+ sfence.vma
+ csrw CSR_SATP, a0
+.align 2
+1:
+ /* Set trap vector to spin forever to help debug */
+ la a0, .Lsecondary_park
+ csrw CSR_TVEC, a0
+
+ /* Reload the global pointer */
+.option push
+.option norelax
+ la gp, __global_pointer$
+.option pop
+
+ /*
+ * Switch to kernel page tables. A full fence is necessary in order to
+ * avoid using the trampoline translations, which are only correct for
+ * the first superpage. Fetching the fence is guarnteed to work
+ * because that first superpage is translated the same way.
+ */
+ csrw CSR_SATP, a2
+ sfence.vma
+
+ ret
+#endif /* CONFIG_MMU */
+#ifdef CONFIG_SMP
+ .global secondary_start_sbi
+secondary_start_sbi:
+ /* Mask all interrupts */
+ csrw CSR_IE, zero
+ csrw CSR_IP, zero
+
+ /* Load the global pointer */
+ .option push
+ .option norelax
+ la gp, __global_pointer$
+ .option pop
+
+ /*
+ * Disable FPU to detect illegal usage of
+ * floating point in kernel space
+ */
+ li t0, SR_FS
+ csrc CSR_STATUS, t0
+
+ /* Set trap vector to spin forever to help debug */
+ la a3, .Lsecondary_park
+ csrw CSR_TVEC, a3
+
+ slli a3, a0, LGREG
+ la a4, __cpu_up_stack_pointer
+ la a5, __cpu_up_task_pointer
+ add a4, a3, a4
+ add a5, a3, a5
+ REG_L sp, (a4)
+ REG_L tp, (a5)
+
+ .global secondary_start_common
+secondary_start_common:
+
+#ifdef CONFIG_MMU
+ /* Enable virtual memory and relocate to virtual address */
+ la a0, swapper_pg_dir
+ call relocate
+#endif
+ tail smp_callin
+#endif /* CONFIG_SMP */
+
+.Lsecondary_park:
+ /* We lack SMP support or have too many harts, so park this hart */
+ wfi
+ j .Lsecondary_park
+
+END(_start)
+
+ __INIT
+ENTRY(_start_kernel)
/* Mask all interrupts */
csrw CSR_IE, zero
csrw CSR_IP, zero
@@ -131,62 +234,10 @@ clear_bss_done:
call kasan_early_init
#endif
/* Start the kernel */
+ call soc_early_init
call parse_dtb
tail start_kernel
-#ifdef CONFIG_MMU
-relocate:
- /* Relocate return address */
- li a1, PAGE_OFFSET
- la a2, _start
- sub a1, a1, a2
- add ra, ra, a1
-
- /* Point stvec to virtual address of intruction after satp write */
- la a2, 1f
- add a2, a2, a1
- csrw CSR_TVEC, a2
-
- /* Compute satp for kernel page tables, but don't load it yet */
- srl a2, a0, PAGE_SHIFT
- li a1, SATP_MODE
- or a2, a2, a1
-
- /*
- * Load trampoline page directory, which will cause us to trap to
- * stvec if VA != PA, or simply fall through if VA == PA. We need a
- * full fence here because setup_vm() just wrote these PTEs and we need
- * to ensure the new translations are in use.
- */
- la a0, trampoline_pg_dir
- srl a0, a0, PAGE_SHIFT
- or a0, a0, a1
- sfence.vma
- csrw CSR_SATP, a0
-.align 2
-1:
- /* Set trap vector to spin forever to help debug */
- la a0, .Lsecondary_park
- csrw CSR_TVEC, a0
-
- /* Reload the global pointer */
-.option push
-.option norelax
- la gp, __global_pointer$
-.option pop
-
- /*
- * Switch to kernel page tables. A full fence is necessary in order to
- * avoid using the trampoline translations, which are only correct for
- * the first superpage. Fetching the fence is guarnteed to work
- * because that first superpage is translated the same way.
- */
- csrw CSR_SATP, a2
- sfence.vma
-
- ret
-#endif /* CONFIG_MMU */
-
.Lsecondary_start:
#ifdef CONFIG_SMP
/* Set trap vector to spin forever to help debug */
@@ -211,16 +262,10 @@ relocate:
beqz tp, .Lwait_for_cpu_up
fence
-#ifdef CONFIG_MMU
- /* Enable virtual memory and relocate to virtual address */
- la a0, swapper_pg_dir
- call relocate
+ tail secondary_start_common
#endif
- tail smp_callin
-#endif
-
-END(_start)
+END(_start_kernel)
#ifdef CONFIG_RISCV_M_MODE
ENTRY(reset_regs)
@@ -301,13 +346,6 @@ ENTRY(reset_regs)
END(reset_regs)
#endif /* CONFIG_RISCV_M_MODE */
-.section ".text", "ax",@progbits
-.align 2
-.Lsecondary_park:
- /* We lack SMP support or have too many harts, so park this hart */
- wfi
- j .Lsecondary_park
-
__PAGE_ALIGNED_BSS
/* Empty zero page */
.balign PAGE_SIZE
diff --git a/arch/riscv/kernel/patch.c b/arch/riscv/kernel/patch.c
new file mode 100644
index 000000000000..8a4fc65ee022
--- /dev/null
+++ b/arch/riscv/kernel/patch.c
@@ -0,0 +1,120 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 SiFive
+ */
+
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+#include <linux/stop_machine.h>
+#include <asm/kprobes.h>
+#include <asm/cacheflush.h>
+#include <asm/fixmap.h>
+
+struct riscv_insn_patch {
+ void *addr;
+ u32 insn;
+ atomic_t cpu_count;
+};
+
+#ifdef CONFIG_MMU
+static DEFINE_RAW_SPINLOCK(patch_lock);
+
+static void __kprobes *patch_map(void *addr, int fixmap)
+{
+ uintptr_t uintaddr = (uintptr_t) addr;
+ struct page *page;
+
+ if (core_kernel_text(uintaddr))
+ page = phys_to_page(__pa_symbol(addr));
+ else if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+ page = vmalloc_to_page(addr);
+ else
+ return addr;
+
+ BUG_ON(!page);
+
+ return (void *)set_fixmap_offset(fixmap, page_to_phys(page) +
+ (uintaddr & ~PAGE_MASK));
+}
+
+static void __kprobes patch_unmap(int fixmap)
+{
+ clear_fixmap(fixmap);
+}
+
+static int __kprobes riscv_insn_write(void *addr, const void *insn, size_t len)
+{
+ void *waddr = addr;
+ bool across_pages = (((uintptr_t) addr & ~PAGE_MASK) + len) > PAGE_SIZE;
+ unsigned long flags = 0;
+ int ret;
+
+ raw_spin_lock_irqsave(&patch_lock, flags);
+
+ if (across_pages)
+ patch_map(addr + len, FIX_TEXT_POKE1);
+
+ waddr = patch_map(addr, FIX_TEXT_POKE0);
+
+ ret = probe_kernel_write(waddr, insn, len);
+
+ patch_unmap(FIX_TEXT_POKE0);
+
+ if (across_pages)
+ patch_unmap(FIX_TEXT_POKE1);
+
+ raw_spin_unlock_irqrestore(&patch_lock, flags);
+
+ return ret;
+}
+#else
+static int __kprobes riscv_insn_write(void *addr, const void *insn, size_t len)
+{
+ return probe_kernel_write(addr, insn, len);
+}
+#endif /* CONFIG_MMU */
+
+int __kprobes riscv_patch_text_nosync(void *addr, const void *insns, size_t len)
+{
+ u32 *tp = addr;
+ int ret;
+
+ ret = riscv_insn_write(tp, insns, len);
+
+ if (!ret)
+ flush_icache_range((uintptr_t) tp, (uintptr_t) tp + len);
+
+ return ret;
+}
+
+static int __kprobes riscv_patch_text_cb(void *data)
+{
+ struct riscv_insn_patch *patch = data;
+ int ret = 0;
+
+ if (atomic_inc_return(&patch->cpu_count) == 1) {
+ ret =
+ riscv_patch_text_nosync(patch->addr, &patch->insn,
+ GET_INSN_LENGTH(patch->insn));
+ atomic_inc(&patch->cpu_count);
+ } else {
+ while (atomic_read(&patch->cpu_count) <= num_online_cpus())
+ cpu_relax();
+ smp_mb();
+ }
+
+ return ret;
+}
+
+int __kprobes riscv_patch_text(void *addr, u32 insn)
+{
+ struct riscv_insn_patch patch = {
+ .addr = addr,
+ .insn = insn,
+ .cpu_count = ATOMIC_INIT(0),
+ };
+
+ return stop_machine_cpuslocked(riscv_patch_text_cb,
+ &patch, cpu_online_mask);
+}
diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c
index 817cf7b0974c..610c11e91606 100644
--- a/arch/riscv/kernel/process.c
+++ b/arch/riscv/kernel/process.c
@@ -22,6 +22,8 @@
#include <asm/switch_to.h>
#include <asm/thread_info.h>
+unsigned long gp_in_global __asm__("gp");
+
extern asmlinkage void ret_from_fork(void);
extern asmlinkage void ret_from_kernel_thread(void);
@@ -107,9 +109,8 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long usp,
/* p->thread holds context to be restored by __switch_to() */
if (unlikely(p->flags & PF_KTHREAD)) {
/* Kernel thread */
- const register unsigned long gp __asm__ ("gp");
memset(childregs, 0, sizeof(struct pt_regs));
- childregs->gp = gp;
+ childregs->gp = gp_in_global;
/* Supervisor/Machine, irqs on: */
childregs->status = SR_PP | SR_PIE;
diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c
index f6c7c3e82d28..7c24da59bccf 100644
--- a/arch/riscv/kernel/sbi.c
+++ b/arch/riscv/kernel/sbi.c
@@ -1,17 +1,588 @@
// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * SBI initialilization and all extension implementation.
+ *
+ * Copyright (c) 2020 Western Digital Corporation or its affiliates.
+ */
#include <linux/init.h>
#include <linux/pm.h>
#include <asm/sbi.h>
+#include <asm/smp.h>
+
+/* default SBI version is 0.1 */
+unsigned long sbi_spec_version = SBI_SPEC_VERSION_DEFAULT;
+EXPORT_SYMBOL(sbi_spec_version);
+
+static void (*__sbi_set_timer)(uint64_t stime);
+static int (*__sbi_send_ipi)(const unsigned long *hart_mask);
+static int (*__sbi_rfence)(int fid, const unsigned long *hart_mask,
+ unsigned long start, unsigned long size,
+ unsigned long arg4, unsigned long arg5);
+
+struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0,
+ unsigned long arg1, unsigned long arg2,
+ unsigned long arg3, unsigned long arg4,
+ unsigned long arg5)
+{
+ struct sbiret ret;
+
+ register uintptr_t a0 asm ("a0") = (uintptr_t)(arg0);
+ register uintptr_t a1 asm ("a1") = (uintptr_t)(arg1);
+ register uintptr_t a2 asm ("a2") = (uintptr_t)(arg2);
+ register uintptr_t a3 asm ("a3") = (uintptr_t)(arg3);
+ register uintptr_t a4 asm ("a4") = (uintptr_t)(arg4);
+ register uintptr_t a5 asm ("a5") = (uintptr_t)(arg5);
+ register uintptr_t a6 asm ("a6") = (uintptr_t)(fid);
+ register uintptr_t a7 asm ("a7") = (uintptr_t)(ext);
+ asm volatile ("ecall"
+ : "+r" (a0), "+r" (a1)
+ : "r" (a2), "r" (a3), "r" (a4), "r" (a5), "r" (a6), "r" (a7)
+ : "memory");
+ ret.error = a0;
+ ret.value = a1;
+
+ return ret;
+}
+EXPORT_SYMBOL(sbi_ecall);
+
+int sbi_err_map_linux_errno(int err)
+{
+ switch (err) {
+ case SBI_SUCCESS:
+ return 0;
+ case SBI_ERR_DENIED:
+ return -EPERM;
+ case SBI_ERR_INVALID_PARAM:
+ return -EINVAL;
+ case SBI_ERR_INVALID_ADDRESS:
+ return -EFAULT;
+ case SBI_ERR_NOT_SUPPORTED:
+ case SBI_ERR_FAILURE:
+ default:
+ return -ENOTSUPP;
+ };
+}
+EXPORT_SYMBOL(sbi_err_map_linux_errno);
+
+#ifdef CONFIG_RISCV_SBI_V01
+/**
+ * sbi_console_putchar() - Writes given character to the console device.
+ * @ch: The data to be written to the console.
+ *
+ * Return: None
+ */
+void sbi_console_putchar(int ch)
+{
+ sbi_ecall(SBI_EXT_0_1_CONSOLE_PUTCHAR, 0, ch, 0, 0, 0, 0, 0);
+}
+EXPORT_SYMBOL(sbi_console_putchar);
+
+/**
+ * sbi_console_getchar() - Reads a byte from console device.
+ *
+ * Returns the value read from console.
+ */
+int sbi_console_getchar(void)
+{
+ struct sbiret ret;
+
+ ret = sbi_ecall(SBI_EXT_0_1_CONSOLE_GETCHAR, 0, 0, 0, 0, 0, 0, 0);
+
+ return ret.error;
+}
+EXPORT_SYMBOL(sbi_console_getchar);
+
+/**
+ * sbi_shutdown() - Remove all the harts from executing supervisor code.
+ *
+ * Return: None
+ */
+void sbi_shutdown(void)
+{
+ sbi_ecall(SBI_EXT_0_1_SHUTDOWN, 0, 0, 0, 0, 0, 0, 0);
+}
+EXPORT_SYMBOL(sbi_set_timer);
+
+/**
+ * sbi_clear_ipi() - Clear any pending IPIs for the calling hart.
+ *
+ * Return: None
+ */
+void sbi_clear_ipi(void)
+{
+ sbi_ecall(SBI_EXT_0_1_CLEAR_IPI, 0, 0, 0, 0, 0, 0, 0);
+}
+EXPORT_SYMBOL(sbi_shutdown);
+
+/**
+ * sbi_set_timer_v01() - Program the timer for next timer event.
+ * @stime_value: The value after which next timer event should fire.
+ *
+ * Return: None
+ */
+static void __sbi_set_timer_v01(uint64_t stime_value)
+{
+#if __riscv_xlen == 32
+ sbi_ecall(SBI_EXT_0_1_SET_TIMER, 0, stime_value,
+ stime_value >> 32, 0, 0, 0, 0);
+#else
+ sbi_ecall(SBI_EXT_0_1_SET_TIMER, 0, stime_value, 0, 0, 0, 0, 0);
+#endif
+}
+
+static int __sbi_send_ipi_v01(const unsigned long *hart_mask)
+{
+ sbi_ecall(SBI_EXT_0_1_SEND_IPI, 0, (unsigned long)hart_mask,
+ 0, 0, 0, 0, 0);
+ return 0;
+}
+
+static int __sbi_rfence_v01(int fid, const unsigned long *hart_mask,
+ unsigned long start, unsigned long size,
+ unsigned long arg4, unsigned long arg5)
+{
+ int result = 0;
+
+ /* v0.2 function IDs are equivalent to v0.1 extension IDs */
+ switch (fid) {
+ case SBI_EXT_RFENCE_REMOTE_FENCE_I:
+ sbi_ecall(SBI_EXT_0_1_REMOTE_FENCE_I, 0,
+ (unsigned long)hart_mask, 0, 0, 0, 0, 0);
+ break;
+ case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA:
+ sbi_ecall(SBI_EXT_0_1_REMOTE_SFENCE_VMA, 0,
+ (unsigned long)hart_mask, start, size,
+ 0, 0, 0);
+ break;
+ case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID:
+ sbi_ecall(SBI_EXT_0_1_REMOTE_SFENCE_VMA_ASID, 0,
+ (unsigned long)hart_mask, start, size,
+ arg4, 0, 0);
+ break;
+ default:
+ pr_err("SBI call [%d]not supported in SBI v0.1\n", fid);
+ result = -EINVAL;
+ }
+
+ return result;
+}
+#else
+static void __sbi_set_timer_v01(uint64_t stime_value)
+{
+ pr_warn("Timer extension is not available in SBI v%lu.%lu\n",
+ sbi_major_version(), sbi_minor_version());
+}
+
+static int __sbi_send_ipi_v01(const unsigned long *hart_mask)
+{
+ pr_warn("IPI extension is not available in SBI v%lu.%lu\n",
+ sbi_major_version(), sbi_minor_version());
+
+ return 0;
+}
+
+static int __sbi_rfence_v01(int fid, const unsigned long *hart_mask,
+ unsigned long start, unsigned long size,
+ unsigned long arg4, unsigned long arg5)
+{
+ pr_warn("remote fence extension is not available in SBI v%lu.%lu\n",
+ sbi_major_version(), sbi_minor_version());
+
+ return 0;
+}
+#endif /* CONFIG_RISCV_SBI_V01 */
+
+static void __sbi_set_timer_v02(uint64_t stime_value)
+{
+#if __riscv_xlen == 32
+ sbi_ecall(SBI_EXT_TIME, SBI_EXT_TIME_SET_TIMER, stime_value,
+ stime_value >> 32, 0, 0, 0, 0);
+#else
+ sbi_ecall(SBI_EXT_TIME, SBI_EXT_TIME_SET_TIMER, stime_value, 0,
+ 0, 0, 0, 0);
+#endif
+}
+
+static int __sbi_send_ipi_v02(const unsigned long *hart_mask)
+{
+ unsigned long hartid, hmask_val, hbase;
+ struct cpumask tmask;
+ struct sbiret ret = {0};
+ int result;
+
+ if (!hart_mask || !(*hart_mask)) {
+ riscv_cpuid_to_hartid_mask(cpu_online_mask, &tmask);
+ hart_mask = cpumask_bits(&tmask);
+ }
+
+ hmask_val = 0;
+ hbase = 0;
+ for_each_set_bit(hartid, hart_mask, NR_CPUS) {
+ if (hmask_val && ((hbase + BITS_PER_LONG) <= hartid)) {
+ ret = sbi_ecall(SBI_EXT_IPI, SBI_EXT_IPI_SEND_IPI,
+ hmask_val, hbase, 0, 0, 0, 0);
+ if (ret.error)
+ goto ecall_failed;
+ hmask_val = 0;
+ hbase = 0;
+ }
+ if (!hmask_val)
+ hbase = hartid;
+ hmask_val |= 1UL << (hartid - hbase);
+ }
+
+ if (hmask_val) {
+ ret = sbi_ecall(SBI_EXT_IPI, SBI_EXT_IPI_SEND_IPI,
+ hmask_val, hbase, 0, 0, 0, 0);
+ if (ret.error)
+ goto ecall_failed;
+ }
+
+ return 0;
+
+ecall_failed:
+ result = sbi_err_map_linux_errno(ret.error);
+ pr_err("%s: hbase = [%lu] hmask = [0x%lx] failed (error [%d])\n",
+ __func__, hbase, hmask_val, result);
+ return result;
+}
+
+static int __sbi_rfence_v02_call(unsigned long fid, unsigned long hmask_val,
+ unsigned long hbase, unsigned long start,
+ unsigned long size, unsigned long arg4,
+ unsigned long arg5)
+{
+ struct sbiret ret = {0};
+ int ext = SBI_EXT_RFENCE;
+ int result = 0;
+
+ switch (fid) {
+ case SBI_EXT_RFENCE_REMOTE_FENCE_I:
+ ret = sbi_ecall(ext, fid, hmask_val, hbase, 0, 0, 0, 0);
+ break;
+ case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA:
+ ret = sbi_ecall(ext, fid, hmask_val, hbase, start,
+ size, 0, 0);
+ break;
+ case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID:
+ ret = sbi_ecall(ext, fid, hmask_val, hbase, start,
+ size, arg4, 0);
+ break;
+
+ case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA:
+ ret = sbi_ecall(ext, fid, hmask_val, hbase, start,
+ size, 0, 0);
+ break;
+ case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA_VMID:
+ ret = sbi_ecall(ext, fid, hmask_val, hbase, start,
+ size, arg4, 0);
+ break;
+ case SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA:
+ ret = sbi_ecall(ext, fid, hmask_val, hbase, start,
+ size, 0, 0);
+ break;
+ case SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA_ASID:
+ ret = sbi_ecall(ext, fid, hmask_val, hbase, start,
+ size, arg4, 0);
+ break;
+ default:
+ pr_err("unknown function ID [%lu] for SBI extension [%d]\n",
+ fid, ext);
+ result = -EINVAL;
+ }
+
+ if (ret.error) {
+ result = sbi_err_map_linux_errno(ret.error);
+ pr_err("%s: hbase = [%lu] hmask = [0x%lx] failed (error [%d])\n",
+ __func__, hbase, hmask_val, result);
+ }
+
+ return result;
+}
+
+static int __sbi_rfence_v02(int fid, const unsigned long *hart_mask,
+ unsigned long start, unsigned long size,
+ unsigned long arg4, unsigned long arg5)
+{
+ unsigned long hmask_val, hartid, hbase;
+ struct cpumask tmask;
+ int result;
+
+ if (!hart_mask || !(*hart_mask)) {
+ riscv_cpuid_to_hartid_mask(cpu_online_mask, &tmask);
+ hart_mask = cpumask_bits(&tmask);
+ }
+
+ hmask_val = 0;
+ hbase = 0;
+ for_each_set_bit(hartid, hart_mask, NR_CPUS) {
+ if (hmask_val && ((hbase + BITS_PER_LONG) <= hartid)) {
+ result = __sbi_rfence_v02_call(fid, hmask_val, hbase,
+ start, size, arg4, arg5);
+ if (result)
+ return result;
+ hmask_val = 0;
+ hbase = 0;
+ }
+ if (!hmask_val)
+ hbase = hartid;
+ hmask_val |= 1UL << (hartid - hbase);
+ }
+
+ if (hmask_val) {
+ result = __sbi_rfence_v02_call(fid, hmask_val, hbase,
+ start, size, arg4, arg5);
+ if (result)
+ return result;
+ }
+
+ return 0;
+}
+
+/**
+ * sbi_set_timer() - Program the timer for next timer event.
+ * @stime_value: The value after which next timer event should fire.
+ *
+ * Return: None
+ */
+void sbi_set_timer(uint64_t stime_value)
+{
+ __sbi_set_timer(stime_value);
+}
+
+/**
+ * sbi_send_ipi() - Send an IPI to any hart.
+ * @hart_mask: A cpu mask containing all the target harts.
+ *
+ * Return: None
+ */
+void sbi_send_ipi(const unsigned long *hart_mask)
+{
+ __sbi_send_ipi(hart_mask);
+}
+EXPORT_SYMBOL(sbi_send_ipi);
+
+/**
+ * sbi_remote_fence_i() - Execute FENCE.I instruction on given remote harts.
+ * @hart_mask: A cpu mask containing all the target harts.
+ *
+ * Return: None
+ */
+void sbi_remote_fence_i(const unsigned long *hart_mask)
+{
+ __sbi_rfence(SBI_EXT_RFENCE_REMOTE_FENCE_I,
+ hart_mask, 0, 0, 0, 0);
+}
+EXPORT_SYMBOL(sbi_remote_fence_i);
+
+/**
+ * sbi_remote_sfence_vma() - Execute SFENCE.VMA instructions on given remote
+ * harts for the specified virtual address range.
+ * @hart_mask: A cpu mask containing all the target harts.
+ * @start: Start of the virtual address
+ * @size: Total size of the virtual address range.
+ *
+ * Return: None
+ */
+void sbi_remote_sfence_vma(const unsigned long *hart_mask,
+ unsigned long start,
+ unsigned long size)
+{
+ __sbi_rfence(SBI_EXT_RFENCE_REMOTE_SFENCE_VMA,
+ hart_mask, start, size, 0, 0);
+}
+EXPORT_SYMBOL(sbi_remote_sfence_vma);
+
+/**
+ * sbi_remote_sfence_vma_asid() - Execute SFENCE.VMA instructions on given
+ * remote harts for a virtual address range belonging to a specific ASID.
+ *
+ * @hart_mask: A cpu mask containing all the target harts.
+ * @start: Start of the virtual address
+ * @size: Total size of the virtual address range.
+ * @asid: The value of address space identifier (ASID).
+ *
+ * Return: None
+ */
+void sbi_remote_sfence_vma_asid(const unsigned long *hart_mask,
+ unsigned long start,
+ unsigned long size,
+ unsigned long asid)
+{
+ __sbi_rfence(SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID,
+ hart_mask, start, size, asid, 0);
+}
+EXPORT_SYMBOL(sbi_remote_sfence_vma_asid);
+
+/**
+ * sbi_remote_hfence_gvma() - Execute HFENCE.GVMA instructions on given remote
+ * harts for the specified guest physical address range.
+ * @hart_mask: A cpu mask containing all the target harts.
+ * @start: Start of the guest physical address
+ * @size: Total size of the guest physical address range.
+ *
+ * Return: None
+ */
+int sbi_remote_hfence_gvma(const unsigned long *hart_mask,
+ unsigned long start,
+ unsigned long size)
+{
+ return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA,
+ hart_mask, start, size, 0, 0);
+}
+EXPORT_SYMBOL_GPL(sbi_remote_hfence_gvma);
+
+/**
+ * sbi_remote_hfence_gvma_vmid() - Execute HFENCE.GVMA instructions on given
+ * remote harts for a guest physical address range belonging to a specific VMID.
+ *
+ * @hart_mask: A cpu mask containing all the target harts.
+ * @start: Start of the guest physical address
+ * @size: Total size of the guest physical address range.
+ * @vmid: The value of guest ID (VMID).
+ *
+ * Return: 0 if success, Error otherwise.
+ */
+int sbi_remote_hfence_gvma_vmid(const unsigned long *hart_mask,
+ unsigned long start,
+ unsigned long size,
+ unsigned long vmid)
+{
+ return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA_VMID,
+ hart_mask, start, size, vmid, 0);
+}
+EXPORT_SYMBOL(sbi_remote_hfence_gvma_vmid);
+
+/**
+ * sbi_remote_hfence_vvma() - Execute HFENCE.VVMA instructions on given remote
+ * harts for the current guest virtual address range.
+ * @hart_mask: A cpu mask containing all the target harts.
+ * @start: Start of the current guest virtual address
+ * @size: Total size of the current guest virtual address range.
+ *
+ * Return: None
+ */
+int sbi_remote_hfence_vvma(const unsigned long *hart_mask,
+ unsigned long start,
+ unsigned long size)
+{
+ return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA,
+ hart_mask, start, size, 0, 0);
+}
+EXPORT_SYMBOL(sbi_remote_hfence_vvma);
+
+/**
+ * sbi_remote_hfence_vvma_asid() - Execute HFENCE.VVMA instructions on given
+ * remote harts for current guest virtual address range belonging to a specific
+ * ASID.
+ *
+ * @hart_mask: A cpu mask containing all the target harts.
+ * @start: Start of the current guest virtual address
+ * @size: Total size of the current guest virtual address range.
+ * @asid: The value of address space identifier (ASID).
+ *
+ * Return: None
+ */
+int sbi_remote_hfence_vvma_asid(const unsigned long *hart_mask,
+ unsigned long start,
+ unsigned long size,
+ unsigned long asid)
+{
+ return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA_ASID,
+ hart_mask, start, size, asid, 0);
+}
+EXPORT_SYMBOL(sbi_remote_hfence_vvma_asid);
+
+/**
+ * sbi_probe_extension() - Check if an SBI extension ID is supported or not.
+ * @extid: The extension ID to be probed.
+ *
+ * Return: Extension specific nonzero value f yes, -ENOTSUPP otherwise.
+ */
+int sbi_probe_extension(int extid)
+{
+ struct sbiret ret;
+
+ ret = sbi_ecall(SBI_EXT_BASE, SBI_EXT_BASE_PROBE_EXT, extid,
+ 0, 0, 0, 0, 0);
+ if (!ret.error)
+ if (ret.value)
+ return ret.value;
+
+ return -ENOTSUPP;
+}
+EXPORT_SYMBOL(sbi_probe_extension);
+
+static long __sbi_base_ecall(int fid)
+{
+ struct sbiret ret;
+
+ ret = sbi_ecall(SBI_EXT_BASE, fid, 0, 0, 0, 0, 0, 0);
+ if (!ret.error)
+ return ret.value;
+ else
+ return sbi_err_map_linux_errno(ret.error);
+}
+
+static inline long sbi_get_spec_version(void)
+{
+ return __sbi_base_ecall(SBI_EXT_BASE_GET_SPEC_VERSION);
+}
+
+static inline long sbi_get_firmware_id(void)
+{
+ return __sbi_base_ecall(SBI_EXT_BASE_GET_IMP_ID);
+}
+
+static inline long sbi_get_firmware_version(void)
+{
+ return __sbi_base_ecall(SBI_EXT_BASE_GET_IMP_VERSION);
+}
static void sbi_power_off(void)
{
sbi_shutdown();
}
-static int __init sbi_init(void)
+int __init sbi_init(void)
{
+ int ret;
+
pm_power_off = sbi_power_off;
+ ret = sbi_get_spec_version();
+ if (ret > 0)
+ sbi_spec_version = ret;
+
+ pr_info("SBI specification v%lu.%lu detected\n",
+ sbi_major_version(), sbi_minor_version());
+
+ if (!sbi_spec_is_0_1()) {
+ pr_info("SBI implementation ID=0x%lx Version=0x%lx\n",
+ sbi_get_firmware_id(), sbi_get_firmware_version());
+ if (sbi_probe_extension(SBI_EXT_TIME) > 0) {
+ __sbi_set_timer = __sbi_set_timer_v02;
+ pr_info("SBI v0.2 TIME extension detected\n");
+ } else {
+ __sbi_set_timer = __sbi_set_timer_v01;
+ }
+ if (sbi_probe_extension(SBI_EXT_IPI) > 0) {
+ __sbi_send_ipi = __sbi_send_ipi_v02;
+ pr_info("SBI v0.2 IPI extension detected\n");
+ } else {
+ __sbi_send_ipi = __sbi_send_ipi_v01;
+ }
+ if (sbi_probe_extension(SBI_EXT_RFENCE) > 0) {
+ __sbi_rfence = __sbi_rfence_v02;
+ pr_info("SBI v0.2 RFENCE extension detected\n");
+ } else {
+ __sbi_rfence = __sbi_rfence_v01;
+ }
+ } else {
+ __sbi_set_timer = __sbi_set_timer_v01;
+ __sbi_send_ipi = __sbi_send_ipi_v01;
+ __sbi_rfence = __sbi_rfence_v01;
+ }
+
return 0;
}
-early_initcall(sbi_init);
diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c
index 0a6d415b0a5a..145128a7e560 100644
--- a/arch/riscv/kernel/setup.c
+++ b/arch/riscv/kernel/setup.c
@@ -16,12 +16,14 @@
#include <linux/of_platform.h>
#include <linux/sched/task.h>
#include <linux/swiotlb.h>
+#include <linux/smp.h>
#include <asm/clint.h>
+#include <asm/cpu_ops.h>
#include <asm/setup.h>
#include <asm/sections.h>
#include <asm/pgtable.h>
-#include <asm/smp.h>
+#include <asm/sbi.h>
#include <asm/tlbflush.h>
#include <asm/thread_info.h>
#include <asm/kasan.h>
@@ -39,9 +41,14 @@ struct screen_info screen_info = {
};
#endif
-/* The lucky hart to first increment this variable will boot the other cores */
-atomic_t hart_lottery;
+/*
+ * The lucky hart to first increment this variable will boot the other cores.
+ * This is used before the kernel initializes the BSS so it can't be in the
+ * BSS.
+ */
+atomic_t hart_lottery __section(.sdata);
unsigned long boot_cpu_hartid;
+static DEFINE_PER_CPU(struct cpu, cpu_devices);
void __init parse_dtb(void)
{
@@ -79,9 +86,28 @@ void __init setup_arch(char **cmdline_p)
kasan_init();
#endif
+#if IS_ENABLED(CONFIG_RISCV_SBI)
+ sbi_init();
+#endif
+
#ifdef CONFIG_SMP
setup_smp();
#endif
riscv_fill_hwcap();
}
+
+static int __init topology_init(void)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ struct cpu *cpu = &per_cpu(cpu_devices, i);
+
+ cpu->hotpluggable = cpu_has_hotplug(i);
+ register_cpu(cpu, i);
+ }
+
+ return 0;
+}
+subsys_initcall(topology_init);
diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c
index 8bc01f0ca73b..4e9922790f6e 100644
--- a/arch/riscv/kernel/smpboot.c
+++ b/arch/riscv/kernel/smpboot.c
@@ -25,6 +25,7 @@
#include <linux/sched/task_stack.h>
#include <linux/sched/mm.h>
#include <asm/clint.h>
+#include <asm/cpu_ops.h>
#include <asm/irq.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
@@ -34,8 +35,6 @@
#include "head.h"
-void *__cpu_up_stack_pointer[NR_CPUS];
-void *__cpu_up_task_pointer[NR_CPUS];
static DECLARE_COMPLETION(cpu_running);
void __init smp_prepare_boot_cpu(void)
@@ -46,6 +45,7 @@ void __init smp_prepare_boot_cpu(void)
void __init smp_prepare_cpus(unsigned int max_cpus)
{
int cpuid;
+ int ret;
/* This covers non-smp usecase mandated by "nosmp" option */
if (max_cpus == 0)
@@ -54,6 +54,11 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
for_each_possible_cpu(cpuid) {
if (cpuid == smp_processor_id())
continue;
+ if (cpu_ops[cpuid]->cpu_prepare) {
+ ret = cpu_ops[cpuid]->cpu_prepare(cpuid);
+ if (ret)
+ continue;
+ }
set_cpu_present(cpuid, true);
}
}
@@ -65,6 +70,8 @@ void __init setup_smp(void)
bool found_boot_cpu = false;
int cpuid = 1;
+ cpu_set_ops(0);
+
for_each_of_cpu_node(dn) {
hart = riscv_of_processor_hartid(dn);
if (hart < 0)
@@ -92,36 +99,38 @@ void __init setup_smp(void)
cpuid, nr_cpu_ids);
for (cpuid = 1; cpuid < nr_cpu_ids; cpuid++) {
- if (cpuid_to_hartid_map(cpuid) != INVALID_HARTID)
+ if (cpuid_to_hartid_map(cpuid) != INVALID_HARTID) {
+ cpu_set_ops(cpuid);
set_cpu_possible(cpuid, true);
+ }
}
}
+int start_secondary_cpu(int cpu, struct task_struct *tidle)
+{
+ if (cpu_ops[cpu]->cpu_start)
+ return cpu_ops[cpu]->cpu_start(cpu, tidle);
+
+ return -EOPNOTSUPP;
+}
+
int __cpu_up(unsigned int cpu, struct task_struct *tidle)
{
int ret = 0;
- int hartid = cpuid_to_hartid_map(cpu);
tidle->thread_info.cpu = cpu;
- /*
- * On RISC-V systems, all harts boot on their own accord. Our _start
- * selects the first hart to boot the kernel and causes the remainder
- * of the harts to spin in a loop waiting for their stack pointer to be
- * setup by that main hart. Writing __cpu_up_stack_pointer signals to
- * the spinning harts that they can continue the boot process.
- */
- smp_mb();
- WRITE_ONCE(__cpu_up_stack_pointer[hartid],
- task_stack_page(tidle) + THREAD_SIZE);
- WRITE_ONCE(__cpu_up_task_pointer[hartid], tidle);
-
- lockdep_assert_held(&cpu_running);
- wait_for_completion_timeout(&cpu_running,
+ ret = start_secondary_cpu(cpu, tidle);
+ if (!ret) {
+ lockdep_assert_held(&cpu_running);
+ wait_for_completion_timeout(&cpu_running,
msecs_to_jiffies(1000));
- if (!cpu_online(cpu)) {
- pr_crit("CPU%u: failed to come online\n", cpu);
- ret = -EIO;
+ if (!cpu_online(cpu)) {
+ pr_crit("CPU%u: failed to come online\n", cpu);
+ ret = -EIO;
+ }
+ } else {
+ pr_crit("CPU%u: failed to start\n", cpu);
}
return ret;
@@ -134,7 +143,7 @@ void __init smp_cpus_done(unsigned int max_cpus)
/*
* C entry point for a secondary processor.
*/
-asmlinkage __visible void __init smp_callin(void)
+asmlinkage __visible void smp_callin(void)
{
struct mm_struct *mm = &init_mm;
diff --git a/arch/riscv/kernel/soc.c b/arch/riscv/kernel/soc.c
new file mode 100644
index 000000000000..0b3b3dc9ad0f
--- /dev/null
+++ b/arch/riscv/kernel/soc.c
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2020 Western Digital Corporation or its affiliates.
+ */
+#include <linux/init.h>
+#include <linux/libfdt.h>
+#include <asm/pgtable.h>
+#include <asm/soc.h>
+
+/*
+ * This is called extremly early, before parse_dtb(), to allow initializing
+ * SoC hardware before memory or any device driver initialization.
+ */
+void __init soc_early_init(void)
+{
+ void (*early_fn)(const void *fdt);
+ const struct of_device_id *s;
+ const void *fdt = dtb_early_va;
+
+ for (s = (void *)&__soc_early_init_table_start;
+ (void *)s < (void *)&__soc_early_init_table_end; s++) {
+ if (!fdt_node_check_compatible(fdt, 0, s->compatible)) {
+ early_fn = s->data;
+ early_fn(fdt);
+ return;
+ }
+ }
+}
diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c
index 0940681d2f68..02087fe539c6 100644
--- a/arch/riscv/kernel/stacktrace.c
+++ b/arch/riscv/kernel/stacktrace.c
@@ -19,6 +19,8 @@ struct stackframe {
unsigned long ra;
};
+register unsigned long sp_in_global __asm__("sp");
+
void notrace walk_stackframe(struct task_struct *task, struct pt_regs *regs,
bool (*fn)(unsigned long, void *), void *arg)
{
@@ -29,7 +31,7 @@ void notrace walk_stackframe(struct task_struct *task, struct pt_regs *regs,
sp = user_stack_pointer(regs);
pc = instruction_pointer(regs);
} else if (task == NULL || task == current) {
- const register unsigned long current_sp __asm__ ("sp");
+ const register unsigned long current_sp = sp_in_global;
fp = (unsigned long)__builtin_frame_address(0);
sp = current_sp;
pc = (unsigned long)walk_stackframe;
@@ -73,8 +75,7 @@ static void notrace walk_stackframe(struct task_struct *task,
sp = user_stack_pointer(regs);
pc = instruction_pointer(regs);
} else if (task == NULL || task == current) {
- const register unsigned long current_sp __asm__ ("sp");
- sp = current_sp;
+ sp = sp_in_global;
pc = (unsigned long)walk_stackframe;
} else {
/* task blocked in __switch_to */
diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c
index 55ea614d89bf..7f58fa53033f 100644
--- a/arch/riscv/kernel/traps.c
+++ b/arch/riscv/kernel/traps.c
@@ -97,12 +97,33 @@ DO_ERROR_INFO(do_trap_insn_fault,
SIGSEGV, SEGV_ACCERR, "instruction access fault");
DO_ERROR_INFO(do_trap_insn_illegal,
SIGILL, ILL_ILLOPC, "illegal instruction");
-DO_ERROR_INFO(do_trap_load_misaligned,
- SIGBUS, BUS_ADRALN, "load address misaligned");
DO_ERROR_INFO(do_trap_load_fault,
SIGSEGV, SEGV_ACCERR, "load access fault");
+#ifndef CONFIG_RISCV_M_MODE
+DO_ERROR_INFO(do_trap_load_misaligned,
+ SIGBUS, BUS_ADRALN, "Oops - load address misaligned");
DO_ERROR_INFO(do_trap_store_misaligned,
- SIGBUS, BUS_ADRALN, "store (or AMO) address misaligned");
+ SIGBUS, BUS_ADRALN, "Oops - store (or AMO) address misaligned");
+#else
+int handle_misaligned_load(struct pt_regs *regs);
+int handle_misaligned_store(struct pt_regs *regs);
+
+asmlinkage void do_trap_load_misaligned(struct pt_regs *regs)
+{
+ if (!handle_misaligned_load(regs))
+ return;
+ do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc,
+ "Oops - load address misaligned");
+}
+
+asmlinkage void do_trap_store_misaligned(struct pt_regs *regs)
+{
+ if (!handle_misaligned_store(regs))
+ return;
+ do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc,
+ "Oops - store (or AMO) address misaligned");
+}
+#endif
DO_ERROR_INFO(do_trap_store_fault,
SIGSEGV, SEGV_ACCERR, "store (or AMO) access fault");
DO_ERROR_INFO(do_trap_ecall_u,
@@ -118,7 +139,8 @@ static inline unsigned long get_break_insn_length(unsigned long pc)
if (probe_kernel_address((bug_insn_t *)pc, insn))
return 0;
- return (((insn & __INSN_LENGTH_MASK) == __INSN_LENGTH_32) ? 4UL : 2UL);
+
+ return GET_INSN_LENGTH(insn);
}
asmlinkage __visible void do_trap_break(struct pt_regs *regs)
@@ -147,7 +169,7 @@ int is_valid_bugaddr(unsigned long pc)
}
#endif /* CONFIG_GENERIC_BUG */
-void __init trap_init(void)
+void trap_init(void)
{
/*
* Set sup0 scratch register to 0, indicating to exception vector
diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
new file mode 100644
index 000000000000..46c4dafe3ba0
--- /dev/null
+++ b/arch/riscv/kernel/traps_misaligned.c
@@ -0,0 +1,370 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Western Digital Corporation or its affiliates.
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/irq.h>
+
+#include <asm/processor.h>
+#include <asm/ptrace.h>
+#include <asm/csr.h>
+
+#define INSN_MATCH_LB 0x3
+#define INSN_MASK_LB 0x707f
+#define INSN_MATCH_LH 0x1003
+#define INSN_MASK_LH 0x707f
+#define INSN_MATCH_LW 0x2003
+#define INSN_MASK_LW 0x707f
+#define INSN_MATCH_LD 0x3003
+#define INSN_MASK_LD 0x707f
+#define INSN_MATCH_LBU 0x4003
+#define INSN_MASK_LBU 0x707f
+#define INSN_MATCH_LHU 0x5003
+#define INSN_MASK_LHU 0x707f
+#define INSN_MATCH_LWU 0x6003
+#define INSN_MASK_LWU 0x707f
+#define INSN_MATCH_SB 0x23
+#define INSN_MASK_SB 0x707f
+#define INSN_MATCH_SH 0x1023
+#define INSN_MASK_SH 0x707f
+#define INSN_MATCH_SW 0x2023
+#define INSN_MASK_SW 0x707f
+#define INSN_MATCH_SD 0x3023
+#define INSN_MASK_SD 0x707f
+
+#define INSN_MATCH_FLW 0x2007
+#define INSN_MASK_FLW 0x707f
+#define INSN_MATCH_FLD 0x3007
+#define INSN_MASK_FLD 0x707f
+#define INSN_MATCH_FLQ 0x4007
+#define INSN_MASK_FLQ 0x707f
+#define INSN_MATCH_FSW 0x2027
+#define INSN_MASK_FSW 0x707f
+#define INSN_MATCH_FSD 0x3027
+#define INSN_MASK_FSD 0x707f
+#define INSN_MATCH_FSQ 0x4027
+#define INSN_MASK_FSQ 0x707f
+
+#define INSN_MATCH_C_LD 0x6000
+#define INSN_MASK_C_LD 0xe003
+#define INSN_MATCH_C_SD 0xe000
+#define INSN_MASK_C_SD 0xe003
+#define INSN_MATCH_C_LW 0x4000
+#define INSN_MASK_C_LW 0xe003
+#define INSN_MATCH_C_SW 0xc000
+#define INSN_MASK_C_SW 0xe003
+#define INSN_MATCH_C_LDSP 0x6002
+#define INSN_MASK_C_LDSP 0xe003
+#define INSN_MATCH_C_SDSP 0xe002
+#define INSN_MASK_C_SDSP 0xe003
+#define INSN_MATCH_C_LWSP 0x4002
+#define INSN_MASK_C_LWSP 0xe003
+#define INSN_MATCH_C_SWSP 0xc002
+#define INSN_MASK_C_SWSP 0xe003
+
+#define INSN_MATCH_C_FLD 0x2000
+#define INSN_MASK_C_FLD 0xe003
+#define INSN_MATCH_C_FLW 0x6000
+#define INSN_MASK_C_FLW 0xe003
+#define INSN_MATCH_C_FSD 0xa000
+#define INSN_MASK_C_FSD 0xe003
+#define INSN_MATCH_C_FSW 0xe000
+#define INSN_MASK_C_FSW 0xe003
+#define INSN_MATCH_C_FLDSP 0x2002
+#define INSN_MASK_C_FLDSP 0xe003
+#define INSN_MATCH_C_FSDSP 0xa002
+#define INSN_MASK_C_FSDSP 0xe003
+#define INSN_MATCH_C_FLWSP 0x6002
+#define INSN_MASK_C_FLWSP 0xe003
+#define INSN_MATCH_C_FSWSP 0xe002
+#define INSN_MASK_C_FSWSP 0xe003
+
+#define INSN_LEN(insn) ((((insn) & 0x3) < 0x3) ? 2 : 4)
+
+#if defined(CONFIG_64BIT)
+#define LOG_REGBYTES 3
+#define XLEN 64
+#else
+#define LOG_REGBYTES 2
+#define XLEN 32
+#endif
+#define REGBYTES (1 << LOG_REGBYTES)
+#define XLEN_MINUS_16 ((XLEN) - 16)
+
+#define SH_RD 7
+#define SH_RS1 15
+#define SH_RS2 20
+#define SH_RS2C 2
+
+#define RV_X(x, s, n) (((x) >> (s)) & ((1 << (n)) - 1))
+#define RVC_LW_IMM(x) ((RV_X(x, 6, 1) << 2) | \
+ (RV_X(x, 10, 3) << 3) | \
+ (RV_X(x, 5, 1) << 6))
+#define RVC_LD_IMM(x) ((RV_X(x, 10, 3) << 3) | \
+ (RV_X(x, 5, 2) << 6))
+#define RVC_LWSP_IMM(x) ((RV_X(x, 4, 3) << 2) | \
+ (RV_X(x, 12, 1) << 5) | \
+ (RV_X(x, 2, 2) << 6))
+#define RVC_LDSP_IMM(x) ((RV_X(x, 5, 2) << 3) | \
+ (RV_X(x, 12, 1) << 5) | \
+ (RV_X(x, 2, 3) << 6))
+#define RVC_SWSP_IMM(x) ((RV_X(x, 9, 4) << 2) | \
+ (RV_X(x, 7, 2) << 6))
+#define RVC_SDSP_IMM(x) ((RV_X(x, 10, 3) << 3) | \
+ (RV_X(x, 7, 3) << 6))
+#define RVC_RS1S(insn) (8 + RV_X(insn, SH_RD, 3))
+#define RVC_RS2S(insn) (8 + RV_X(insn, SH_RS2C, 3))
+#define RVC_RS2(insn) RV_X(insn, SH_RS2C, 5)
+
+#define SHIFT_RIGHT(x, y) \
+ ((y) < 0 ? ((x) << -(y)) : ((x) >> (y)))
+
+#define REG_MASK \
+ ((1 << (5 + LOG_REGBYTES)) - (1 << LOG_REGBYTES))
+
+#define REG_OFFSET(insn, pos) \
+ (SHIFT_RIGHT((insn), (pos) - LOG_REGBYTES) & REG_MASK)
+
+#define REG_PTR(insn, pos, regs) \
+ (ulong *)((ulong)(regs) + REG_OFFSET(insn, pos))
+
+#define GET_RM(insn) (((insn) >> 12) & 7)
+
+#define GET_RS1(insn, regs) (*REG_PTR(insn, SH_RS1, regs))
+#define GET_RS2(insn, regs) (*REG_PTR(insn, SH_RS2, regs))
+#define GET_RS1S(insn, regs) (*REG_PTR(RVC_RS1S(insn), 0, regs))
+#define GET_RS2S(insn, regs) (*REG_PTR(RVC_RS2S(insn), 0, regs))
+#define GET_RS2C(insn, regs) (*REG_PTR(insn, SH_RS2C, regs))
+#define GET_SP(regs) (*REG_PTR(2, 0, regs))
+#define SET_RD(insn, regs, val) (*REG_PTR(insn, SH_RD, regs) = (val))
+#define IMM_I(insn) ((s32)(insn) >> 20)
+#define IMM_S(insn) (((s32)(insn) >> 25 << 5) | \
+ (s32)(((insn) >> 7) & 0x1f))
+#define MASK_FUNCT3 0x7000
+
+#define GET_PRECISION(insn) (((insn) >> 25) & 3)
+#define GET_RM(insn) (((insn) >> 12) & 7)
+#define PRECISION_S 0
+#define PRECISION_D 1
+
+#define STR(x) XSTR(x)
+#define XSTR(x) #x
+
+#define DECLARE_UNPRIVILEGED_LOAD_FUNCTION(type, insn) \
+static inline type load_##type(const type *addr) \
+{ \
+ type val; \
+ asm (#insn " %0, %1" \
+ : "=&r" (val) : "m" (*addr)); \
+ return val; \
+}
+
+#define DECLARE_UNPRIVILEGED_STORE_FUNCTION(type, insn) \
+static inline void store_##type(type *addr, type val) \
+{ \
+ asm volatile (#insn " %0, %1\n" \
+ : : "r" (val), "m" (*addr)); \
+}
+
+DECLARE_UNPRIVILEGED_LOAD_FUNCTION(u8, lbu)
+DECLARE_UNPRIVILEGED_LOAD_FUNCTION(u16, lhu)
+DECLARE_UNPRIVILEGED_LOAD_FUNCTION(s8, lb)
+DECLARE_UNPRIVILEGED_LOAD_FUNCTION(s16, lh)
+DECLARE_UNPRIVILEGED_LOAD_FUNCTION(s32, lw)
+DECLARE_UNPRIVILEGED_STORE_FUNCTION(u8, sb)
+DECLARE_UNPRIVILEGED_STORE_FUNCTION(u16, sh)
+DECLARE_UNPRIVILEGED_STORE_FUNCTION(u32, sw)
+#if defined(CONFIG_64BIT)
+DECLARE_UNPRIVILEGED_LOAD_FUNCTION(u32, lwu)
+DECLARE_UNPRIVILEGED_LOAD_FUNCTION(u64, ld)
+DECLARE_UNPRIVILEGED_STORE_FUNCTION(u64, sd)
+DECLARE_UNPRIVILEGED_LOAD_FUNCTION(ulong, ld)
+#else
+DECLARE_UNPRIVILEGED_LOAD_FUNCTION(u32, lw)
+DECLARE_UNPRIVILEGED_LOAD_FUNCTION(ulong, lw)
+
+static inline u64 load_u64(const u64 *addr)
+{
+ return load_u32((u32 *)addr)
+ + ((u64)load_u32((u32 *)addr + 1) << 32);
+}
+
+static inline void store_u64(u64 *addr, u64 val)
+{
+ store_u32((u32 *)addr, val);
+ store_u32((u32 *)addr + 1, val >> 32);
+}
+#endif
+
+static inline ulong get_insn(ulong mepc)
+{
+ register ulong __mepc asm ("a2") = mepc;
+ ulong val, rvc_mask = 3, tmp;
+
+ asm ("and %[tmp], %[addr], 2\n"
+ "bnez %[tmp], 1f\n"
+#if defined(CONFIG_64BIT)
+ STR(LWU) " %[insn], (%[addr])\n"
+#else
+ STR(LW) " %[insn], (%[addr])\n"
+#endif
+ "and %[tmp], %[insn], %[rvc_mask]\n"
+ "beq %[tmp], %[rvc_mask], 2f\n"
+ "sll %[insn], %[insn], %[xlen_minus_16]\n"
+ "srl %[insn], %[insn], %[xlen_minus_16]\n"
+ "j 2f\n"
+ "1:\n"
+ "lhu %[insn], (%[addr])\n"
+ "and %[tmp], %[insn], %[rvc_mask]\n"
+ "bne %[tmp], %[rvc_mask], 2f\n"
+ "lhu %[tmp], 2(%[addr])\n"
+ "sll %[tmp], %[tmp], 16\n"
+ "add %[insn], %[insn], %[tmp]\n"
+ "2:"
+ : [insn] "=&r" (val), [tmp] "=&r" (tmp)
+ : [addr] "r" (__mepc), [rvc_mask] "r" (rvc_mask),
+ [xlen_minus_16] "i" (XLEN_MINUS_16));
+
+ return val;
+}
+
+union reg_data {
+ u8 data_bytes[8];
+ ulong data_ulong;
+ u64 data_u64;
+};
+
+int handle_misaligned_load(struct pt_regs *regs)
+{
+ union reg_data val;
+ unsigned long epc = regs->epc;
+ unsigned long insn = get_insn(epc);
+ unsigned long addr = csr_read(mtval);
+ int i, fp = 0, shift = 0, len = 0;
+
+ regs->epc = 0;
+
+ if ((insn & INSN_MASK_LW) == INSN_MATCH_LW) {
+ len = 4;
+ shift = 8 * (sizeof(unsigned long) - len);
+#if defined(CONFIG_64BIT)
+ } else if ((insn & INSN_MASK_LD) == INSN_MATCH_LD) {
+ len = 8;
+ shift = 8 * (sizeof(unsigned long) - len);
+ } else if ((insn & INSN_MASK_LWU) == INSN_MATCH_LWU) {
+ len = 4;
+#endif
+ } else if ((insn & INSN_MASK_FLD) == INSN_MATCH_FLD) {
+ fp = 1;
+ len = 8;
+ } else if ((insn & INSN_MASK_FLW) == INSN_MATCH_FLW) {
+ fp = 1;
+ len = 4;
+ } else if ((insn & INSN_MASK_LH) == INSN_MATCH_LH) {
+ len = 2;
+ shift = 8 * (sizeof(unsigned long) - len);
+ } else if ((insn & INSN_MASK_LHU) == INSN_MATCH_LHU) {
+ len = 2;
+#if defined(CONFIG_64BIT)
+ } else if ((insn & INSN_MASK_C_LD) == INSN_MATCH_C_LD) {
+ len = 8;
+ shift = 8 * (sizeof(unsigned long) - len);
+ insn = RVC_RS2S(insn) << SH_RD;
+ } else if ((insn & INSN_MASK_C_LDSP) == INSN_MATCH_C_LDSP &&
+ ((insn >> SH_RD) & 0x1f)) {
+ len = 8;
+ shift = 8 * (sizeof(unsigned long) - len);
+#endif
+ } else if ((insn & INSN_MASK_C_LW) == INSN_MATCH_C_LW) {
+ len = 4;
+ shift = 8 * (sizeof(unsigned long) - len);
+ insn = RVC_RS2S(insn) << SH_RD;
+ } else if ((insn & INSN_MASK_C_LWSP) == INSN_MATCH_C_LWSP &&
+ ((insn >> SH_RD) & 0x1f)) {
+ len = 4;
+ shift = 8 * (sizeof(unsigned long) - len);
+ } else if ((insn & INSN_MASK_C_FLD) == INSN_MATCH_C_FLD) {
+ fp = 1;
+ len = 8;
+ insn = RVC_RS2S(insn) << SH_RD;
+ } else if ((insn & INSN_MASK_C_FLDSP) == INSN_MATCH_C_FLDSP) {
+ fp = 1;
+ len = 8;
+#if defined(CONFIG_32BIT)
+ } else if ((insn & INSN_MASK_C_FLW) == INSN_MATCH_C_FLW) {
+ fp = 1;
+ len = 4;
+ insn = RVC_RS2S(insn) << SH_RD;
+ } else if ((insn & INSN_MASK_C_FLWSP) == INSN_MATCH_C_FLWSP) {
+ fp = 1;
+ len = 4;
+#endif
+ } else {
+ regs->epc = epc;
+ return -1;
+ }
+
+ val.data_u64 = 0;
+ for (i = 0; i < len; i++)
+ val.data_bytes[i] = load_u8((void *)(addr + i));
+
+ if (fp)
+ return -1;
+ SET_RD(insn, regs, val.data_ulong << shift >> shift);
+
+ regs->epc = epc + INSN_LEN(insn);
+
+ return 0;
+}
+
+int handle_misaligned_store(struct pt_regs *regs)
+{
+ union reg_data val;
+ unsigned long epc = regs->epc;
+ unsigned long insn = get_insn(epc);
+ unsigned long addr = csr_read(mtval);
+ int i, len = 0;
+
+ regs->epc = 0;
+
+ val.data_ulong = GET_RS2(insn, regs);
+
+ if ((insn & INSN_MASK_SW) == INSN_MATCH_SW) {
+ len = 4;
+#if defined(CONFIG_64BIT)
+ } else if ((insn & INSN_MASK_SD) == INSN_MATCH_SD) {
+ len = 8;
+#endif
+ } else if ((insn & INSN_MASK_SH) == INSN_MATCH_SH) {
+ len = 2;
+#if defined(CONFIG_64BIT)
+ } else if ((insn & INSN_MASK_C_SD) == INSN_MATCH_C_SD) {
+ len = 8;
+ val.data_ulong = GET_RS2S(insn, regs);
+ } else if ((insn & INSN_MASK_C_SDSP) == INSN_MATCH_C_SDSP &&
+ ((insn >> SH_RD) & 0x1f)) {
+ len = 8;
+ val.data_ulong = GET_RS2C(insn, regs);
+#endif
+ } else if ((insn & INSN_MASK_C_SW) == INSN_MATCH_C_SW) {
+ len = 4;
+ val.data_ulong = GET_RS2S(insn, regs);
+ } else if ((insn & INSN_MASK_C_SWSP) == INSN_MATCH_C_SWSP &&
+ ((insn >> SH_RD) & 0x1f)) {
+ len = 4;
+ val.data_ulong = GET_RS2C(insn, regs);
+ } else {
+ regs->epc = epc;
+ return -1;
+ }
+
+ for (i = 0; i < len; i++)
+ store_u8((void *)(addr + i), val.data_bytes[i]);
+
+ regs->epc = epc + INSN_LEN(insn);
+
+ return 0;
+}
diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S
index 1e0193ded420..0339b6bbe11a 100644
--- a/arch/riscv/kernel/vmlinux.lds.S
+++ b/arch/riscv/kernel/vmlinux.lds.S
@@ -9,7 +9,9 @@
#include <asm/page.h>
#include <asm/cache.h>
#include <asm/thread_info.h>
+#include <asm/set_memory.h>
+#include <linux/sizes.h>
OUTPUT_ARCH(riscv)
ENTRY(_start)
@@ -20,10 +22,18 @@ SECTIONS
/* Beginning of code and text segment */
. = LOAD_OFFSET;
_start = .;
- __init_begin = .;
HEAD_TEXT_SECTION
+ . = ALIGN(PAGE_SIZE);
+
+ __init_begin = .;
INIT_TEXT_SECTION(PAGE_SIZE)
INIT_DATA_SECTION(16)
+ . = ALIGN(8);
+ __soc_early_init_table : {
+ __soc_early_init_table_start = .;
+ KEEP(*(__soc_early_init_table))
+ __soc_early_init_table_end = .;
+ }
/* we have to discard exit text and such at runtime, not link time */
.exit.text :
{
@@ -36,6 +46,7 @@ SECTIONS
PERCPU_SECTION(L1_CACHE_BYTES)
__init_end = .;
+ . = ALIGN(SECTION_ALIGN);
.text : {
_text = .;
_stext = .;
@@ -53,24 +64,26 @@ SECTIONS
/* Start of data section */
_sdata = .;
- RO_DATA(L1_CACHE_BYTES)
+ RO_DATA(SECTION_ALIGN)
.srodata : {
*(.srodata*)
}
+ EXCEPTION_TABLE(0x10)
+
+ . = ALIGN(SECTION_ALIGN);
+ _data = .;
+
RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE)
.sdata : {
__global_pointer$ = . + 0x800;
*(.sdata*)
/* End of data section */
_edata = .;
- *(.sbss*)
}
BSS_SECTION(PAGE_SIZE, PAGE_SIZE, 0)
- EXCEPTION_TABLE(0x10)
-
.rel.dyn : {
*(.rel.dyn*)
}
diff --git a/arch/riscv/lib/uaccess.S b/arch/riscv/lib/uaccess.S
index f29d2ba2c0a6..fceaeb18cc64 100644
--- a/arch/riscv/lib/uaccess.S
+++ b/arch/riscv/lib/uaccess.S
@@ -3,14 +3,12 @@
#include <asm/asm.h>
#include <asm/csr.h>
- .altmacro
.macro fixup op reg addr lbl
- LOCAL _epc
-_epc:
+100:
\op \reg, \addr
.section __ex_table,"a"
.balign RISCV_SZPTR
- RISCV_PTR _epc, \lbl
+ RISCV_PTR 100b, \lbl
.previous
.endm
diff --git a/arch/riscv/mm/Makefile b/arch/riscv/mm/Makefile
index 50b7af58c566..363ef01c30b1 100644
--- a/arch/riscv/mm/Makefile
+++ b/arch/riscv/mm/Makefile
@@ -7,7 +7,7 @@ endif
obj-y += init.o
obj-y += extable.o
-obj-$(CONFIG_MMU) += fault.o
+obj-$(CONFIG_MMU) += fault.o pageattr.o
obj-y += cacheflush.o
obj-y += context.o
@@ -15,6 +15,7 @@ ifeq ($(CONFIG_MMU),y)
obj-$(CONFIG_SMP) += tlbflush.o
endif
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
+obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
obj-$(CONFIG_KASAN) += kasan_init.o
ifdef CONFIG_KASAN
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index 0d4747e9d5b5..a6189ed36c5f 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -4,14 +4,12 @@
int pud_huge(pud_t pud)
{
- return pud_present(pud) &&
- (pud_val(pud) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC));
+ return pud_leaf(pud);
}
int pmd_huge(pmd_t pmd)
{
- return pmd_present(pmd) &&
- (pmd_val(pmd) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC));
+ return pmd_leaf(pmd);
}
static __init int setup_hugepagesz(char *opt)
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index fab855963c73..b55be44ff9bd 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -12,6 +12,7 @@
#include <linux/sizes.h>
#include <linux/of_fdt.h>
#include <linux/libfdt.h>
+#include <linux/set_memory.h>
#include <asm/fixmap.h>
#include <asm/tlbflush.h>
@@ -477,6 +478,17 @@ static void __init setup_vm_final(void)
csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | SATP_MODE);
local_flush_tlb_all();
}
+
+void free_initmem(void)
+{
+ unsigned long init_begin = (unsigned long)__init_begin;
+ unsigned long init_end = (unsigned long)__init_end;
+
+ /* Make the region as non-execuatble. */
+ set_memory_nx(init_begin, (init_end - init_begin) >> PAGE_SHIFT);
+ free_initmem_default(POISON_FREE_INITMEM);
+}
+
#else
asmlinkage void __init setup_vm(uintptr_t dtb_pa)
{
@@ -488,6 +500,38 @@ static inline void setup_vm_final(void)
}
#endif /* CONFIG_MMU */
+#ifdef CONFIG_STRICT_KERNEL_RWX
+void set_kernel_text_rw(void)
+{
+ unsigned long text_start = (unsigned long)_text;
+ unsigned long text_end = (unsigned long)_etext;
+
+ set_memory_rw(text_start, (text_end - text_start) >> PAGE_SHIFT);
+}
+
+void set_kernel_text_ro(void)
+{
+ unsigned long text_start = (unsigned long)_text;
+ unsigned long text_end = (unsigned long)_etext;
+
+ set_memory_ro(text_start, (text_end - text_start) >> PAGE_SHIFT);
+}
+
+void mark_rodata_ro(void)
+{
+ unsigned long text_start = (unsigned long)_text;
+ unsigned long text_end = (unsigned long)_etext;
+ unsigned long rodata_start = (unsigned long)__start_rodata;
+ unsigned long data_start = (unsigned long)_data;
+ unsigned long max_low = (unsigned long)(__va(PFN_PHYS(max_low_pfn)));
+
+ set_memory_ro(text_start, (text_end - text_start) >> PAGE_SHIFT);
+ set_memory_ro(rodata_start, (data_start - rodata_start) >> PAGE_SHIFT);
+ set_memory_nx(rodata_start, (data_start - rodata_start) >> PAGE_SHIFT);
+ set_memory_nx(data_start, (max_low - data_start) >> PAGE_SHIFT);
+}
+#endif
+
void __init paging_init(void)
{
setup_vm_final();
diff --git a/arch/riscv/mm/pageattr.c b/arch/riscv/mm/pageattr.c
new file mode 100644
index 000000000000..728759eb530a
--- /dev/null
+++ b/arch/riscv/mm/pageattr.c
@@ -0,0 +1,187 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2019 SiFive
+ */
+
+#include <linux/pagewalk.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/bitops.h>
+
+struct pageattr_masks {
+ pgprot_t set_mask;
+ pgprot_t clear_mask;
+};
+
+static unsigned long set_pageattr_masks(unsigned long val, struct mm_walk *walk)
+{
+ struct pageattr_masks *masks = walk->private;
+ unsigned long new_val = val;
+
+ new_val &= ~(pgprot_val(masks->clear_mask));
+ new_val |= (pgprot_val(masks->set_mask));
+
+ return new_val;
+}
+
+static int pageattr_pgd_entry(pgd_t *pgd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pgd_t val = READ_ONCE(*pgd);
+
+ if (pgd_leaf(val)) {
+ val = __pgd(set_pageattr_masks(pgd_val(val), walk));
+ set_pgd(pgd, val);
+ }
+
+ return 0;
+}
+
+static int pageattr_p4d_entry(p4d_t *p4d, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ p4d_t val = READ_ONCE(*p4d);
+
+ if (p4d_leaf(val)) {
+ val = __p4d(set_pageattr_masks(p4d_val(val), walk));
+ set_p4d(p4d, val);
+ }
+
+ return 0;
+}
+
+static int pageattr_pud_entry(pud_t *pud, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pud_t val = READ_ONCE(*pud);
+
+ if (pud_leaf(val)) {
+ val = __pud(set_pageattr_masks(pud_val(val), walk));
+ set_pud(pud, val);
+ }
+
+ return 0;
+}
+
+static int pageattr_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pmd_t val = READ_ONCE(*pmd);
+
+ if (pmd_leaf(val)) {
+ val = __pmd(set_pageattr_masks(pmd_val(val), walk));
+ set_pmd(pmd, val);
+ }
+
+ return 0;
+}
+
+static int pageattr_pte_entry(pte_t *pte, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pte_t val = READ_ONCE(*pte);
+
+ val = __pte(set_pageattr_masks(pte_val(val), walk));
+ set_pte(pte, val);
+
+ return 0;
+}
+
+static int pageattr_pte_hole(unsigned long addr, unsigned long next,
+ int depth, struct mm_walk *walk)
+{
+ /* Nothing to do here */
+ return 0;
+}
+
+const static struct mm_walk_ops pageattr_ops = {
+ .pgd_entry = pageattr_pgd_entry,
+ .p4d_entry = pageattr_p4d_entry,
+ .pud_entry = pageattr_pud_entry,
+ .pmd_entry = pageattr_pmd_entry,
+ .pte_entry = pageattr_pte_entry,
+ .pte_hole = pageattr_pte_hole,
+};
+
+static int __set_memory(unsigned long addr, int numpages, pgprot_t set_mask,
+ pgprot_t clear_mask)
+{
+ int ret;
+ unsigned long start = addr;
+ unsigned long end = start + PAGE_SIZE * numpages;
+ struct pageattr_masks masks = {
+ .set_mask = set_mask,
+ .clear_mask = clear_mask
+ };
+
+ if (!numpages)
+ return 0;
+
+ down_read(&init_mm.mmap_sem);
+ ret = walk_page_range_novma(&init_mm, start, end, &pageattr_ops, NULL,
+ &masks);
+ up_read(&init_mm.mmap_sem);
+
+ flush_tlb_kernel_range(start, end);
+
+ return ret;
+}
+
+int set_memory_ro(unsigned long addr, int numpages)
+{
+ return __set_memory(addr, numpages, __pgprot(_PAGE_READ),
+ __pgprot(_PAGE_WRITE));
+}
+
+int set_memory_rw(unsigned long addr, int numpages)
+{
+ return __set_memory(addr, numpages, __pgprot(_PAGE_READ | _PAGE_WRITE),
+ __pgprot(0));
+}
+
+int set_memory_x(unsigned long addr, int numpages)
+{
+ return __set_memory(addr, numpages, __pgprot(_PAGE_EXEC), __pgprot(0));
+}
+
+int set_memory_nx(unsigned long addr, int numpages)
+{
+ return __set_memory(addr, numpages, __pgprot(0), __pgprot(_PAGE_EXEC));
+}
+
+int set_direct_map_invalid_noflush(struct page *page)
+{
+ unsigned long start = (unsigned long)page_address(page);
+ unsigned long end = start + PAGE_SIZE;
+ struct pageattr_masks masks = {
+ .set_mask = __pgprot(0),
+ .clear_mask = __pgprot(_PAGE_PRESENT)
+ };
+
+ return walk_page_range(&init_mm, start, end, &pageattr_ops, &masks);
+}
+
+int set_direct_map_default_noflush(struct page *page)
+{
+ unsigned long start = (unsigned long)page_address(page);
+ unsigned long end = start + PAGE_SIZE;
+ struct pageattr_masks masks = {
+ .set_mask = PAGE_KERNEL,
+ .clear_mask = __pgprot(0)
+ };
+
+ return walk_page_range(&init_mm, start, end, &pageattr_ops, &masks);
+}
+
+void __kernel_map_pages(struct page *page, int numpages, int enable)
+{
+ if (!debug_pagealloc_enabled())
+ return;
+
+ if (enable)
+ __set_memory((unsigned long)page_address(page), numpages,
+ __pgprot(_PAGE_PRESENT), __pgprot(0));
+ else
+ __set_memory((unsigned long)page_address(page), numpages,
+ __pgprot(0), __pgprot(_PAGE_PRESENT));
+}
diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c
new file mode 100644
index 000000000000..7eab76a93106
--- /dev/null
+++ b/arch/riscv/mm/ptdump.c
@@ -0,0 +1,317 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2019 SiFive
+ */
+
+#include <linux/init.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/ptdump.h>
+
+#include <asm/ptdump.h>
+#include <asm/pgtable.h>
+#include <asm/kasan.h>
+
+#define pt_dump_seq_printf(m, fmt, args...) \
+({ \
+ if (m) \
+ seq_printf(m, fmt, ##args); \
+})
+
+#define pt_dump_seq_puts(m, fmt) \
+({ \
+ if (m) \
+ seq_printf(m, fmt); \
+})
+
+/*
+ * The page dumper groups page table entries of the same type into a single
+ * description. It uses pg_state to track the range information while
+ * iterating over the pte entries. When the continuity is broken it then
+ * dumps out a description of the range.
+ */
+struct pg_state {
+ struct ptdump_state ptdump;
+ struct seq_file *seq;
+ const struct addr_marker *marker;
+ unsigned long start_address;
+ unsigned long start_pa;
+ unsigned long last_pa;
+ int level;
+ u64 current_prot;
+ bool check_wx;
+ unsigned long wx_pages;
+};
+
+/* Address marker */
+struct addr_marker {
+ unsigned long start_address;
+ const char *name;
+};
+
+static struct addr_marker address_markers[] = {
+#ifdef CONFIG_KASAN
+ {KASAN_SHADOW_START, "Kasan shadow start"},
+ {KASAN_SHADOW_END, "Kasan shadow end"},
+#endif
+ {FIXADDR_START, "Fixmap start"},
+ {FIXADDR_TOP, "Fixmap end"},
+ {PCI_IO_START, "PCI I/O start"},
+ {PCI_IO_END, "PCI I/O end"},
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ {VMEMMAP_START, "vmemmap start"},
+ {VMEMMAP_END, "vmemmap end"},
+#endif
+ {VMALLOC_START, "vmalloc() area"},
+ {VMALLOC_END, "vmalloc() end"},
+ {PAGE_OFFSET, "Linear mapping"},
+ {-1, NULL},
+};
+
+/* Page Table Entry */
+struct prot_bits {
+ u64 mask;
+ u64 val;
+ const char *set;
+ const char *clear;
+};
+
+static const struct prot_bits pte_bits[] = {
+ {
+ .mask = _PAGE_SOFT,
+ .val = _PAGE_SOFT,
+ .set = "RSW",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_DIRTY,
+ .val = _PAGE_DIRTY,
+ .set = "D",
+ .clear = ".",
+ }, {
+ .mask = _PAGE_ACCESSED,
+ .val = _PAGE_ACCESSED,
+ .set = "A",
+ .clear = ".",
+ }, {
+ .mask = _PAGE_GLOBAL,
+ .val = _PAGE_GLOBAL,
+ .set = "G",
+ .clear = ".",
+ }, {
+ .mask = _PAGE_USER,
+ .val = _PAGE_USER,
+ .set = "U",
+ .clear = ".",
+ }, {
+ .mask = _PAGE_EXEC,
+ .val = _PAGE_EXEC,
+ .set = "X",
+ .clear = ".",
+ }, {
+ .mask = _PAGE_WRITE,
+ .val = _PAGE_WRITE,
+ .set = "W",
+ .clear = ".",
+ }, {
+ .mask = _PAGE_READ,
+ .val = _PAGE_READ,
+ .set = "R",
+ .clear = ".",
+ }, {
+ .mask = _PAGE_PRESENT,
+ .val = _PAGE_PRESENT,
+ .set = "V",
+ .clear = ".",
+ }
+};
+
+/* Page Level */
+struct pg_level {
+ const char *name;
+ u64 mask;
+};
+
+static struct pg_level pg_level[] = {
+ { /* pgd */
+ .name = "PGD",
+ }, { /* p4d */
+ .name = (CONFIG_PGTABLE_LEVELS > 4) ? "P4D" : "PGD",
+ }, { /* pud */
+ .name = (CONFIG_PGTABLE_LEVELS > 3) ? "PUD" : "PGD",
+ }, { /* pmd */
+ .name = (CONFIG_PGTABLE_LEVELS > 2) ? "PMD" : "PGD",
+ }, { /* pte */
+ .name = "PTE",
+ },
+};
+
+static void dump_prot(struct pg_state *st)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(pte_bits); i++) {
+ const char *s;
+
+ if ((st->current_prot & pte_bits[i].mask) == pte_bits[i].val)
+ s = pte_bits[i].set;
+ else
+ s = pte_bits[i].clear;
+
+ if (s)
+ pt_dump_seq_printf(st->seq, " %s", s);
+ }
+}
+
+#ifdef CONFIG_64BIT
+#define ADDR_FORMAT "0x%016lx"
+#else
+#define ADDR_FORMAT "0x%08lx"
+#endif
+static void dump_addr(struct pg_state *st, unsigned long addr)
+{
+ static const char units[] = "KMGTPE";
+ const char *unit = units;
+ unsigned long delta;
+
+ pt_dump_seq_printf(st->seq, ADDR_FORMAT "-" ADDR_FORMAT " ",
+ st->start_address, addr);
+
+ pt_dump_seq_printf(st->seq, " " ADDR_FORMAT " ", st->start_pa);
+ delta = (addr - st->start_address) >> 10;
+
+ while (!(delta & 1023) && unit[1]) {
+ delta >>= 10;
+ unit++;
+ }
+
+ pt_dump_seq_printf(st->seq, "%9lu%c %s", delta, *unit,
+ pg_level[st->level].name);
+}
+
+static void note_prot_wx(struct pg_state *st, unsigned long addr)
+{
+ if (!st->check_wx)
+ return;
+
+ if ((st->current_prot & (_PAGE_WRITE | _PAGE_EXEC)) !=
+ (_PAGE_WRITE | _PAGE_EXEC))
+ return;
+
+ WARN_ONCE(1, "riscv/mm: Found insecure W+X mapping at address %p/%pS\n",
+ (void *)st->start_address, (void *)st->start_address);
+
+ st->wx_pages += (addr - st->start_address) / PAGE_SIZE;
+}
+
+static void note_page(struct ptdump_state *pt_st, unsigned long addr,
+ int level, unsigned long val)
+{
+ struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
+ u64 pa = PFN_PHYS(pte_pfn(__pte(val)));
+ u64 prot = 0;
+
+ if (level >= 0)
+ prot = val & pg_level[level].mask;
+
+ if (st->level == -1) {
+ st->level = level;
+ st->current_prot = prot;
+ st->start_address = addr;
+ st->start_pa = pa;
+ st->last_pa = pa;
+ pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
+ } else if (prot != st->current_prot ||
+ level != st->level || addr >= st->marker[1].start_address) {
+ if (st->current_prot) {
+ note_prot_wx(st, addr);
+ dump_addr(st, addr);
+ dump_prot(st);
+ pt_dump_seq_puts(st->seq, "\n");
+ }
+
+ while (addr >= st->marker[1].start_address) {
+ st->marker++;
+ pt_dump_seq_printf(st->seq, "---[ %s ]---\n",
+ st->marker->name);
+ }
+
+ st->start_address = addr;
+ st->start_pa = pa;
+ st->last_pa = pa;
+ st->current_prot = prot;
+ st->level = level;
+ } else {
+ st->last_pa = pa;
+ }
+}
+
+static void ptdump_walk(struct seq_file *s)
+{
+ struct pg_state st = {
+ .seq = s,
+ .marker = address_markers,
+ .level = -1,
+ .ptdump = {
+ .note_page = note_page,
+ .range = (struct ptdump_range[]) {
+ {KERN_VIRT_START, ULONG_MAX},
+ {0, 0}
+ }
+ }
+ };
+
+ ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
+}
+
+void ptdump_check_wx(void)
+{
+ struct pg_state st = {
+ .seq = NULL,
+ .marker = (struct addr_marker[]) {
+ {0, NULL},
+ {-1, NULL},
+ },
+ .level = -1,
+ .check_wx = true,
+ .ptdump = {
+ .note_page = note_page,
+ .range = (struct ptdump_range[]) {
+ {KERN_VIRT_START, ULONG_MAX},
+ {0, 0}
+ }
+ }
+ };
+
+ ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
+
+ if (st.wx_pages)
+ pr_warn("Checked W+X mappings: failed, %lu W+X pages found\n",
+ st.wx_pages);
+ else
+ pr_info("Checked W+X mappings: passed, no W+X pages found\n");
+}
+
+static int ptdump_show(struct seq_file *m, void *v)
+{
+ ptdump_walk(m);
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(ptdump);
+
+static int ptdump_init(void)
+{
+ unsigned int i, j;
+
+ for (i = 0; i < ARRAY_SIZE(pg_level); i++)
+ for (j = 0; j < ARRAY_SIZE(pte_bits); j++)
+ pg_level[i].mask |= pte_bits[j].mask;
+
+ debugfs_create_file("kernel_page_tables", 0400, NULL, NULL,
+ &ptdump_fops);
+
+ return 0;
+}
+
+device_initcall(ptdump_init);
diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
index f2d4c1bd3429..cc98f9b78fd4 100644
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -181,8 +181,7 @@ int arch_make_page_accessible(struct page *page);
#define virt_addr_valid(kaddr) pfn_valid(virt_to_pfn(kaddr))
-#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | \
- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC
#include <asm-generic/memory_model.h>
#include <asm-generic/getorder.h>
diff --git a/arch/s390/include/asm/qdio.h b/arch/s390/include/asm/qdio.h
index e577f8533009..86a3796e9be8 100644
--- a/arch/s390/include/asm/qdio.h
+++ b/arch/s390/include/asm/qdio.h
@@ -325,7 +325,6 @@ typedef void qdio_handler_t(struct ccw_device *, unsigned int, int,
/**
* struct qdio_initialize - qdio initialization data
- * @cdev: associated ccw device
* @q_format: queue format
* @qdr_ac: feature flags to set
* @adapter_name: name for the adapter
@@ -341,12 +340,11 @@ typedef void qdio_handler_t(struct ccw_device *, unsigned int, int,
* @irq_poll: Data IRQ polling handler (NULL when not supported)
* @scan_threshold: # of in-use buffers that triggers scan on output queue
* @int_parm: interruption parameter
- * @input_sbal_addr_array: address of no_input_qs * 128 pointers
- * @output_sbal_addr_array: address of no_output_qs * 128 pointers
+ * @input_sbal_addr_array: per-queue array, each element points to 128 SBALs
+ * @output_sbal_addr_array: per-queue array, each element points to 128 SBALs
* @output_sbal_state_array: no_output_qs * 128 state info (for CQ or NULL)
*/
struct qdio_initialize {
- struct ccw_device *cdev;
unsigned char q_format;
unsigned char qdr_ac;
unsigned char adapter_name[8];
@@ -362,8 +360,8 @@ struct qdio_initialize {
void (*irq_poll)(struct ccw_device *cdev, unsigned long data);
unsigned int scan_threshold;
unsigned long int_parm;
- struct qdio_buffer **input_sbal_addr_array;
- struct qdio_buffer **output_sbal_addr_array;
+ struct qdio_buffer ***input_sbal_addr_array;
+ struct qdio_buffer ***output_sbal_addr_array;
struct qdio_outbuf_state *output_sbal_state_array;
};
@@ -408,8 +406,10 @@ int qdio_alloc_buffers(struct qdio_buffer **buf, unsigned int count);
void qdio_free_buffers(struct qdio_buffer **buf, unsigned int count);
void qdio_reset_buffers(struct qdio_buffer **buf, unsigned int count);
-extern int qdio_allocate(struct qdio_initialize *);
-extern int qdio_establish(struct qdio_initialize *);
+extern int qdio_allocate(struct ccw_device *cdev, unsigned int no_input_qs,
+ unsigned int no_output_qs);
+extern int qdio_establish(struct ccw_device *cdev,
+ struct qdio_initialize *init_data);
extern int qdio_activate(struct ccw_device *);
extern void qdio_release_aob(struct qaob *);
extern int do_QDIO(struct ccw_device *, unsigned int, int, unsigned int,
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index d3db3d7ed077..def3b60f1fe8 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -55,8 +55,4 @@ config KVM_S390_UCONTROL
If unsure, say N.
-# OK, it's a little counter-intuitive to do this, but it puts it neatly under
-# the virtualization menu.
-source "drivers/vhost/Kconfig"
-
endif # VIRTUALIZATION
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 076090f9e666..4f6c22d72072 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -1202,6 +1202,7 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
scb_s->iprcc = PGM_ADDRESSING;
scb_s->pgmilc = 4;
scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, 4);
+ rc = 1;
}
return rc;
}
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index d56f67745e3e..dedc28be27ab 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -580,7 +580,7 @@ void do_dat_exception(struct pt_regs *regs)
int access;
vm_fault_t fault;
- access = VM_READ | VM_EXEC | VM_WRITE;
+ access = VM_ACCESS_FLAGS;
fault = do_exception(regs, access);
if (unlikely(fault))
do_fault_error(regs, access, fault);
@@ -852,9 +852,7 @@ void do_secure_storage_access(struct pt_regs *regs)
BUG();
break;
case VDSO_FAULT:
- /* fallthrough */
case GMAP_FAULT:
- /* fallthrough */
default:
do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
WARN_ON_ONCE(1);
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 2fbece47ef6f..1a95d8809cc3 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -787,14 +787,18 @@ static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
static inline unsigned long *gmap_table_walk(struct gmap *gmap,
unsigned long gaddr, int level)
{
+ const int asce_type = gmap->asce & _ASCE_TYPE_MASK;
unsigned long *table;
if ((gmap->asce & _ASCE_TYPE_MASK) + 4 < (level * 4))
return NULL;
if (gmap_is_shadow(gmap) && gmap->removed)
return NULL;
- if (gaddr & (-1UL << (31 + ((gmap->asce & _ASCE_TYPE_MASK) >> 2)*11)))
+
+ if (asce_type != _ASCE_TYPE_REGION1 &&
+ gaddr & (-1UL << (31 + (asce_type >> 2) * 11)))
return NULL;
+
table = gmap->table;
switch (gmap->asce & _ASCE_TYPE_MASK) {
case _ASCE_TYPE_REGION1:
@@ -1840,6 +1844,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
goto out_free;
} else if (*table & _REGION_ENTRY_ORIGIN) {
rc = -EAGAIN; /* Race with shadow */
+ goto out_free;
}
crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY);
/* mark as invalid as long as the parent table is not protected */
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index ac44bd76db4b..87b2d024e75a 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -268,20 +268,23 @@ device_initcall(s390_cma_mem_init);
#endif /* CONFIG_CMA */
int arch_add_memory(int nid, u64 start, u64 size,
- struct mhp_restrictions *restrictions)
+ struct mhp_params *params)
{
unsigned long start_pfn = PFN_DOWN(start);
unsigned long size_pages = PFN_DOWN(size);
int rc;
- if (WARN_ON_ONCE(restrictions->altmap))
+ if (WARN_ON_ONCE(params->altmap))
+ return -EINVAL;
+
+ if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot))
return -EINVAL;
rc = vmem_add_mapping(start, size);
if (rc)
return rc;
- rc = __add_pages(nid, start_pfn, size_pages, restrictions);
+ rc = __add_pages(nid, start_pfn, size_pages, params);
if (rc)
vmem_remove_mapping(start, size);
return rc;
diff --git a/arch/sh/include/asm/bitops-op32.h b/arch/sh/include/asm/bitops-op32.h
index 466880362ad1..cfe5465acce7 100644
--- a/arch/sh/include/asm/bitops-op32.h
+++ b/arch/sh/include/asm/bitops-op32.h
@@ -16,11 +16,9 @@
#define BYTE_OFFSET(nr) ((nr) % BITS_PER_BYTE)
#endif
-#define IS_IMMEDIATE(nr) (__builtin_constant_p(nr))
-
static inline void __set_bit(int nr, volatile unsigned long *addr)
{
- if (IS_IMMEDIATE(nr)) {
+ if (__builtin_constant_p(nr)) {
__asm__ __volatile__ (
"bset.b %1, @(%O2,%0) ! __set_bit\n\t"
: "+r" (addr)
@@ -37,7 +35,7 @@ static inline void __set_bit(int nr, volatile unsigned long *addr)
static inline void __clear_bit(int nr, volatile unsigned long *addr)
{
- if (IS_IMMEDIATE(nr)) {
+ if (__builtin_constant_p(nr)) {
__asm__ __volatile__ (
"bclr.b %1, @(%O2,%0) ! __clear_bit\n\t"
: "+r" (addr)
@@ -64,7 +62,7 @@ static inline void __clear_bit(int nr, volatile unsigned long *addr)
*/
static inline void __change_bit(int nr, volatile unsigned long *addr)
{
- if (IS_IMMEDIATE(nr)) {
+ if (__builtin_constant_p(nr)) {
__asm__ __volatile__ (
"bxor.b %1, @(%O2,%0) ! __change_bit\n\t"
: "+r" (addr)
diff --git a/arch/sh/include/asm/page.h b/arch/sh/include/asm/page.h
index 5eef8be3e59f..ea8d68f58e39 100644
--- a/arch/sh/include/asm/page.h
+++ b/arch/sh/include/asm/page.h
@@ -182,9 +182,6 @@ typedef struct page *pgtable_t;
#endif
#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
-#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \
- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
-
#include <asm-generic/memory_model.h>
#include <asm-generic/getorder.h>
diff --git a/arch/sh/include/uapi/asm/setup.h b/arch/sh/include/uapi/asm/setup.h
deleted file mode 100644
index 4bd19f80f9b0..000000000000
--- a/arch/sh/include/uapi/asm/setup.h
+++ /dev/null
@@ -1,2 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#include <asm-generic/setup.h>
diff --git a/arch/sh/include/uapi/asm/types.h b/arch/sh/include/uapi/asm/types.h
deleted file mode 100644
index 68100e108ea6..000000000000
--- a/arch/sh/include/uapi/asm/types.h
+++ /dev/null
@@ -1,2 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#include <asm-generic/types.h>
diff --git a/arch/sh/kernel/syscalls/syscallhdr.sh b/arch/sh/kernel/syscalls/syscallhdr.sh
index 1de0334e577f..4c0519861e97 100644
--- a/arch/sh/kernel/syscalls/syscallhdr.sh
+++ b/arch/sh/kernel/syscalls/syscallhdr.sh
@@ -32,5 +32,5 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
printf "#define __NR_syscalls\t%s\n" "${nxt}"
printf "#endif\n"
printf "\n"
- printf "#endif /* %s */" "${fileguard}"
+ printf "#endif /* %s */\n" "${fileguard}"
) > "$out"
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c
index 13ee4d20e622..5f23d7907597 100644
--- a/arch/sh/mm/fault.c
+++ b/arch/sh/mm/fault.c
@@ -355,7 +355,7 @@ static inline int access_error(int error_code, struct vm_area_struct *vma)
return 1;
/* read, not present: */
- if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
+ if (unlikely(!vma_is_accessible(vma)))
return 1;
return 0;
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index d1b1ff2be17a..b9de2d4fa57e 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -406,14 +406,17 @@ void __init mem_init(void)
#ifdef CONFIG_MEMORY_HOTPLUG
int arch_add_memory(int nid, u64 start, u64 size,
- struct mhp_restrictions *restrictions)
+ struct mhp_params *params)
{
unsigned long start_pfn = PFN_DOWN(start);
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
+ if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot)
+ return -EINVAL;
+
/* We only have ZONE_NORMAL, so this is easy.. */
- ret = __add_pages(nid, start_pfn, nr_pages, restrictions);
+ ret = __add_pages(nid, start_pfn, nr_pages, params);
if (unlikely(ret))
printk("%s: Failed, __add_pages() == %d\n", __func__, ret);
diff --git a/arch/sparc/include/asm/dma-mapping.h b/arch/sparc/include/asm/dma-mapping.h
index ed32845bd2d2..2f051343612e 100644
--- a/arch/sparc/include/asm/dma-mapping.h
+++ b/arch/sparc/include/asm/dma-mapping.h
@@ -2,23 +2,12 @@
#ifndef ___ASM_SPARC_DMA_MAPPING_H
#define ___ASM_SPARC_DMA_MAPPING_H
-#include <asm/cpu_type.h>
-
extern const struct dma_map_ops *dma_ops;
-extern struct bus_type pci_bus_type;
-
static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
{
-#ifdef CONFIG_SPARC_LEON
- if (sparc_cpu_model == sparc_leon)
- return NULL;
-#endif
-#if defined(CONFIG_SPARC32) && defined(CONFIG_PCI)
- if (bus == &pci_bus_type)
- return NULL;
-#endif
- return dma_ops;
+ /* sparc32 uses per-device dma_ops */
+ return IS_ENABLED(CONFIG_SPARC64) ? dma_ops : NULL;
}
#endif
diff --git a/arch/sparc/include/asm/page_32.h b/arch/sparc/include/asm/page_32.h
index b76d59edec8c..478260002836 100644
--- a/arch/sparc/include/asm/page_32.h
+++ b/arch/sparc/include/asm/page_32.h
@@ -133,9 +133,6 @@ extern unsigned long pfn_base;
#define pfn_valid(pfn) (((pfn) >= (pfn_base)) && (((pfn)-(pfn_base)) < max_mapnr))
#define virt_addr_valid(kaddr) ((((unsigned long)(kaddr)-PAGE_OFFSET)>>PAGE_SHIFT) < max_mapnr)
-#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \
- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
-
#include <asm-generic/memory_model.h>
#include <asm-generic/getorder.h>
diff --git a/arch/sparc/include/asm/page_64.h b/arch/sparc/include/asm/page_64.h
index e80f2d5bf62f..254dffd85fb1 100644
--- a/arch/sparc/include/asm/page_64.h
+++ b/arch/sparc/include/asm/page_64.h
@@ -158,9 +158,6 @@ extern unsigned long PAGE_OFFSET;
#endif /* !(__ASSEMBLY__) */
-#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \
- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
-
#include <asm-generic/getorder.h>
#endif /* _SPARC64_PAGE_H */
diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h
index 6d6f44c0cad9..0de659ae0ba4 100644
--- a/arch/sparc/include/asm/pgtable_32.h
+++ b/arch/sparc/include/asm/pgtable_32.h
@@ -223,11 +223,6 @@ static inline int pte_young(pte_t pte)
return pte_val(pte) & SRMMU_REF;
}
-static inline int pte_special(pte_t pte)
-{
- return 0;
-}
-
static inline pte_t pte_wrprotect(pte_t pte)
{
return __pte(pte_val(pte) & ~SRMMU_WRITE);
@@ -258,8 +253,6 @@ static inline pte_t pte_mkyoung(pte_t pte)
return __pte(pte_val(pte) | SRMMU_REF);
}
-#define pte_mkspecial(pte) (pte)
-
#define pfn_pte(pfn, prot) mk_pte(pfn_to_page(pfn), prot)
static inline unsigned long pte_pfn(pte_t pte)
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 65494c3a420e..da527b27cf7d 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -907,11 +907,11 @@ static inline unsigned long pud_pfn(pud_t pud)
(((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)))
/* Find an entry in the third-level page table.. */
-#define pte_index(dir, address) \
- ((pte_t *) __pmd_page(*(dir)) + \
- ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)))
-#define pte_offset_kernel pte_index
-#define pte_offset_map pte_index
+#define pte_index(address) \
+ ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
+#define pte_offset_kernel(dir, address) \
+ ((pte_t *) __pmd_page(*(dir)) + pte_index(address))
+#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
#define pte_unmap(pte) do { } while (0)
/* We cannot include <linux/mm_types.h> at this point yet: */
diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c
index e59461d03b9a..d6874c9b639f 100644
--- a/arch/sparc/kernel/ioport.c
+++ b/arch/sparc/kernel/ioport.c
@@ -373,9 +373,6 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
dma_make_coherent(paddr, PAGE_ALIGN(size));
}
-const struct dma_map_ops *dma_ops;
-EXPORT_SYMBOL(dma_ops);
-
#ifdef CONFIG_PROC_FS
static int sparc_io_proc_show(struct seq_file *m, void *v)
diff --git a/arch/sparc/kernel/of_device_common.c b/arch/sparc/kernel/of_device_common.c
index b32cc5610712..e717a56efc5d 100644
--- a/arch/sparc/kernel/of_device_common.c
+++ b/arch/sparc/kernel/of_device_common.c
@@ -67,6 +67,7 @@ void of_propagate_archdata(struct platform_device *bus)
op->dev.archdata.stc = bus_sd->stc;
op->dev.archdata.host_controller = bus_sd->host_controller;
op->dev.archdata.numa_node = bus_sd->numa_node;
+ op->dev.dma_ops = bus->dev.dma_ops;
if (dp->child)
of_propagate_archdata(op);
diff --git a/arch/sparc/kernel/syscalls/syscallhdr.sh b/arch/sparc/kernel/syscalls/syscallhdr.sh
index 626b5740a9f1..cf50a75cc0bb 100644
--- a/arch/sparc/kernel/syscalls/syscallhdr.sh
+++ b/arch/sparc/kernel/syscalls/syscallhdr.sh
@@ -32,5 +32,5 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
printf "#define __NR_syscalls\t%s\n" "${nxt}"
printf "#endif\n"
printf "\n"
- printf "#endif /* %s */" "${fileguard}"
+ printf "#endif /* %s */\n" "${fileguard}"
) > "$out"
diff --git a/arch/sparc/mm/io-unit.c b/arch/sparc/mm/io-unit.c
index 33a0facd9eb5..289276b99b01 100644
--- a/arch/sparc/mm/io-unit.c
+++ b/arch/sparc/mm/io-unit.c
@@ -38,6 +38,8 @@
#define IOPERM (IOUPTE_CACHE | IOUPTE_WRITE | IOUPTE_VALID)
#define MKIOPTE(phys) __iopte((((phys)>>4) & IOUPTE_PAGE) | IOPERM)
+static const struct dma_map_ops iounit_dma_ops;
+
static void __init iounit_iommu_init(struct platform_device *op)
{
struct iounit_struct *iounit;
@@ -70,6 +72,8 @@ static void __init iounit_iommu_init(struct platform_device *op)
xptend = iounit->page_table + (16 * PAGE_SIZE) / sizeof(iopte_t);
for (; xpt < xptend; xpt++)
sbus_writel(0, xpt);
+
+ op->dev.dma_ops = &iounit_dma_ops;
}
static int __init iounit_init(void)
@@ -288,8 +292,3 @@ static const struct dma_map_ops iounit_dma_ops = {
.map_sg = iounit_map_sg,
.unmap_sg = iounit_unmap_sg,
};
-
-void __init ld_mmu_iounit(void)
-{
- dma_ops = &iounit_dma_ops;
-}
diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c
index 4d3c6991f0ae..b00dde13681b 100644
--- a/arch/sparc/mm/iommu.c
+++ b/arch/sparc/mm/iommu.c
@@ -54,6 +54,9 @@ static pgprot_t dvma_prot; /* Consistent mapping pte flags */
#define IOPERM (IOPTE_CACHE | IOPTE_WRITE | IOPTE_VALID)
#define MKIOPTE(pfn, perm) (((((pfn)<<8) & IOPTE_PAGE) | (perm)) & ~IOPTE_WAZ)
+static const struct dma_map_ops sbus_iommu_dma_gflush_ops;
+static const struct dma_map_ops sbus_iommu_dma_pflush_ops;
+
static void __init sbus_iommu_init(struct platform_device *op)
{
struct iommu_struct *iommu;
@@ -129,6 +132,11 @@ static void __init sbus_iommu_init(struct platform_device *op)
(int)(IOMMU_NPTES*sizeof(iopte_t)), (int)IOMMU_NPTES);
op->dev.archdata.iommu = iommu;
+
+ if (flush_page_for_dma_global)
+ op->dev.dma_ops = &sbus_iommu_dma_gflush_ops;
+ else
+ op->dev.dma_ops = &sbus_iommu_dma_pflush_ops;
}
static int __init iommu_init(void)
@@ -445,13 +453,6 @@ static const struct dma_map_ops sbus_iommu_dma_pflush_ops = {
void __init ld_mmu_iommu(void)
{
- if (flush_page_for_dma_global) {
- /* flush_page_for_dma flushes everything, no matter of what page is it */
- dma_ops = &sbus_iommu_dma_gflush_ops;
- } else {
- dma_ops = &sbus_iommu_dma_pflush_ops;
- }
-
if (viking_mxcc_present || srmmu_modtype == HyperSparc) {
dvma_prot = __pgprot(SRMMU_CACHE | SRMMU_ET_PTE | SRMMU_PRIV);
ioperm_noc = IOPTE_CACHE | IOPTE_WRITE | IOPTE_VALID;
diff --git a/arch/sparc/mm/mm_32.h b/arch/sparc/mm/mm_32.h
index 0d0b06e952a5..ce750a99eea9 100644
--- a/arch/sparc/mm/mm_32.h
+++ b/arch/sparc/mm/mm_32.h
@@ -20,6 +20,3 @@ void __init srmmu_paging_init(void);
/* iommu.c */
void ld_mmu_iommu(void);
-
-/* io-unit.c */
-void ld_mmu_iounit(void);
diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c
index f56c3c9a9793..b7c94de70cca 100644
--- a/arch/sparc/mm/srmmu.c
+++ b/arch/sparc/mm/srmmu.c
@@ -1865,9 +1865,7 @@ void __init load_mmu(void)
&smp_cachetlb_ops;
#endif
- if (sparc_cpu_model == sun4d)
- ld_mmu_iounit();
- else
+ if (sparc_cpu_model != sun4d)
ld_mmu_iommu();
#ifdef CONFIG_SMP
if (sparc_cpu_model == sun4d)
diff --git a/arch/sparc/vdso/vdso32/vclock_gettime.c b/arch/sparc/vdso/vdso32/vclock_gettime.c
index 026abb3b826c..d7f99e6745ea 100644
--- a/arch/sparc/vdso/vdso32/vclock_gettime.c
+++ b/arch/sparc/vdso/vdso32/vclock_gettime.c
@@ -4,10 +4,6 @@
#define BUILD_VDSO32
-#ifndef CONFIG_CC_OPTIMIZE_FOR_SIZE
-#undef CONFIG_OPTIMIZE_INLINING
-#endif
-
#ifdef CONFIG_SPARC64
/*
diff --git a/arch/um/Kconfig b/arch/um/Kconfig
index 0917f8443c28..96ab7026b037 100644
--- a/arch/um/Kconfig
+++ b/arch/um/Kconfig
@@ -62,9 +62,12 @@ config NR_CPUS
source "arch/$(HEADER_ARCH)/um/Kconfig"
+config FORBID_STATIC_LINK
+ bool
+
config STATIC_LINK
bool "Force a static link"
- default n
+ depends on !FORBID_STATIC_LINK
help
This option gives you the ability to force a static link of UML.
Normally, UML is linked as a shared binary. This is inconvenient for
@@ -73,6 +76,9 @@ config STATIC_LINK
Additionally, this option enables using higher memory spaces (up to
2.75G) for UML.
+ NOTE: This option is incompatible with some networking features which
+ depend on features that require being dynamically loaded (like NSS).
+
config LD_SCRIPT_STATIC
bool
default y
@@ -191,6 +197,7 @@ config UML_TIME_TRAVEL_SUPPORT
prompt "Support time-travel mode (e.g. for test execution)"
# inf-cpu mode is incompatible with the benchmarking
depends on !RAID6_PQ_BENCHMARK
+ depends on !SMP
help
Enable this option to support time travel inside the UML instance.
diff --git a/arch/um/configs/i386_defconfig b/arch/um/configs/i386_defconfig
index 73e98bb57bf5..fb51bd206dbe 100644
--- a/arch/um/configs/i386_defconfig
+++ b/arch/um/configs/i386_defconfig
@@ -26,7 +26,7 @@ CONFIG_SLAB=y
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
# CONFIG_BLK_DEV_BSG is not set
-CONFIG_IOSCHED_CFQ=m
+CONFIG_IOSCHED_BFQ=m
CONFIG_SSL=y
CONFIG_NULL_CHAN=y
CONFIG_PORT_CHAN=y
diff --git a/arch/um/configs/x86_64_defconfig b/arch/um/configs/x86_64_defconfig
index 3281d7600225..477b87317424 100644
--- a/arch/um/configs/x86_64_defconfig
+++ b/arch/um/configs/x86_64_defconfig
@@ -24,7 +24,7 @@ CONFIG_SLAB=y
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
# CONFIG_BLK_DEV_BSG is not set
-CONFIG_IOSCHED_CFQ=m
+CONFIG_IOSCHED_BFQ=m
CONFIG_SSL=y
CONFIG_NULL_CHAN=y
CONFIG_PORT_CHAN=y
diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig
index 72d417055782..9160ead56e33 100644
--- a/arch/um/drivers/Kconfig
+++ b/arch/um/drivers/Kconfig
@@ -234,6 +234,7 @@ config UML_NET_DAEMON
config UML_NET_VECTOR
bool "Vector I/O high performance network devices"
depends on UML_NET
+ select FORBID_STATIC_LINK
help
This User-Mode Linux network driver uses multi-message send
and receive functions. The host running the UML guest must have
@@ -245,6 +246,7 @@ config UML_NET_VECTOR
config UML_NET_VDE
bool "VDE transport (obsolete)"
depends on UML_NET
+ select FORBID_STATIC_LINK
help
This User-Mode Linux network transport allows one or more running
UMLs on a single host to communicate with each other and also
@@ -292,6 +294,7 @@ config UML_NET_MCAST
config UML_NET_PCAP
bool "pcap transport (obsolete)"
depends on UML_NET
+ select FORBID_STATIC_LINK
help
The pcap transport makes a pcap packet stream on the host look
like an ethernet device inside UML. This is useful for making
diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c
index 35ebeebfc1a8..1802cf4ef5a5 100644
--- a/arch/um/drivers/net_kern.c
+++ b/arch/um/drivers/net_kern.c
@@ -266,7 +266,6 @@ static void uml_net_get_drvinfo(struct net_device *dev,
struct ethtool_drvinfo *info)
{
strlcpy(info->driver, DRIVER_NAME, sizeof(info->driver));
- strlcpy(info->version, "42", sizeof(info->version));
}
static const struct ethtool_ops uml_net_ethtool_ops = {
@@ -275,17 +274,6 @@ static const struct ethtool_ops uml_net_ethtool_ops = {
.get_ts_info = ethtool_op_get_ts_info,
};
-static void uml_net_user_timer_expire(struct timer_list *t)
-{
-#ifdef undef
- struct uml_net_private *lp = from_timer(lp, t, tl);
- struct connection *conn = &lp->user;
-
- dprintk(KERN_INFO "uml_net_user_timer_expire [%p]\n", conn);
- do_connect(conn);
-#endif
-}
-
void uml_net_setup_etheraddr(struct net_device *dev, char *str)
{
unsigned char *addr = dev->dev_addr;
@@ -456,7 +444,6 @@ static void eth_configure(int n, void *init, char *mac,
.add_address = transport->user->add_address,
.delete_address = transport->user->delete_address });
- timer_setup(&lp->tl, uml_net_user_timer_expire, 0);
spin_lock_init(&lp->lock);
memcpy(lp->mac, dev->dev_addr, sizeof(lp->mac));
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 247f95da057b..eae8c83364f7 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -1592,11 +1592,11 @@ int io_thread(void *arg)
&io_remainder_size,
UBD_REQ_BUFFER_SIZE
);
- if (n < 0) {
- if (n == -EAGAIN) {
+ if (n <= 0) {
+ if (n == -EAGAIN)
ubd_read_poll(-1);
- continue;
- }
+
+ continue;
}
for (count = 0; count < n/sizeof(struct io_thread_req *); count++) {
@@ -1607,7 +1607,9 @@ int io_thread(void *arg)
written = 0;
do {
- res = os_write_file(kernel_fd, ((char *) io_req_buffer) + written, n);
+ res = os_write_file(kernel_fd,
+ ((char *) io_req_buffer) + written,
+ n - written);
if (res >= 0) {
written += res;
}
diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c
index e98304d0219e..8735c468230a 100644
--- a/arch/um/drivers/vector_kern.c
+++ b/arch/um/drivers/vector_kern.c
@@ -46,7 +46,6 @@
#define DRIVER_NAME "uml-vector"
-#define DRIVER_VERSION "01"
struct vector_cmd_line_arg {
struct list_head list;
int unit;
@@ -198,6 +197,9 @@ static int get_transport_options(struct arglist *def)
long parsed;
int result = 0;
+ if (transport == NULL)
+ return -EINVAL;
+
if (vector != NULL) {
if (kstrtoul(vector, 10, &parsed) == 0) {
if (parsed == 0) {
@@ -1378,7 +1380,6 @@ static void vector_net_get_drvinfo(struct net_device *dev,
struct ethtool_drvinfo *info)
{
strlcpy(info->driver, DRIVER_NAME, sizeof(info->driver));
- strlcpy(info->version, DRIVER_VERSION, sizeof(info->version));
}
static int vector_net_load_bpf_flash(struct net_device *dev,
diff --git a/arch/um/drivers/vector_user.c b/arch/um/drivers/vector_user.c
index ddcd917be0af..aa28e9eecb7b 100644
--- a/arch/um/drivers/vector_user.c
+++ b/arch/um/drivers/vector_user.c
@@ -221,8 +221,7 @@ static struct vector_fds *user_init_tap_fds(struct arglist *ifspec)
return result;
tap_cleanup:
printk(UM_KERN_ERR "user_init_tap: init failed, error %d", fd);
- if (result != NULL)
- kfree(result);
+ kfree(result);
return NULL;
}
@@ -266,8 +265,7 @@ static struct vector_fds *user_init_hybrid_fds(struct arglist *ifspec)
return result;
hybrid_cleanup:
printk(UM_KERN_ERR "user_init_hybrid: init failed");
- if (result != NULL)
- kfree(result);
+ kfree(result);
return NULL;
}
@@ -344,10 +342,8 @@ static struct vector_fds *user_init_unix_fds(struct arglist *ifspec, int id)
unix_cleanup:
if (fd >= 0)
os_close_file(fd);
- if (remote_addr != NULL)
- kfree(remote_addr);
- if (result != NULL)
- kfree(result);
+ kfree(remote_addr);
+ kfree(result);
return NULL;
}
@@ -382,8 +378,7 @@ static struct vector_fds *user_init_raw_fds(struct arglist *ifspec)
return result;
raw_cleanup:
printk(UM_KERN_ERR "user_init_raw: init failed, error %d", err);
- if (result != NULL)
- kfree(result);
+ kfree(result);
return NULL;
}
diff --git a/arch/um/drivers/vhost_user.h b/arch/um/drivers/vhost_user.h
index 45ff5ea22fea..6c71b6005177 100644
--- a/arch/um/drivers/vhost_user.h
+++ b/arch/um/drivers/vhost_user.h
@@ -10,9 +10,10 @@
/* Feature bits */
#define VHOST_USER_F_PROTOCOL_FEATURES 30
/* Protocol feature bits */
-#define VHOST_USER_PROTOCOL_F_REPLY_ACK 3
-#define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5
-#define VHOST_USER_PROTOCOL_F_CONFIG 9
+#define VHOST_USER_PROTOCOL_F_REPLY_ACK 3
+#define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5
+#define VHOST_USER_PROTOCOL_F_CONFIG 9
+#define VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS 14
/* Vring state index masks */
#define VHOST_USER_VRING_INDEX_MASK 0xff
#define VHOST_USER_VRING_POLL_MASK BIT(8)
@@ -24,7 +25,8 @@
/* Supported protocol features */
#define VHOST_USER_SUPPORTED_PROTOCOL_F (BIT_ULL(VHOST_USER_PROTOCOL_F_REPLY_ACK) | \
BIT_ULL(VHOST_USER_PROTOCOL_F_SLAVE_REQ) | \
- BIT_ULL(VHOST_USER_PROTOCOL_F_CONFIG))
+ BIT_ULL(VHOST_USER_PROTOCOL_F_CONFIG) | \
+ BIT_ULL(VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS))
enum vhost_user_request {
VHOST_USER_GET_FEATURES = 1,
@@ -52,12 +54,14 @@ enum vhost_user_request {
VHOST_USER_SET_VRING_ENDIAN = 23,
VHOST_USER_GET_CONFIG = 24,
VHOST_USER_SET_CONFIG = 25,
+ VHOST_USER_VRING_KICK = 35,
};
enum vhost_user_slave_request {
VHOST_USER_SLAVE_IOTLB_MSG = 1,
VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2,
VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3,
+ VHOST_USER_SLAVE_VRING_CALL = 4,
};
struct vhost_user_header {
diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c
index 023ced2250ea..be54d368e73d 100644
--- a/arch/um/drivers/virtio_uml.c
+++ b/arch/um/drivers/virtio_uml.c
@@ -26,6 +26,7 @@
#include <linux/virtio.h>
#include <linux/virtio_config.h>
#include <linux/virtio_ring.h>
+#include <linux/time-internal.h>
#include <shared/as-layout.h>
#include <irq_kern.h>
#include <init.h>
@@ -53,6 +54,7 @@ struct virtio_uml_device {
struct virtio_device vdev;
struct platform_device *pdev;
+ spinlock_t sock_lock;
int sock, req_fd;
u64 features;
u64 protocol_features;
@@ -63,6 +65,11 @@ struct virtio_uml_device {
struct virtio_uml_vq_info {
int kick_fd, call_fd;
char name[32];
+#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT
+ struct virtqueue *vq;
+ vq_callback_t *callback;
+ struct time_travel_event defer;
+#endif
};
extern unsigned long long physmem_size, highmem;
@@ -117,10 +124,27 @@ static int vhost_user_recv_header(int fd, struct vhost_user_msg *msg)
static int vhost_user_recv(struct virtio_uml_device *vu_dev,
int fd, struct vhost_user_msg *msg,
- size_t max_payload_size)
+ size_t max_payload_size, bool wait)
{
size_t size;
- int rc = vhost_user_recv_header(fd, msg);
+ int rc;
+
+ /*
+ * In virtio time-travel mode, we're handling all the vhost-user
+ * FDs by polling them whenever appropriate. However, we may get
+ * into a situation where we're sending out an interrupt message
+ * to a device (e.g. a net device) and need to handle a simulation
+ * time message while doing so, e.g. one that tells us to update
+ * our idea of how long we can run without scheduling.
+ *
+ * Thus, we need to not just read() from the given fd, but need
+ * to also handle messages for the simulation time - this function
+ * does that for us while waiting for the given fd to be readable.
+ */
+ if (wait)
+ time_travel_wait_readable(fd);
+
+ rc = vhost_user_recv_header(fd, msg);
if (rc == -ECONNRESET && vu_dev->registered) {
struct virtio_uml_platform_data *pdata;
@@ -142,7 +166,8 @@ static int vhost_user_recv_resp(struct virtio_uml_device *vu_dev,
struct vhost_user_msg *msg,
size_t max_payload_size)
{
- int rc = vhost_user_recv(vu_dev, vu_dev->sock, msg, max_payload_size);
+ int rc = vhost_user_recv(vu_dev, vu_dev->sock, msg,
+ max_payload_size, true);
if (rc)
return rc;
@@ -172,7 +197,8 @@ static int vhost_user_recv_req(struct virtio_uml_device *vu_dev,
struct vhost_user_msg *msg,
size_t max_payload_size)
{
- int rc = vhost_user_recv(vu_dev, vu_dev->req_fd, msg, max_payload_size);
+ int rc = vhost_user_recv(vu_dev, vu_dev->req_fd, msg,
+ max_payload_size, false);
if (rc)
return rc;
@@ -189,6 +215,7 @@ static int vhost_user_send(struct virtio_uml_device *vu_dev,
int *fds, size_t num_fds)
{
size_t size = sizeof(msg->header) + msg->header.size;
+ unsigned long flags;
bool request_ack;
int rc;
@@ -207,24 +234,28 @@ static int vhost_user_send(struct virtio_uml_device *vu_dev,
if (request_ack)
msg->header.flags |= VHOST_USER_FLAG_NEED_REPLY;
+ spin_lock_irqsave(&vu_dev->sock_lock, flags);
rc = full_sendmsg_fds(vu_dev->sock, msg, size, fds, num_fds);
if (rc < 0)
- return rc;
+ goto out;
if (request_ack) {
uint64_t status;
rc = vhost_user_recv_u64(vu_dev, &status);
if (rc)
- return rc;
+ goto out;
if (status) {
vu_err(vu_dev, "slave reports error: %llu\n", status);
- return -EIO;
+ rc = -EIO;
+ goto out;
}
}
- return 0;
+out:
+ spin_unlock_irqrestore(&vu_dev->sock_lock, flags);
+ return rc;
}
static int vhost_user_send_no_payload(struct virtio_uml_device *vu_dev,
@@ -324,6 +355,7 @@ static void vhost_user_reply(struct virtio_uml_device *vu_dev,
static irqreturn_t vu_req_interrupt(int irq, void *data)
{
struct virtio_uml_device *vu_dev = data;
+ struct virtqueue *vq;
int response = 1;
struct {
struct vhost_user_msg msg;
@@ -343,6 +375,15 @@ static irqreturn_t vu_req_interrupt(int irq, void *data)
virtio_config_changed(&vu_dev->vdev);
response = 0;
break;
+ case VHOST_USER_SLAVE_VRING_CALL:
+ virtio_device_for_each_vq((&vu_dev->vdev), vq) {
+ if (vq->index == msg.msg.payload.vring_state.index) {
+ response = 0;
+ vring_interrupt(0 /* ignored */, vq);
+ break;
+ }
+ }
+ break;
case VHOST_USER_SLAVE_IOTLB_MSG:
/* not supported - VIRTIO_F_IOMMU_PLATFORM */
case VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG:
@@ -684,6 +725,17 @@ static bool vu_notify(struct virtqueue *vq)
const uint64_t n = 1;
int rc;
+ time_travel_propagate_time();
+
+ if (info->kick_fd < 0) {
+ struct virtio_uml_device *vu_dev;
+
+ vu_dev = to_virtio_uml_device(vq->vdev);
+
+ return vhost_user_set_vring_state(vu_dev, VHOST_USER_VRING_KICK,
+ vq->index, 0) == 0;
+ }
+
do {
rc = os_write_file(info->kick_fd, &n, sizeof(n));
} while (rc == -EINTR);
@@ -749,10 +801,13 @@ static void vu_del_vq(struct virtqueue *vq)
{
struct virtio_uml_vq_info *info = vq->priv;
- um_free_irq(VIRTIO_IRQ, vq);
+ if (info->call_fd >= 0) {
+ um_free_irq(VIRTIO_IRQ, vq);
+ os_close_file(info->call_fd);
+ }
- os_close_file(info->call_fd);
- os_close_file(info->kick_fd);
+ if (info->kick_fd >= 0)
+ os_close_file(info->kick_fd);
vring_del_virtqueue(vq);
kfree(info);
@@ -782,6 +837,15 @@ static int vu_setup_vq_call_fd(struct virtio_uml_device *vu_dev,
int call_fds[2];
int rc;
+ /* no call FD needed/desired in this case */
+ if (vu_dev->protocol_features &
+ BIT_ULL(VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS) &&
+ vu_dev->protocol_features &
+ BIT_ULL(VHOST_USER_PROTOCOL_F_SLAVE_REQ)) {
+ info->call_fd = -1;
+ return 0;
+ }
+
/* Use a pipe for call fd, since SIGIO is not supported for eventfd */
rc = os_pipe(call_fds, true, true);
if (rc < 0)
@@ -810,6 +874,23 @@ out:
return rc;
}
+#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT
+static void vu_defer_irq_handle(struct time_travel_event *d)
+{
+ struct virtio_uml_vq_info *info;
+
+ info = container_of(d, struct virtio_uml_vq_info, defer);
+ info->callback(info->vq);
+}
+
+static void vu_defer_irq_callback(struct virtqueue *vq)
+{
+ struct virtio_uml_vq_info *info = vq->priv;
+
+ time_travel_add_irq_event(&info->defer);
+}
+#endif
+
static struct virtqueue *vu_setup_vq(struct virtio_device *vdev,
unsigned index, vq_callback_t *callback,
const char *name, bool ctx)
@@ -829,6 +910,19 @@ static struct virtqueue *vu_setup_vq(struct virtio_device *vdev,
snprintf(info->name, sizeof(info->name), "%s.%d-%s", pdev->name,
pdev->id, name);
+#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT
+ /*
+ * When we get an interrupt, we must bounce it through the simulation
+ * calendar (the simtime device), except for the simtime device itself
+ * since that's part of the simulation control.
+ */
+ if (time_travel_mode == TT_MODE_EXTERNAL && callback) {
+ info->callback = callback;
+ callback = vu_defer_irq_callback;
+ time_travel_set_event_fn(&info->defer, vu_defer_irq_handle);
+ }
+#endif
+
vq = vring_create_virtqueue(index, num, PAGE_SIZE, vdev, true, true,
ctx, vu_notify, callback, info->name);
if (!vq) {
@@ -837,11 +931,19 @@ static struct virtqueue *vu_setup_vq(struct virtio_device *vdev,
}
vq->priv = info;
num = virtqueue_get_vring_size(vq);
+#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT
+ info->vq = vq;
+#endif
- rc = os_eventfd(0, 0);
- if (rc < 0)
- goto error_kick;
- info->kick_fd = rc;
+ if (vu_dev->protocol_features &
+ BIT_ULL(VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS)) {
+ info->kick_fd = -1;
+ } else {
+ rc = os_eventfd(0, 0);
+ if (rc < 0)
+ goto error_kick;
+ info->kick_fd = rc;
+ }
rc = vu_setup_vq_call_fd(vu_dev, vq);
if (rc)
@@ -866,10 +968,13 @@ static struct virtqueue *vu_setup_vq(struct virtio_device *vdev,
return vq;
error_setup:
- um_free_irq(VIRTIO_IRQ, vq);
- os_close_file(info->call_fd);
+ if (info->call_fd >= 0) {
+ um_free_irq(VIRTIO_IRQ, vq);
+ os_close_file(info->call_fd);
+ }
error_call:
- os_close_file(info->kick_fd);
+ if (info->kick_fd >= 0)
+ os_close_file(info->kick_fd);
error_kick:
vring_del_virtqueue(vq);
error_create:
@@ -908,10 +1013,12 @@ static int vu_find_vqs(struct virtio_device *vdev, unsigned nvqs,
list_for_each_entry(vq, &vdev->vqs, list) {
struct virtio_uml_vq_info *info = vq->priv;
- rc = vhost_user_set_vring_kick(vu_dev, vq->index,
- info->kick_fd);
- if (rc)
- goto error_setup;
+ if (info->kick_fd >= 0) {
+ rc = vhost_user_set_vring_kick(vu_dev, vq->index,
+ info->kick_fd);
+ if (rc)
+ goto error_setup;
+ }
rc = vhost_user_set_vring_enable(vu_dev, vq->index, true);
if (rc)
@@ -1008,6 +1115,8 @@ static int virtio_uml_probe(struct platform_device *pdev)
return rc;
vu_dev->sock = rc;
+ spin_lock_init(&vu_dev->sock_lock);
+
rc = vhost_user_init(vu_dev);
if (rc)
goto error_init;
diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
index db7d9d4e30d8..8d435f8a6dec 100644
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -3,7 +3,6 @@ generic-y += bpf_perf_event.h
generic-y += bug.h
generic-y += compat.h
generic-y += current.h
-generic-y += delay.h
generic-y += device.h
generic-y += emergency-restart.h
generic-y += exec.h
diff --git a/arch/um/include/asm/delay.h b/arch/um/include/asm/delay.h
new file mode 100644
index 000000000000..56fc2b8f2dd0
--- /dev/null
+++ b/arch/um/include/asm/delay.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __UM_DELAY_H
+#define __UM_DELAY_H
+#include <asm-generic/delay.h>
+#include <linux/time-internal.h>
+
+static inline void um_ndelay(unsigned long nsecs)
+{
+ if (time_travel_mode == TT_MODE_INFCPU ||
+ time_travel_mode == TT_MODE_EXTERNAL) {
+ time_travel_ndelay(nsecs);
+ return;
+ }
+ ndelay(nsecs);
+}
+#undef ndelay
+#define ndelay um_ndelay
+
+static inline void um_udelay(unsigned long usecs)
+{
+ if (time_travel_mode == TT_MODE_INFCPU ||
+ time_travel_mode == TT_MODE_EXTERNAL) {
+ time_travel_ndelay(1000 * usecs);
+ return;
+ }
+ udelay(usecs);
+}
+#undef udelay
+#define udelay um_udelay
+#endif /* __UM_DELAY_H */
diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h
index 2daa58df2190..b5ddf5d98bd5 100644
--- a/arch/um/include/asm/pgtable.h
+++ b/arch/um/include/asm/pgtable.h
@@ -167,11 +167,6 @@ static inline int pte_newprot(pte_t pte)
return(pte_present(pte) && (pte_get_bits(pte, _PAGE_NEWPROT)));
}
-static inline int pte_special(pte_t pte)
-{
- return 0;
-}
-
/*
* =================================
* Flags setting section.
@@ -247,11 +242,6 @@ static inline pte_t pte_mknewpage(pte_t pte)
return(pte);
}
-static inline pte_t pte_mkspecial(pte_t pte)
-{
- return(pte);
-}
-
static inline void set_pte(pte_t *pteptr, pte_t pteval)
{
pte_copy(*pteptr, pteval);
diff --git a/arch/um/include/linux/time-internal.h b/arch/um/include/linux/time-internal.h
new file mode 100644
index 000000000000..f3b03d39a854
--- /dev/null
+++ b/arch/um/include/linux/time-internal.h
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
+ * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ */
+
+#ifndef __TIMER_INTERNAL_H__
+#define __TIMER_INTERNAL_H__
+#include <linux/list.h>
+
+#define TIMER_MULTIPLIER 256
+#define TIMER_MIN_DELTA 500
+
+enum time_travel_mode {
+ TT_MODE_OFF,
+ TT_MODE_BASIC,
+ TT_MODE_INFCPU,
+ TT_MODE_EXTERNAL,
+};
+
+#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT
+struct time_travel_event {
+ unsigned long long time;
+ void (*fn)(struct time_travel_event *d);
+ struct list_head list;
+ bool pending, onstack;
+};
+
+extern enum time_travel_mode time_travel_mode;
+
+void time_travel_sleep(unsigned long long duration);
+
+static inline void
+time_travel_set_event_fn(struct time_travel_event *e,
+ void (*fn)(struct time_travel_event *d))
+{
+ e->fn = fn;
+}
+
+void __time_travel_propagate_time(void);
+
+static inline void time_travel_propagate_time(void)
+{
+ if (time_travel_mode == TT_MODE_EXTERNAL)
+ __time_travel_propagate_time();
+}
+
+void __time_travel_wait_readable(int fd);
+
+static inline void time_travel_wait_readable(int fd)
+{
+ if (time_travel_mode == TT_MODE_EXTERNAL)
+ __time_travel_wait_readable(fd);
+}
+
+void time_travel_add_irq_event(struct time_travel_event *e);
+#else
+struct time_travel_event {
+};
+
+#define time_travel_mode TT_MODE_OFF
+
+static inline void time_travel_sleep(unsigned long long duration)
+{
+}
+
+/* this is a macro so the event/function need not exist */
+#define time_travel_set_event_fn(e, fn) do {} while (0)
+
+static inline void time_travel_propagate_time(void)
+{
+}
+
+static inline void time_travel_wait_readable(int fd)
+{
+}
+#endif /* CONFIG_UML_TIME_TRAVEL_SUPPORT */
+
+/*
+ * Without CONFIG_UML_TIME_TRAVEL_SUPPORT this is a linker error if used,
+ * which is intentional since we really shouldn't link it in that case.
+ */
+void time_travel_ndelay(unsigned long nsec);
+#endif /* __TIMER_INTERNAL_H__ */
diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h
index 0f30204b6afa..f467d28fc0b4 100644
--- a/arch/um/include/shared/os.h
+++ b/arch/um/include/shared/os.h
@@ -181,6 +181,7 @@ extern int os_falloc_punch(int fd, unsigned long long offset, int count);
extern int os_eventfd(unsigned int initval, int flags);
extern int os_sendmsg_fds(int fd, const void *buf, unsigned int len,
const int *fds, unsigned int fds_num);
+int os_poll(unsigned int n, const int *fds);
/* start_up.c */
extern void os_early_checks(void);
diff --git a/arch/um/include/shared/timer-internal.h b/arch/um/include/shared/timer-internal.h
deleted file mode 100644
index 2d2d13c9b46f..000000000000
--- a/arch/um/include/shared/timer-internal.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2012 - 2014 Cisco Systems
- * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- */
-
-#ifndef __TIMER_INTERNAL_H__
-#define __TIMER_INTERNAL_H__
-
-#define TIMER_MULTIPLIER 256
-#define TIMER_MIN_DELTA 500
-
-enum time_travel_mode {
- TT_MODE_OFF,
- TT_MODE_BASIC,
- TT_MODE_INFCPU,
-};
-
-enum time_travel_timer_mode {
- TT_TMR_DISABLED,
- TT_TMR_ONESHOT,
- TT_TMR_PERIODIC,
-};
-
-#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT
-extern enum time_travel_mode time_travel_mode;
-extern unsigned long long time_travel_time;
-extern enum time_travel_timer_mode time_travel_timer_mode;
-extern unsigned long long time_travel_timer_expiry;
-extern unsigned long long time_travel_timer_interval;
-
-static inline void time_travel_set_time(unsigned long long ns)
-{
- time_travel_time = ns;
-}
-
-static inline void time_travel_set_timer_mode(enum time_travel_timer_mode mode)
-{
- time_travel_timer_mode = mode;
-}
-
-static inline void time_travel_set_timer_expiry(unsigned long long expiry)
-{
- time_travel_timer_expiry = expiry;
-}
-
-static inline void time_travel_set_timer_interval(unsigned long long interval)
-{
- time_travel_timer_interval = interval;
-}
-#else
-#define time_travel_mode TT_MODE_OFF
-#define time_travel_time 0
-#define time_travel_timer_expiry 0
-#define time_travel_timer_interval 0
-
-static inline void time_travel_set_time(unsigned long long ns)
-{
-}
-
-static inline void time_travel_set_timer_mode(enum time_travel_timer_mode mode)
-{
-}
-
-static inline void time_travel_set_timer_expiry(unsigned long long expiry)
-{
-}
-
-static inline void time_travel_set_timer_interval(unsigned long long interval)
-{
-}
-
-#define time_travel_timer_mode TT_TMR_DISABLED
-#endif
-
-#endif
diff --git a/arch/um/kernel/kmsg_dump.c b/arch/um/kernel/kmsg_dump.c
index 98bdf69e4c2e..e4abac6c9727 100644
--- a/arch/um/kernel/kmsg_dump.c
+++ b/arch/um/kernel/kmsg_dump.c
@@ -9,20 +9,19 @@ static void kmsg_dumper_stdout(struct kmsg_dumper *dumper,
enum kmsg_dump_reason reason)
{
static char line[1024];
-
+ struct console *con;
size_t len = 0;
- bool con_available = false;
/* only dump kmsg when no console is available */
if (!console_trylock())
return;
- if (console_drivers != NULL)
- con_available = true;
+ for_each_console(con)
+ break;
console_unlock();
- if (con_available == true)
+ if (con)
return;
printf("kmsg_dump:\n");
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index 56a094182bf5..cbe33af2a880 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -32,7 +32,7 @@
#include <kern_util.h>
#include <os.h>
#include <skas.h>
-#include <timer-internal.h>
+#include <linux/time-internal.h>
/*
* This is a per-cpu array. A processor only modifies its entry and it only
@@ -203,43 +203,6 @@ void initial_thread_cb(void (*proc)(void *), void *arg)
kmalloc_ok = save_kmalloc_ok;
}
-static void time_travel_sleep(unsigned long long duration)
-{
- unsigned long long next = time_travel_time + duration;
-
- if (time_travel_mode != TT_MODE_INFCPU)
- os_timer_disable();
-
- while (time_travel_timer_mode == TT_TMR_PERIODIC &&
- time_travel_timer_expiry < time_travel_time)
- time_travel_set_timer_expiry(time_travel_timer_expiry +
- time_travel_timer_interval);
-
- if (time_travel_timer_mode != TT_TMR_DISABLED &&
- time_travel_timer_expiry < next) {
- if (time_travel_timer_mode == TT_TMR_ONESHOT)
- time_travel_set_timer_mode(TT_TMR_DISABLED);
- /*
- * In basic mode, time_travel_time will be adjusted in
- * the timer IRQ handler so it works even when the signal
- * comes from the OS timer, see there.
- */
- if (time_travel_mode != TT_MODE_BASIC)
- time_travel_set_time(time_travel_timer_expiry);
-
- deliver_alarm();
- } else {
- time_travel_set_time(next);
- }
-
- if (time_travel_mode != TT_MODE_INFCPU) {
- if (time_travel_timer_mode == TT_TMR_PERIODIC)
- os_timer_set_interval(time_travel_timer_interval);
- else if (time_travel_timer_mode == TT_TMR_ONESHOT)
- os_timer_one_shot(time_travel_timer_expiry - next);
- }
-}
-
static void um_idle_sleep(void)
{
unsigned long long duration = UM_NSEC_PER_SEC;
diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c
index 40d90dddf3f1..0a12d5a09217 100644
--- a/arch/um/kernel/skas/syscall.c
+++ b/arch/um/kernel/skas/syscall.c
@@ -10,7 +10,7 @@
#include <sysdep/ptrace.h>
#include <sysdep/ptrace_user.h>
#include <sysdep/syscalls.h>
-#include <shared/timer-internal.h>
+#include <linux/time-internal.h>
void handle_syscall(struct uml_pt_regs *r)
{
@@ -24,7 +24,8 @@ void handle_syscall(struct uml_pt_regs *r)
* went to sleep, even if said userspace interacts with the kernel in
* various ways.
*/
- if (time_travel_mode == TT_MODE_INFCPU)
+ if (time_travel_mode == TT_MODE_INFCPU ||
+ time_travel_mode == TT_MODE_EXTERNAL)
schedule();
/* Initialize the syscall number and default return value. */
diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c
index 94ea87bd231c..25eaa6a0c658 100644
--- a/arch/um/kernel/time.c
+++ b/arch/um/kernel/time.c
@@ -4,6 +4,7 @@
* Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de)
* Copyright (C) 2012-2014 Cisco Systems
* Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Copyright (C) 2019 Intel Corporation
*/
#include <linux/clockchips.h>
@@ -18,21 +19,484 @@
#include <asm/param.h>
#include <kern_util.h>
#include <os.h>
-#include <timer-internal.h>
+#include <linux/time-internal.h>
+#include <linux/um_timetravel.h>
#include <shared/init.h>
#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT
enum time_travel_mode time_travel_mode;
-unsigned long long time_travel_time;
-enum time_travel_timer_mode time_travel_timer_mode;
-unsigned long long time_travel_timer_expiry;
-unsigned long long time_travel_timer_interval;
+EXPORT_SYMBOL_GPL(time_travel_mode);
static bool time_travel_start_set;
static unsigned long long time_travel_start;
-#else
+static unsigned long long time_travel_time;
+static LIST_HEAD(time_travel_events);
+static unsigned long long time_travel_timer_interval;
+static unsigned long long time_travel_next_event;
+static struct time_travel_event time_travel_timer_event;
+static int time_travel_ext_fd = -1;
+static unsigned int time_travel_ext_waiting;
+static bool time_travel_ext_prev_request_valid;
+static unsigned long long time_travel_ext_prev_request;
+static bool time_travel_ext_free_until_valid;
+static unsigned long long time_travel_ext_free_until;
+
+static void time_travel_set_time(unsigned long long ns)
+{
+ if (unlikely(ns < time_travel_time))
+ panic("time-travel: time goes backwards %lld -> %lld\n",
+ time_travel_time, ns);
+ time_travel_time = ns;
+}
+
+enum time_travel_message_handling {
+ TTMH_IDLE,
+ TTMH_POLL,
+ TTMH_READ,
+};
+
+static void time_travel_handle_message(struct um_timetravel_msg *msg,
+ enum time_travel_message_handling mode)
+{
+ struct um_timetravel_msg resp = {
+ .op = UM_TIMETRAVEL_ACK,
+ };
+ int ret;
+
+ /*
+ * Poll outside the locked section (if we're not called to only read
+ * the response) so we can get interrupts for e.g. virtio while we're
+ * here, but then we need to lock to not get interrupted between the
+ * read of the message and write of the ACK.
+ */
+ if (mode != TTMH_READ) {
+ while (os_poll(1, &time_travel_ext_fd) != 0) {
+ if (mode == TTMH_IDLE) {
+ BUG_ON(!irqs_disabled());
+ local_irq_enable();
+ local_irq_disable();
+ }
+ }
+ }
+
+ ret = os_read_file(time_travel_ext_fd, msg, sizeof(*msg));
+
+ if (ret == 0)
+ panic("time-travel external link is broken\n");
+ if (ret != sizeof(*msg))
+ panic("invalid time-travel message - %d bytes\n", ret);
+
+ switch (msg->op) {
+ default:
+ WARN_ONCE(1, "time-travel: unexpected message %lld\n",
+ (unsigned long long)msg->op);
+ break;
+ case UM_TIMETRAVEL_ACK:
+ return;
+ case UM_TIMETRAVEL_RUN:
+ time_travel_set_time(msg->time);
+ break;
+ case UM_TIMETRAVEL_FREE_UNTIL:
+ time_travel_ext_free_until_valid = true;
+ time_travel_ext_free_until = msg->time;
+ break;
+ }
+
+ os_write_file(time_travel_ext_fd, &resp, sizeof(resp));
+}
+
+static u64 time_travel_ext_req(u32 op, u64 time)
+{
+ static int seq;
+ int mseq = ++seq;
+ struct um_timetravel_msg msg = {
+ .op = op,
+ .time = time,
+ .seq = mseq,
+ };
+ unsigned long flags;
+
+ /*
+ * We need to save interrupts here and only restore when we
+ * got the ACK - otherwise we can get interrupted and send
+ * another request while we're still waiting for an ACK, but
+ * the peer doesn't know we got interrupted and will send
+ * the ACKs in the same order as the message, but we'd need
+ * to see them in the opposite order ...
+ *
+ * This wouldn't matter *too* much, but some ACKs carry the
+ * current time (for UM_TIMETRAVEL_GET) and getting another
+ * ACK without a time would confuse us a lot!
+ *
+ * The sequence number assignment that happens here lets us
+ * debug such message handling issues more easily.
+ */
+ local_irq_save(flags);
+ os_write_file(time_travel_ext_fd, &msg, sizeof(msg));
+
+ while (msg.op != UM_TIMETRAVEL_ACK)
+ time_travel_handle_message(&msg, TTMH_READ);
+
+ if (msg.seq != mseq)
+ panic("time-travel: ACK message has different seqno! op=%d, seq=%d != %d time=%lld\n",
+ msg.op, msg.seq, mseq, msg.time);
+
+ if (op == UM_TIMETRAVEL_GET)
+ time_travel_set_time(msg.time);
+ local_irq_restore(flags);
+
+ return msg.time;
+}
+
+void __time_travel_wait_readable(int fd)
+{
+ int fds[2] = { fd, time_travel_ext_fd };
+ int ret;
+
+ if (time_travel_mode != TT_MODE_EXTERNAL)
+ return;
+
+ while ((ret = os_poll(2, fds))) {
+ struct um_timetravel_msg msg;
+
+ if (ret == 1)
+ time_travel_handle_message(&msg, TTMH_READ);
+ }
+}
+EXPORT_SYMBOL_GPL(__time_travel_wait_readable);
+
+static void time_travel_ext_update_request(unsigned long long time)
+{
+ if (time_travel_mode != TT_MODE_EXTERNAL)
+ return;
+
+ /* asked for exactly this time previously */
+ if (time_travel_ext_prev_request_valid &&
+ time == time_travel_ext_prev_request)
+ return;
+
+ time_travel_ext_prev_request = time;
+ time_travel_ext_prev_request_valid = true;
+ time_travel_ext_req(UM_TIMETRAVEL_REQUEST, time);
+}
+
+void __time_travel_propagate_time(void)
+{
+ time_travel_ext_req(UM_TIMETRAVEL_UPDATE, time_travel_time);
+}
+EXPORT_SYMBOL_GPL(__time_travel_propagate_time);
+
+/* returns true if we must do a wait to the simtime device */
+static bool time_travel_ext_request(unsigned long long time)
+{
+ /*
+ * If we received an external sync point ("free until") then we
+ * don't have to request/wait for anything until then, unless
+ * we're already waiting.
+ */
+ if (!time_travel_ext_waiting && time_travel_ext_free_until_valid &&
+ time < time_travel_ext_free_until)
+ return false;
+
+ time_travel_ext_update_request(time);
+ return true;
+}
+
+static void time_travel_ext_wait(bool idle)
+{
+ struct um_timetravel_msg msg = {
+ .op = UM_TIMETRAVEL_ACK,
+ };
+
+ time_travel_ext_prev_request_valid = false;
+ time_travel_ext_waiting++;
+
+ time_travel_ext_req(UM_TIMETRAVEL_WAIT, -1);
+
+ /*
+ * Here we are deep in the idle loop, so we have to break out of the
+ * kernel abstraction in a sense and implement this in terms of the
+ * UML system waiting on the VQ interrupt while sleeping, when we get
+ * the signal it'll call time_travel_ext_vq_notify_done() completing the
+ * call.
+ */
+ while (msg.op != UM_TIMETRAVEL_RUN)
+ time_travel_handle_message(&msg, idle ? TTMH_IDLE : TTMH_POLL);
+
+ time_travel_ext_waiting--;
+
+ /* we might request more stuff while polling - reset when we run */
+ time_travel_ext_prev_request_valid = false;
+}
+
+static void time_travel_ext_get_time(void)
+{
+ time_travel_ext_req(UM_TIMETRAVEL_GET, -1);
+}
+
+static void __time_travel_update_time(unsigned long long ns, bool idle)
+{
+ if (time_travel_mode == TT_MODE_EXTERNAL && time_travel_ext_request(ns))
+ time_travel_ext_wait(idle);
+ else
+ time_travel_set_time(ns);
+}
+
+static struct time_travel_event *time_travel_first_event(void)
+{
+ return list_first_entry_or_null(&time_travel_events,
+ struct time_travel_event,
+ list);
+}
+
+static void __time_travel_add_event(struct time_travel_event *e,
+ unsigned long long time)
+{
+ struct time_travel_event *tmp;
+ bool inserted = false;
+
+ if (WARN(time_travel_mode == TT_MODE_BASIC &&
+ e != &time_travel_timer_event,
+ "only timer events can be handled in basic mode"))
+ return;
+
+ if (e->pending)
+ return;
+
+ e->pending = true;
+ e->time = time;
+
+ list_for_each_entry(tmp, &time_travel_events, list) {
+ /*
+ * Add the new entry before one with higher time,
+ * or if they're equal and both on stack, because
+ * in that case we need to unwind the stack in the
+ * right order, and the later event (timer sleep
+ * or such) must be dequeued first.
+ */
+ if ((tmp->time > e->time) ||
+ (tmp->time == e->time && tmp->onstack && e->onstack)) {
+ list_add_tail(&e->list, &tmp->list);
+ inserted = true;
+ break;
+ }
+ }
+
+ if (!inserted)
+ list_add_tail(&e->list, &time_travel_events);
+
+ tmp = time_travel_first_event();
+ time_travel_ext_update_request(tmp->time);
+ time_travel_next_event = tmp->time;
+}
+
+static void time_travel_add_event(struct time_travel_event *e,
+ unsigned long long time)
+{
+ if (WARN_ON(!e->fn))
+ return;
+
+ __time_travel_add_event(e, time);
+}
+
+void time_travel_periodic_timer(struct time_travel_event *e)
+{
+ time_travel_add_event(&time_travel_timer_event,
+ time_travel_time + time_travel_timer_interval);
+ deliver_alarm();
+}
+
+static void time_travel_deliver_event(struct time_travel_event *e)
+{
+ if (e == &time_travel_timer_event) {
+ /*
+ * deliver_alarm() does the irq_enter/irq_exit
+ * by itself, so must handle it specially here
+ */
+ e->fn(e);
+ } else {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ irq_enter();
+ e->fn(e);
+ irq_exit();
+ local_irq_restore(flags);
+ }
+}
+
+static bool time_travel_del_event(struct time_travel_event *e)
+{
+ if (!e->pending)
+ return false;
+ list_del(&e->list);
+ e->pending = false;
+ return true;
+}
+
+static void time_travel_update_time(unsigned long long next, bool idle)
+{
+ struct time_travel_event ne = {
+ .onstack = true,
+ };
+ struct time_travel_event *e;
+ bool finished = idle;
+
+ /* add it without a handler - we deal with that specifically below */
+ __time_travel_add_event(&ne, next);
+
+ do {
+ e = time_travel_first_event();
+
+ BUG_ON(!e);
+ __time_travel_update_time(e->time, idle);
+
+ /* new events may have been inserted while we were waiting */
+ if (e == time_travel_first_event()) {
+ BUG_ON(!time_travel_del_event(e));
+ BUG_ON(time_travel_time != e->time);
+
+ if (e == &ne) {
+ finished = true;
+ } else {
+ if (e->onstack)
+ panic("On-stack event dequeued outside of the stack! time=%lld, event time=%lld, event=%pS\n",
+ time_travel_time, e->time, e);
+ time_travel_deliver_event(e);
+ }
+ }
+
+ e = time_travel_first_event();
+ if (e)
+ time_travel_ext_update_request(e->time);
+ } while (ne.pending && !finished);
+
+ time_travel_del_event(&ne);
+}
+
+void time_travel_ndelay(unsigned long nsec)
+{
+ time_travel_update_time(time_travel_time + nsec, false);
+}
+EXPORT_SYMBOL(time_travel_ndelay);
+
+void time_travel_add_irq_event(struct time_travel_event *e)
+{
+ BUG_ON(time_travel_mode != TT_MODE_EXTERNAL);
+
+ time_travel_ext_get_time();
+ /*
+ * We could model interrupt latency here, for now just
+ * don't have any latency at all and request the exact
+ * same time (again) to run the interrupt...
+ */
+ time_travel_add_event(e, time_travel_time);
+}
+EXPORT_SYMBOL_GPL(time_travel_add_irq_event);
+
+static void time_travel_oneshot_timer(struct time_travel_event *e)
+{
+ deliver_alarm();
+}
+
+void time_travel_sleep(unsigned long long duration)
+{
+ unsigned long long next = time_travel_time + duration;
+
+ if (time_travel_mode == TT_MODE_BASIC)
+ os_timer_disable();
+
+ time_travel_update_time(next, true);
+
+ if (time_travel_mode == TT_MODE_BASIC &&
+ time_travel_timer_event.pending) {
+ if (time_travel_timer_event.fn == time_travel_periodic_timer) {
+ /*
+ * This is somewhat wrong - we should get the first
+ * one sooner like the os_timer_one_shot() below...
+ */
+ os_timer_set_interval(time_travel_timer_interval);
+ } else {
+ os_timer_one_shot(time_travel_timer_event.time - next);
+ }
+ }
+}
+
+static void time_travel_handle_real_alarm(void)
+{
+ time_travel_set_time(time_travel_next_event);
+
+ time_travel_del_event(&time_travel_timer_event);
+
+ if (time_travel_timer_event.fn == time_travel_periodic_timer)
+ time_travel_add_event(&time_travel_timer_event,
+ time_travel_time +
+ time_travel_timer_interval);
+}
+
+static void time_travel_set_interval(unsigned long long interval)
+{
+ time_travel_timer_interval = interval;
+}
+
+static int time_travel_connect_external(const char *socket)
+{
+ const char *sep;
+ unsigned long long id = (unsigned long long)-1;
+ int rc;
+
+ if ((sep = strchr(socket, ':'))) {
+ char buf[25] = {};
+ if (sep - socket > sizeof(buf) - 1)
+ goto invalid_number;
+
+ memcpy(buf, socket, sep - socket);
+ if (kstrtoull(buf, 0, &id)) {
+invalid_number:
+ panic("time-travel: invalid external ID in string '%s'\n",
+ socket);
+ return -EINVAL;
+ }
+
+ socket = sep + 1;
+ }
+
+ rc = os_connect_socket(socket);
+ if (rc < 0) {
+ panic("time-travel: failed to connect to external socket %s\n",
+ socket);
+ return rc;
+ }
+
+ time_travel_ext_fd = rc;
+
+ time_travel_ext_req(UM_TIMETRAVEL_START, id);
+
+ return 1;
+}
+#else /* CONFIG_UML_TIME_TRAVEL_SUPPORT */
#define time_travel_start_set 0
#define time_travel_start 0
+#define time_travel_time 0
+
+static inline void time_travel_update_time(unsigned long long ns, bool retearly)
+{
+}
+
+static inline void time_travel_handle_real_alarm(void)
+{
+}
+
+static void time_travel_set_interval(unsigned long long interval)
+{
+}
+
+/* fail link if this actually gets used */
+extern u64 time_travel_ext_req(u32 op, u64 time);
+
+/* these are empty macros so the struct/fn need not exist */
+#define time_travel_add_event(e, time) do { } while (0)
+#define time_travel_del_event(e) do { } while (0)
#endif
void timer_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
@@ -48,7 +512,7 @@ void timer_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
* never get any real signals from the OS.
*/
if (time_travel_mode == TT_MODE_BASIC)
- time_travel_set_time(time_travel_timer_expiry);
+ time_travel_handle_real_alarm();
local_irq_save(flags);
do_IRQ(TIMER_IRQ, regs);
@@ -58,9 +522,10 @@ void timer_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
static int itimer_shutdown(struct clock_event_device *evt)
{
if (time_travel_mode != TT_MODE_OFF)
- time_travel_set_timer_mode(TT_TMR_DISABLED);
+ time_travel_del_event(&time_travel_timer_event);
- if (time_travel_mode != TT_MODE_INFCPU)
+ if (time_travel_mode != TT_MODE_INFCPU &&
+ time_travel_mode != TT_MODE_EXTERNAL)
os_timer_disable();
return 0;
@@ -71,12 +536,16 @@ static int itimer_set_periodic(struct clock_event_device *evt)
unsigned long long interval = NSEC_PER_SEC / HZ;
if (time_travel_mode != TT_MODE_OFF) {
- time_travel_set_timer_mode(TT_TMR_PERIODIC);
- time_travel_set_timer_expiry(time_travel_time + interval);
- time_travel_set_timer_interval(interval);
+ time_travel_del_event(&time_travel_timer_event);
+ time_travel_set_event_fn(&time_travel_timer_event,
+ time_travel_periodic_timer);
+ time_travel_set_interval(interval);
+ time_travel_add_event(&time_travel_timer_event,
+ time_travel_time + interval);
}
- if (time_travel_mode != TT_MODE_INFCPU)
+ if (time_travel_mode != TT_MODE_INFCPU &&
+ time_travel_mode != TT_MODE_EXTERNAL)
os_timer_set_interval(interval);
return 0;
@@ -88,11 +557,15 @@ static int itimer_next_event(unsigned long delta,
delta += 1;
if (time_travel_mode != TT_MODE_OFF) {
- time_travel_set_timer_mode(TT_TMR_ONESHOT);
- time_travel_set_timer_expiry(time_travel_time + delta);
+ time_travel_del_event(&time_travel_timer_event);
+ time_travel_set_event_fn(&time_travel_timer_event,
+ time_travel_oneshot_timer);
+ time_travel_add_event(&time_travel_timer_event,
+ time_travel_time + delta);
}
- if (time_travel_mode != TT_MODE_INFCPU)
+ if (time_travel_mode != TT_MODE_INFCPU &&
+ time_travel_mode != TT_MODE_EXTERNAL)
return os_timer_one_shot(delta);
return 0;
@@ -143,8 +616,17 @@ static u64 timer_read(struct clocksource *cs)
* stuck in loops that expect time to move more than the
* exact requested sleep amount, e.g. python's socket server,
* see https://bugs.python.org/issue37026.
+ *
+ * However, don't do that when we're in interrupt or such as
+ * then we might recurse into our own processing, and get to
+ * even more waiting, and that's not good - it messes up the
+ * "what do I do next" and onstack event we use to know when
+ * to return from time_travel_update_time().
*/
- time_travel_set_time(time_travel_time + TIMER_MULTIPLIER);
+ if (!irqs_disabled() && !in_interrupt() && !in_softirq())
+ time_travel_update_time(time_travel_time +
+ TIMER_MULTIPLIER,
+ false);
return time_travel_time / TIMER_MULTIPLIER;
}
@@ -188,6 +670,8 @@ void read_persistent_clock64(struct timespec64 *ts)
if (time_travel_start_set)
nsecs = time_travel_start + time_travel_time;
+ else if (time_travel_mode == TT_MODE_EXTERNAL)
+ nsecs = time_travel_ext_req(UM_TIMETRAVEL_GET_TOD, -1);
else
nsecs = os_persistent_clock_emulation();
@@ -204,7 +688,8 @@ void __init time_init(void)
#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT
unsigned long calibrate_delay_is_known(void)
{
- if (time_travel_mode == TT_MODE_INFCPU)
+ if (time_travel_mode == TT_MODE_INFCPU ||
+ time_travel_mode == TT_MODE_EXTERNAL)
return 1;
return 0;
}
@@ -218,6 +703,13 @@ int setup_time_travel(char *str)
return 1;
}
+ if (strncmp(str, "=ext:", 5) == 0) {
+ time_travel_mode = TT_MODE_EXTERNAL;
+ timer_clockevent.name = "time-travel-timer-external";
+ timer_clocksource.name = "time-travel-clock-external";
+ return time_travel_connect_external(str + 5);
+ }
+
if (!*str) {
time_travel_mode = TT_MODE_BASIC;
timer_clockevent.name = "time-travel-timer";
@@ -242,7 +734,15 @@ __uml_help(setup_time_travel,
"are no wall clock timers, and any CPU processing happens - as seen from the\n"
"guest - instantly. This can be useful for accurate simulation regardless of\n"
"debug overhead, physical CPU speed, etc. but is somewhat dangerous as it can\n"
-"easily lead to getting stuck (e.g. if anything in the system busy loops).\n");
+"easily lead to getting stuck (e.g. if anything in the system busy loops).\n"
+"\n"
+"time-travel=ext:[ID:]/path/to/socket\n"
+"This enables time travel mode similar to =inf-cpu, except the system will\n"
+"use the given socket to coordinate with a central scheduler, in order to\n"
+"have more than one system simultaneously be on simulated time. The virtio\n"
+"driver code in UML knows about this so you can also simulate networks and\n"
+"devices using it, assuming the device has the right capabilities.\n"
+"The optional ID is a 64-bit integer that's sent to the central scheduler.\n");
int setup_time_travel_start(char *str)
{
diff --git a/arch/um/kernel/uml.lds.S b/arch/um/kernel/uml.lds.S
index 9f21443be2c9..3b6dab3d4501 100644
--- a/arch/um/kernel/uml.lds.S
+++ b/arch/um/kernel/uml.lds.S
@@ -19,10 +19,10 @@ SECTIONS
__binary_start = START;
. = START + SIZEOF_HEADERS;
+ . = ALIGN(PAGE_SIZE);
_text = .;
INIT_TEXT_SECTION(0)
- . = ALIGN(PAGE_SIZE);
.text :
{
diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c
index fbda10535dab..26ecbd64c409 100644
--- a/arch/um/os-Linux/file.c
+++ b/arch/um/os-Linux/file.c
@@ -5,9 +5,11 @@
#include <stdio.h>
#include <unistd.h>
+#include <stdlib.h>
#include <errno.h>
#include <fcntl.h>
#include <signal.h>
+#include <linux/falloc.h>
#include <sys/ioctl.h>
#include <sys/mount.h>
#include <sys/socket.h>
@@ -16,6 +18,7 @@
#include <sys/un.h>
#include <sys/types.h>
#include <sys/eventfd.h>
+#include <poll.h>
#include <os.h>
static void copy_stat(struct uml_stat *dst, const struct stat64 *src)
@@ -664,3 +667,31 @@ int os_sendmsg_fds(int fd, const void *buf, unsigned int len, const int *fds,
return -errno;
return err;
}
+
+int os_poll(unsigned int n, const int *fds)
+{
+ /* currently need 2 FDs at most so avoid dynamic allocation */
+ struct pollfd pollfds[2] = {};
+ unsigned int i;
+ int ret;
+
+ if (n > ARRAY_SIZE(pollfds))
+ return -EINVAL;
+
+ for (i = 0; i < n; i++) {
+ pollfds[i].fd = fds[i];
+ pollfds[i].events = POLLIN;
+ }
+
+ ret = poll(pollfds, n, -1);
+ if (ret < 0)
+ return -errno;
+
+ /* Return the index of the available FD */
+ for (i = 0; i < n; i++) {
+ if (pollfds[i].revents)
+ return i;
+ }
+
+ return -EIO;
+}
diff --git a/arch/um/os-Linux/time.c b/arch/um/os-Linux/time.c
index 432f8e1f55c2..90f6de224c70 100644
--- a/arch/um/os-Linux/time.c
+++ b/arch/um/os-Linux/time.c
@@ -14,7 +14,6 @@
#include <kern_util.h>
#include <os.h>
#include <string.h>
-#include <timer-internal.h>
static timer_t event_high_res_timer = 0;
diff --git a/arch/um/os-Linux/umid.c b/arch/um/os-Linux/umid.c
index 44def53a11cd..9e16078a4bf8 100644
--- a/arch/um/os-Linux/umid.c
+++ b/arch/um/os-Linux/umid.c
@@ -220,11 +220,12 @@ static void __init create_pid_file(void)
char pid[sizeof("nnnnn\0")], *file;
int fd, n;
- file = malloc(strlen(uml_dir) + UMID_LEN + sizeof("/pid\0"));
+ n = strlen(uml_dir) + UMID_LEN + sizeof("/pid\0");
+ file = malloc(n);
if (!file)
return;
- if (umid_file_name("pid", file, sizeof(file)))
+ if (umid_file_name("pid", file, n))
goto out;
fd = open(file, O_RDWR | O_CREAT | O_EXCL, 0644);
diff --git a/arch/unicore32/include/asm/page.h b/arch/unicore32/include/asm/page.h
index 8a89335673f9..96d6bdf180bd 100644
--- a/arch/unicore32/include/asm/page.h
+++ b/arch/unicore32/include/asm/page.h
@@ -69,9 +69,6 @@ extern int pfn_valid(unsigned long);
#endif /* !__ASSEMBLY__ */
-#define VM_DATA_DEFAULT_FLAGS \
- (VM_READ | VM_WRITE | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
-
#include <asm-generic/getorder.h>
#endif
diff --git a/arch/unicore32/include/asm/pgtable.h b/arch/unicore32/include/asm/pgtable.h
index c8f7ba12f309..3b8731b3a937 100644
--- a/arch/unicore32/include/asm/pgtable.h
+++ b/arch/unicore32/include/asm/pgtable.h
@@ -177,7 +177,6 @@ extern struct page *empty_zero_page;
#define pte_dirty(pte) (pte_val(pte) & PTE_DIRTY)
#define pte_young(pte) (pte_val(pte) & PTE_YOUNG)
#define pte_exec(pte) (pte_val(pte) & PTE_EXEC)
-#define pte_special(pte) (0)
#define PTE_BIT_FUNC(fn, op) \
static inline pte_t pte_##fn(pte_t pte) { pte_val(pte) op; return pte; }
@@ -189,8 +188,6 @@ PTE_BIT_FUNC(mkdirty, |= PTE_DIRTY);
PTE_BIT_FUNC(mkold, &= ~PTE_YOUNG);
PTE_BIT_FUNC(mkyoung, |= PTE_YOUNG);
-static inline pte_t pte_mkspecial(pte_t pte) { return pte; }
-
/*
* Mark the prot value as uncacheable.
*/
diff --git a/arch/unicore32/kernel/puv3-nb0916.c b/arch/unicore32/kernel/puv3-nb0916.c
index a3bf2ffc54dd..e251f5028396 100644
--- a/arch/unicore32/kernel/puv3-nb0916.c
+++ b/arch/unicore32/kernel/puv3-nb0916.c
@@ -55,7 +55,6 @@ static struct pwm_lookup nb0916_pwm_lookup[] = {
static struct platform_pwm_backlight_data nb0916_backlight_data = {
.max_brightness = 100,
.dft_brightness = 100,
- .enable_gpio = -1,
};
static struct gpio_keys_button nb0916_gpio_keys[] = {
diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c
index a9bd08fbe588..3022104aa613 100644
--- a/arch/unicore32/mm/fault.c
+++ b/arch/unicore32/mm/fault.c
@@ -149,7 +149,7 @@ void do_bad_area(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
*/
static inline bool access_error(unsigned int fsr, struct vm_area_struct *vma)
{
- unsigned int mask = VM_READ | VM_WRITE | VM_EXEC;
+ unsigned int mask = VM_ACCESS_FLAGS;
if (!(fsr ^ 0x12)) /* write? */
mask = VM_WRITE;
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1edf788d301c..1d6104ea8af0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -149,6 +149,7 @@ config X86
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64
+ select HAVE_ARCH_USERFAULTFD_WP if USERFAULTFD
select HAVE_ARCH_VMAP_STACK if X86_64
select HAVE_ARCH_WITHIN_STACK_FRAMES
select HAVE_ASM_MODVERSIONS
@@ -1660,6 +1661,7 @@ config X86_PMEM_LEGACY
depends on PHYS_ADDR_T_64BIT
depends on BLK_DEV
select X86_PMEM_LEGACY_DEVICE
+ select NUMA_KEEP_MEMINFO if NUMA
select LIBNVDIMM
help
Treat memory marked using the non-standard e820 type of 12 as used
@@ -2930,3 +2932,5 @@ config HAVE_ATOMIC_IOMAP
source "drivers/firmware/Kconfig"
source "arch/x86/kvm/Kconfig"
+
+source "arch/x86/Kconfig.assembler"
diff --git a/arch/x86/Kconfig.assembler b/arch/x86/Kconfig.assembler
new file mode 100644
index 000000000000..13de0db38d4e
--- /dev/null
+++ b/arch/x86/Kconfig.assembler
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2020 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+
+config AS_AVX512
+ def_bool $(as-instr,vpmovm2b %k1$(comma)%zmm5)
+ help
+ Supported by binutils >= 2.25 and LLVM integrated assembler
+
+config AS_SHA1_NI
+ def_bool $(as-instr,sha1msg1 %xmm0$(comma)%xmm1)
+ help
+ Supported by binutils >= 2.24 and LLVM integrated assembler
+
+config AS_SHA256_NI
+ def_bool $(as-instr,sha256msg1 %xmm0$(comma)%xmm1)
+ help
+ Supported by binutils >= 2.24 and LLVM integrated assembler
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 513a55562d75..b65ec63c7db7 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -177,28 +177,6 @@ ifeq ($(ACCUMULATE_OUTGOING_ARGS), 1)
KBUILD_CFLAGS += $(call cc-option,-maccumulate-outgoing-args,)
endif
-# Stackpointer is addressed different for 32 bit and 64 bit x86
-sp-$(CONFIG_X86_32) := esp
-sp-$(CONFIG_X86_64) := rsp
-
-# do binutils support CFI?
-cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_endproc,-DCONFIG_AS_CFI=1)
-# is .cfi_signal_frame supported too?
-cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1)
-cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1)
-
-# does binutils support specific instructions?
-asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1)
-avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
-avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
-avx512_instr :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,-DCONFIG_AS_AVX512=1)
-sha1_ni_instr :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA1_NI=1)
-sha256_ni_instr :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA256_NI=1)
-adx_instr := $(call as-instr,adox %r10$(comma)%r10,-DCONFIG_AS_ADX=1)
-
-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr) $(adx_instr)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr) $(adx_instr)
-
KBUILD_LDFLAGS := -m elf_$(UTS_MACHINE)
#
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index ab8b30cb978e..550904591e94 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -285,7 +285,6 @@ CONFIG_EARLY_PRINTK_DBGP=y
CONFIG_DEBUG_STACKOVERFLOW=y
# CONFIG_DEBUG_RODATA_TEST is not set
CONFIG_DEBUG_BOOT_PARAMS=y
-CONFIG_OPTIMIZE_INLINING=y
CONFIG_SECURITY=y
CONFIG_SECURITY_NETWORK=y
CONFIG_SECURITY_SELINUX=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 2d196cb49084..614961009075 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -282,7 +282,6 @@ CONFIG_EARLY_PRINTK_DBGP=y
CONFIG_DEBUG_STACKOVERFLOW=y
# CONFIG_DEBUG_RODATA_TEST is not set
CONFIG_DEBUG_BOOT_PARAMS=y
-CONFIG_OPTIMIZE_INLINING=y
CONFIG_UNWINDER_ORC=y
CONFIG_SECURITY=y
CONFIG_SECURITY_NETWORK=y
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 8c2e9eadee8a..a31de0c6ccde 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -1,131 +1,97 @@
# SPDX-License-Identifier: GPL-2.0
#
-# Arch-specific CryptoAPI modules.
-#
+# x86 crypto algorithms
OBJECT_FILES_NON_STANDARD := y
-avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
-avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
- $(comma)4)$(comma)%ymm2,yes,no)
-avx512_supported :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,yes,no)
-sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no)
-sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no)
-adx_supported := $(call as-instr,adox %r10$(comma)%r10,yes,no)
-
obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
+twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
+obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
+twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
+obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
+twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
+obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
+twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o twofish_avx_glue.o
+
obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
+serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
+obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
+serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
+obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
+serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o serpent_avx_glue.o
+obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o
+serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
obj-$(CONFIG_CRYPTO_DES3_EDE_X86_64) += des3_ede-x86_64.o
+des3_ede-x86_64-y := des3_ede-asm_64.o des3_ede_glue.o
+
obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
+camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
+obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += camellia-aesni-avx-x86_64.o
+camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o camellia_aesni_avx_glue.o
+obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o
+camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
+
obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
-obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
-obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
-obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o
-obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
-obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
-obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
+blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
-obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
-obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
-obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
-obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
-obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
-obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o
-obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o
+obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o
+cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o
+
+obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
+cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o
obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2) += aegis128-aesni.o
+aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
-obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o
-obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o
+obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o
+chacha-x86_64-y := chacha-avx2-x86_64.o chacha-ssse3-x86_64.o chacha_glue.o
+chacha-x86_64-$(CONFIG_AS_AVX512) += chacha-avx512vl-x86_64.o
-# These modules require the assembler to support ADX.
-ifeq ($(adx_supported),yes)
- obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o
-endif
-
-# These modules require assembler to support AVX.
-ifeq ($(avx_supported),yes)
- obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \
- camellia-aesni-avx-x86_64.o
- obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o
- obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
- obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
- obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
- obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o
-endif
-
-# These modules require assembler to support AVX2.
-ifeq ($(avx2_supported),yes)
- obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o
- obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o
-endif
+obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
+aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
+aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
-twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
-serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
+obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
+sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ssse3_glue.o
+sha1-ssse3-$(CONFIG_AS_SHA1_NI) += sha1_ni_asm.o
-des3_ede-x86_64-y := des3_ede-asm_64.o des3_ede_glue.o
-camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
-blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
-twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
-twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
-chacha-x86_64-y := chacha-ssse3-x86_64.o chacha_glue.o
-serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
+obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
+sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o
+sha256-ssse3-$(CONFIG_AS_SHA256_NI) += sha256_ni_asm.o
-aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
+obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
+sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
-nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
+obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o
blake2s-x86_64-y := blake2s-core.o blake2s-glue.o
-poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o
-ifneq ($(CONFIG_CRYPTO_POLY1305_X86_64),)
-targets += poly1305-x86_64-cryptogams.S
-endif
-
-ifeq ($(avx_supported),yes)
- camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
- camellia_aesni_avx_glue.o
- cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o
- cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o
- twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o \
- twofish_avx_glue.o
- serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o \
- serpent_avx_glue.o
-endif
-
-ifeq ($(avx2_supported),yes)
- camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
- chacha-x86_64-y += chacha-avx2-x86_64.o
- serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
-
- nhpoly1305-avx2-y := nh-avx2-x86_64.o nhpoly1305-avx2-glue.o
-endif
-
-ifeq ($(avx512_supported),yes)
- chacha-x86_64-y += chacha-avx512vl-x86_64.o
-endif
-aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
-aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
+obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
-sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
-ifeq ($(avx2_supported),yes)
-sha1-ssse3-y += sha1_avx2_x86_64_asm.o
-endif
-ifeq ($(sha1_ni_supported),yes)
-sha1-ssse3-y += sha1_ni_asm.o
-endif
+
+obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
crc32c-intel-y := crc32c-intel_glue.o
crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
+
+obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
-sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o
-ifeq ($(sha256_ni_supported),yes)
-sha256-ssse3-y += sha256_ni_asm.o
-endif
-sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
+
+obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o
crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
+obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o
+poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o
+targets += poly1305-x86_64-cryptogams.S
+
+obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o
+nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
+obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o
+nhpoly1305-avx2-y := nh-avx2-x86_64.o nhpoly1305-avx2-glue.o
+
+obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o
+
quiet_cmd_perlasm = PERLASM $@
cmd_perlasm = $(PERL) $< > $@
$(obj)/%.S: $(src)/%.pl FORCE
diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index bfa1c0b3e5b4..0cea33295287 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -886,7 +886,6 @@ _less_than_8_bytes_left_\@:
_partial_block_done_\@:
.endm # PARTIAL_BLOCK
-#ifdef CONFIG_AS_AVX
###############################################################################
# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
# Input: A and B (128-bits each, bit-reflected)
@@ -1869,9 +1868,6 @@ key_256_finalize:
ret
SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
-#endif /* CONFIG_AS_AVX */
-
-#ifdef CONFIG_AS_AVX2
###############################################################################
# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
# Input: A and B (128-bits each, bit-reflected)
@@ -2839,5 +2835,3 @@ key_256_finalize4:
FUNC_RESTORE
ret
SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)
-
-#endif /* CONFIG_AS_AVX2 */
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 75b6ea20491e..ad8a7188a2bf 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -185,7 +185,6 @@ static const struct aesni_gcm_tfm_s aesni_gcm_tfm_sse = {
.finalize = &aesni_gcm_finalize,
};
-#ifdef CONFIG_AS_AVX
asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv,
void *keys, u8 *out, unsigned int num_bytes);
asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv,
@@ -234,9 +233,6 @@ static const struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen2 = {
.finalize = &aesni_gcm_finalize_avx_gen2,
};
-#endif
-
-#ifdef CONFIG_AS_AVX2
/*
* asmlinkage void aesni_gcm_init_avx_gen4()
* gcm_data *my_ctx_data, context data
@@ -279,8 +275,6 @@ static const struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen4 = {
.finalize = &aesni_gcm_finalize_avx_gen4,
};
-#endif
-
static inline struct
aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
{
@@ -476,7 +470,6 @@ static void ctr_crypt_final(struct crypto_aes_ctx *ctx,
crypto_inc(ctrblk, AES_BLOCK_SIZE);
}
-#ifdef CONFIG_AS_AVX
static void aesni_ctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv)
{
@@ -493,7 +486,6 @@ static void aesni_ctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out,
else
aes_ctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len);
}
-#endif
static int ctr_crypt(struct skcipher_request *req)
{
@@ -711,14 +703,10 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
if (!enc)
left -= auth_tag_len;
-#ifdef CONFIG_AS_AVX2
if (left < AVX_GEN4_OPTSIZE && gcm_tfm == &aesni_gcm_tfm_avx_gen4)
gcm_tfm = &aesni_gcm_tfm_avx_gen2;
-#endif
-#ifdef CONFIG_AS_AVX
if (left < AVX_GEN2_OPTSIZE && gcm_tfm == &aesni_gcm_tfm_avx_gen2)
gcm_tfm = &aesni_gcm_tfm_sse;
-#endif
/* Linearize assoc, if not already linear */
if (req->src->length >= assoclen && req->src->length &&
@@ -1076,31 +1064,24 @@ static int __init aesni_init(void)
if (!x86_match_cpu(aesni_cpu_id))
return -ENODEV;
#ifdef CONFIG_X86_64
-#ifdef CONFIG_AS_AVX2
if (boot_cpu_has(X86_FEATURE_AVX2)) {
pr_info("AVX2 version of gcm_enc/dec engaged.\n");
aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen4;
} else
-#endif
-#ifdef CONFIG_AS_AVX
if (boot_cpu_has(X86_FEATURE_AVX)) {
pr_info("AVX version of gcm_enc/dec engaged.\n");
aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen2;
- } else
-#endif
- {
+ } else {
pr_info("SSE version of gcm_enc/dec engaged.\n");
aesni_gcm_tfm = &aesni_gcm_tfm_sse;
}
aesni_ctr_enc_tfm = aesni_ctr_enc;
-#ifdef CONFIG_AS_AVX
if (boot_cpu_has(X86_FEATURE_AVX)) {
/* optimize performance of ctr mode encryption transform */
aesni_ctr_enc_tfm = aesni_ctr_enc_avx_tfm;
pr_info("AES CTR mode by8 optimization enabled\n");
}
#endif
-#endif
err = crypto_register_alg(&aesni_cipher_alg);
if (err)
diff --git a/arch/x86/crypto/blake2s-core.S b/arch/x86/crypto/blake2s-core.S
index 24910b766bdd..2ca79974f819 100644
--- a/arch/x86/crypto/blake2s-core.S
+++ b/arch/x86/crypto/blake2s-core.S
@@ -46,7 +46,6 @@ SIGMA2:
#endif /* CONFIG_AS_AVX512 */
.text
-#ifdef CONFIG_AS_SSSE3
SYM_FUNC_START(blake2s_compress_ssse3)
testq %rdx,%rdx
je .Lendofloop
@@ -174,7 +173,6 @@ SYM_FUNC_START(blake2s_compress_ssse3)
.Lendofloop:
ret
SYM_FUNC_END(blake2s_compress_ssse3)
-#endif /* CONFIG_AS_SSSE3 */
#ifdef CONFIG_AS_AVX512
SYM_FUNC_START(blake2s_compress_avx512)
diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c
index 68a74953efaf..b412c21ee06e 100644
--- a/arch/x86/crypto/chacha_glue.c
+++ b/arch/x86/crypto/chacha_glue.c
@@ -79,8 +79,7 @@ static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
}
}
- if (IS_ENABLED(CONFIG_AS_AVX2) &&
- static_branch_likely(&chacha_use_avx2)) {
+ if (static_branch_likely(&chacha_use_avx2)) {
while (bytes >= CHACHA_BLOCK_SIZE * 8) {
chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
bytes -= CHACHA_BLOCK_SIZE * 8;
@@ -288,8 +287,7 @@ static int __init chacha_simd_mod_init(void)
static_branch_enable(&chacha_use_simd);
- if (IS_ENABLED(CONFIG_AS_AVX2) &&
- boot_cpu_has(X86_FEATURE_AVX) &&
+ if (boot_cpu_has(X86_FEATURE_AVX) &&
boot_cpu_has(X86_FEATURE_AVX2) &&
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
static_branch_enable(&chacha_use_avx2);
diff --git a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
index 7a6b5380a46f..137edcf038cb 100644
--- a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
+++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
@@ -404,10 +404,6 @@ ___
&end_function("poly1305_emit_x86_64");
if ($avx) {
-if($kernel) {
- $code .= "#ifdef CONFIG_AS_AVX\n";
-}
-
########################################################################
# Layout of opaque area is following.
#
@@ -1516,16 +1512,8 @@ $code.=<<___;
___
&end_function("poly1305_emit_avx");
-if ($kernel) {
- $code .= "#endif\n";
-}
-
if ($avx>1) {
-if ($kernel) {
- $code .= "#ifdef CONFIG_AS_AVX2\n";
-}
-
my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
map("%ymm$_",(0..15));
my $S4=$MASK;
@@ -2816,10 +2804,6 @@ ___
poly1305_blocks_avxN(0);
&end_function("poly1305_blocks_avx2");
-if($kernel) {
- $code .= "#endif\n";
-}
-
#######################################################################
if ($avx>2) {
# On entry we have input length divisible by 64. But since inner loop
diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c
index 79bb58737d52..6dfec19f7d57 100644
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -94,7 +94,7 @@ static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE ||
PAGE_SIZE % POLY1305_BLOCK_SIZE);
- if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) ||
+ if (!static_branch_likely(&poly1305_use_avx) ||
(len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) ||
!crypto_simd_usable()) {
convert_to_base2_64(ctx);
@@ -108,7 +108,7 @@ static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
kernel_fpu_begin();
if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512))
poly1305_blocks_avx512(ctx, inp, bytes, padbit);
- else if (IS_ENABLED(CONFIG_AS_AVX2) && static_branch_likely(&poly1305_use_avx2))
+ else if (static_branch_likely(&poly1305_use_avx2))
poly1305_blocks_avx2(ctx, inp, bytes, padbit);
else
poly1305_blocks_avx(ctx, inp, bytes, padbit);
@@ -123,7 +123,7 @@ static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
const u32 nonce[4])
{
- if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx))
+ if (!static_branch_likely(&poly1305_use_avx))
poly1305_emit_x86_64(ctx, mac, nonce);
else
poly1305_emit_avx(ctx, mac, nonce);
@@ -261,11 +261,10 @@ static struct shash_alg alg = {
static int __init poly1305_simd_mod_init(void)
{
- if (IS_ENABLED(CONFIG_AS_AVX) && boot_cpu_has(X86_FEATURE_AVX) &&
+ if (boot_cpu_has(X86_FEATURE_AVX) &&
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
static_branch_enable(&poly1305_use_avx);
- if (IS_ENABLED(CONFIG_AS_AVX2) && boot_cpu_has(X86_FEATURE_AVX) &&
- boot_cpu_has(X86_FEATURE_AVX2) &&
+ if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) &&
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
static_branch_enable(&poly1305_use_avx2);
if (IS_ENABLED(CONFIG_AS_AVX512) && boot_cpu_has(X86_FEATURE_AVX) &&
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
index 12e2d19d7402..d25668d2a1e9 100644
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ b/arch/x86/crypto/sha1_ssse3_asm.S
@@ -467,8 +467,6 @@ W_PRECALC_SSSE3
*/
SHA1_VECTOR_ASM sha1_transform_ssse3
-#ifdef CONFIG_AS_AVX
-
.macro W_PRECALC_AVX
.purgem W_PRECALC_00_15
@@ -553,5 +551,3 @@ W_PRECALC_AVX
* const u8 *data, int blocks);
*/
SHA1_VECTOR_ASM sha1_transform_avx
-
-#endif
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index d70b40ad594c..a801ffc10cbb 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -114,7 +114,6 @@ static void unregister_sha1_ssse3(void)
crypto_unregister_shash(&sha1_ssse3_alg);
}
-#ifdef CONFIG_AS_AVX
asmlinkage void sha1_transform_avx(struct sha1_state *state,
const u8 *data, int blocks);
@@ -175,13 +174,6 @@ static void unregister_sha1_avx(void)
crypto_unregister_shash(&sha1_avx_alg);
}
-#else /* CONFIG_AS_AVX */
-static inline int register_sha1_avx(void) { return 0; }
-static inline void unregister_sha1_avx(void) { }
-#endif /* CONFIG_AS_AVX */
-
-
-#if defined(CONFIG_AS_AVX2) && (CONFIG_AS_AVX)
#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */
asmlinkage void sha1_transform_avx2(struct sha1_state *state,
@@ -253,11 +245,6 @@ static void unregister_sha1_avx2(void)
crypto_unregister_shash(&sha1_avx2_alg);
}
-#else
-static inline int register_sha1_avx2(void) { return 0; }
-static inline void unregister_sha1_avx2(void) { }
-#endif
-
#ifdef CONFIG_AS_SHA1_NI
asmlinkage void sha1_ni_transform(struct sha1_state *digest, const u8 *data,
int rounds);
diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S
index fcbc30f58c38..4739cd31b9db 100644
--- a/arch/x86/crypto/sha256-avx-asm.S
+++ b/arch/x86/crypto/sha256-avx-asm.S
@@ -47,7 +47,6 @@
# This code schedules 1 block at a time, with 4 lanes per block
########################################################################
-#ifdef CONFIG_AS_AVX
#include <linux/linkage.h>
## assume buffers not aligned
@@ -498,5 +497,3 @@ _SHUF_00BA:
# shuffle xDxC -> DC00
_SHUF_DC00:
.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
-
-#endif
diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S
index 499d9ec129de..11ff60c29c8b 100644
--- a/arch/x86/crypto/sha256-avx2-asm.S
+++ b/arch/x86/crypto/sha256-avx2-asm.S
@@ -48,7 +48,6 @@
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################
-#ifdef CONFIG_AS_AVX2
#include <linux/linkage.h>
## assume buffers not aligned
@@ -767,5 +766,3 @@ _SHUF_00BA:
.align 32
_SHUF_DC00:
.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
-
-#endif
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
index 03ad657c04bd..6394b5fe8db6 100644
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -144,7 +144,6 @@ static void unregister_sha256_ssse3(void)
ARRAY_SIZE(sha256_ssse3_algs));
}
-#ifdef CONFIG_AS_AVX
asmlinkage void sha256_transform_avx(struct sha256_state *state,
const u8 *data, int blocks);
@@ -221,12 +220,6 @@ static void unregister_sha256_avx(void)
ARRAY_SIZE(sha256_avx_algs));
}
-#else
-static inline int register_sha256_avx(void) { return 0; }
-static inline void unregister_sha256_avx(void) { }
-#endif
-
-#if defined(CONFIG_AS_AVX2) && defined(CONFIG_AS_AVX)
asmlinkage void sha256_transform_rorx(struct sha256_state *state,
const u8 *data, int blocks);
@@ -301,11 +294,6 @@ static void unregister_sha256_avx2(void)
ARRAY_SIZE(sha256_avx2_algs));
}
-#else
-static inline int register_sha256_avx2(void) { return 0; }
-static inline void unregister_sha256_avx2(void) { }
-#endif
-
#ifdef CONFIG_AS_SHA256_NI
asmlinkage void sha256_ni_transform(struct sha256_state *digest,
const u8 *data, int rounds);
diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S
index 90ea945ba5e6..63470fd6ae32 100644
--- a/arch/x86/crypto/sha512-avx-asm.S
+++ b/arch/x86/crypto/sha512-avx-asm.S
@@ -47,7 +47,6 @@
#
########################################################################
-#ifdef CONFIG_AS_AVX
#include <linux/linkage.h>
.text
@@ -424,4 +423,3 @@ K512:
.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
-#endif
diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S
index 3dd886b14e7d..3a44bdcfd583 100644
--- a/arch/x86/crypto/sha512-avx2-asm.S
+++ b/arch/x86/crypto/sha512-avx2-asm.S
@@ -49,7 +49,6 @@
# This code schedules 1 blocks at a time, with 4 lanes per block
########################################################################
-#ifdef CONFIG_AS_AVX2
#include <linux/linkage.h>
.text
@@ -749,5 +748,3 @@ PSHUFFLE_BYTE_FLIP_MASK:
MASK_YMM_LO:
.octa 0x00000000000000000000000000000000
.octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
-
-#endif
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
index 1c444f41037c..82cc1b3ced1d 100644
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -142,7 +142,6 @@ static void unregister_sha512_ssse3(void)
ARRAY_SIZE(sha512_ssse3_algs));
}
-#ifdef CONFIG_AS_AVX
asmlinkage void sha512_transform_avx(struct sha512_state *state,
const u8 *data, int blocks);
static bool avx_usable(void)
@@ -218,12 +217,7 @@ static void unregister_sha512_avx(void)
crypto_unregister_shashes(sha512_avx_algs,
ARRAY_SIZE(sha512_avx_algs));
}
-#else
-static inline int register_sha512_avx(void) { return 0; }
-static inline void unregister_sha512_avx(void) { }
-#endif
-#if defined(CONFIG_AS_AVX2) && defined(CONFIG_AS_AVX)
asmlinkage void sha512_transform_rorx(struct sha512_state *state,
const u8 *data, int blocks);
@@ -298,10 +292,6 @@ static void unregister_sha512_avx2(void)
crypto_unregister_shashes(sha512_avx2_algs,
ARRAY_SIZE(sha512_avx2_algs));
}
-#else
-static inline int register_sha512_avx2(void) { return 0; }
-static inline void unregister_sha512_avx2(void) { }
-#endif
static int __init sha512_ssse3_mod_init(void)
{
diff --git a/arch/x86/entry/vdso/vdso32/vclock_gettime.c b/arch/x86/entry/vdso/vdso32/vclock_gettime.c
index 1e82bd43286c..84a4a73f77f7 100644
--- a/arch/x86/entry/vdso/vdso32/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vdso32/vclock_gettime.c
@@ -1,10 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#define BUILD_VDSO32
-#ifndef CONFIG_CC_OPTIMIZE_FOR_SIZE
-#undef CONFIG_OPTIMIZE_INLINING
-#endif
-
#ifdef CONFIG_X86_64
/*
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 1ba72c563313..cf76d6631afa 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1476,6 +1476,12 @@ static const struct intel_uncore_init_fun tgl_l_uncore_init __initconst = {
.mmio_init = tgl_l_uncore_mmio_init,
};
+static const struct intel_uncore_init_fun icx_uncore_init __initconst = {
+ .cpu_init = icx_uncore_cpu_init,
+ .pci_init = icx_uncore_pci_init,
+ .mmio_init = icx_uncore_mmio_init,
+};
+
static const struct intel_uncore_init_fun snr_uncore_init __initconst = {
.cpu_init = snr_uncore_cpu_init,
.pci_init = snr_uncore_pci_init,
@@ -1511,6 +1517,8 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &icl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_NNPI, &icl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_uncore_init),
+ X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &icx_uncore_init),
+ X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &icx_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &tgl_l_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &tgl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init),
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index b30429f8a53a..0da4a4605536 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -550,6 +550,9 @@ void skx_uncore_cpu_init(void);
int snr_uncore_pci_init(void);
void snr_uncore_cpu_init(void);
void snr_uncore_mmio_init(void);
+int icx_uncore_pci_init(void);
+void icx_uncore_cpu_init(void);
+void icx_uncore_mmio_init(void);
/* uncore_nhmex.c */
void nhmex_uncore_cpu_init(void);
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 01023f0d935b..07652fa20ebb 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -382,6 +382,42 @@
#define SNR_IMC_MMIO_MEM0_OFFSET 0xd8
#define SNR_IMC_MMIO_MEM0_MASK 0x7FF
+/* ICX CHA */
+#define ICX_C34_MSR_PMON_CTR0 0xb68
+#define ICX_C34_MSR_PMON_CTL0 0xb61
+#define ICX_C34_MSR_PMON_BOX_CTL 0xb60
+#define ICX_C34_MSR_PMON_BOX_FILTER0 0xb65
+
+/* ICX IIO */
+#define ICX_IIO_MSR_PMON_CTL0 0xa58
+#define ICX_IIO_MSR_PMON_CTR0 0xa51
+#define ICX_IIO_MSR_PMON_BOX_CTL 0xa50
+
+/* ICX IRP */
+#define ICX_IRP0_MSR_PMON_CTL0 0xa4d
+#define ICX_IRP0_MSR_PMON_CTR0 0xa4b
+#define ICX_IRP0_MSR_PMON_BOX_CTL 0xa4a
+
+/* ICX M2PCIE */
+#define ICX_M2PCIE_MSR_PMON_CTL0 0xa46
+#define ICX_M2PCIE_MSR_PMON_CTR0 0xa41
+#define ICX_M2PCIE_MSR_PMON_BOX_CTL 0xa40
+
+/* ICX UPI */
+#define ICX_UPI_PCI_PMON_CTL0 0x350
+#define ICX_UPI_PCI_PMON_CTR0 0x320
+#define ICX_UPI_PCI_PMON_BOX_CTL 0x318
+#define ICX_UPI_CTL_UMASK_EXT 0xffffff
+
+/* ICX M3UPI*/
+#define ICX_M3UPI_PCI_PMON_CTL0 0xd8
+#define ICX_M3UPI_PCI_PMON_CTR0 0xa8
+#define ICX_M3UPI_PCI_PMON_BOX_CTL 0xa0
+
+/* ICX IMC */
+#define ICX_NUMBER_IMC_CHN 2
+#define ICX_IMC_MEM_STRIDE 0x4
+
DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6");
DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21");
@@ -390,6 +426,7 @@ DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
DEFINE_UNCORE_FORMAT_ATTR(umask_ext, umask, "config:8-15,32-43,45-55");
DEFINE_UNCORE_FORMAT_ATTR(umask_ext2, umask, "config:8-15,32-57");
DEFINE_UNCORE_FORMAT_ATTR(umask_ext3, umask, "config:8-15,32-39");
+DEFINE_UNCORE_FORMAT_ATTR(umask_ext4, umask, "config:8-15,32-55");
DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16");
DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
@@ -4551,3 +4588,477 @@ void snr_uncore_mmio_init(void)
}
/* end of SNR uncore support */
+
+/* ICX uncore support */
+
+static unsigned icx_cha_msr_offsets[] = {
+ 0x2a0, 0x2ae, 0x2bc, 0x2ca, 0x2d8, 0x2e6, 0x2f4, 0x302, 0x310,
+ 0x31e, 0x32c, 0x33a, 0x348, 0x356, 0x364, 0x372, 0x380, 0x38e,
+ 0x3aa, 0x3b8, 0x3c6, 0x3d4, 0x3e2, 0x3f0, 0x3fe, 0x40c, 0x41a,
+ 0x428, 0x436, 0x444, 0x452, 0x460, 0x46e, 0x47c, 0x0, 0xe,
+ 0x1c, 0x2a, 0x38, 0x46,
+};
+
+static int icx_cha_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ bool tie_en = !!(event->hw.config & SNBEP_CBO_PMON_CTL_TID_EN);
+
+ if (tie_en) {
+ reg1->reg = ICX_C34_MSR_PMON_BOX_FILTER0 +
+ icx_cha_msr_offsets[box->pmu->pmu_idx];
+ reg1->config = event->attr.config1 & SKX_CHA_MSR_PMON_BOX_FILTER_TID;
+ reg1->idx = 0;
+ }
+
+ return 0;
+}
+
+static struct intel_uncore_ops icx_uncore_chabox_ops = {
+ .init_box = ivbep_uncore_msr_init_box,
+ .disable_box = snbep_uncore_msr_disable_box,
+ .enable_box = snbep_uncore_msr_enable_box,
+ .disable_event = snbep_uncore_msr_disable_event,
+ .enable_event = snr_cha_enable_event,
+ .read_counter = uncore_msr_read_counter,
+ .hw_config = icx_cha_hw_config,
+};
+
+static struct intel_uncore_type icx_uncore_chabox = {
+ .name = "cha",
+ .num_counters = 4,
+ .perf_ctr_bits = 48,
+ .event_ctl = ICX_C34_MSR_PMON_CTL0,
+ .perf_ctr = ICX_C34_MSR_PMON_CTR0,
+ .box_ctl = ICX_C34_MSR_PMON_BOX_CTL,
+ .msr_offsets = icx_cha_msr_offsets,
+ .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
+ .event_mask_ext = SNR_CHA_RAW_EVENT_MASK_EXT,
+ .constraints = skx_uncore_chabox_constraints,
+ .ops = &icx_uncore_chabox_ops,
+ .format_group = &snr_uncore_chabox_format_group,
+};
+
+static unsigned icx_msr_offsets[] = {
+ 0x0, 0x20, 0x40, 0x90, 0xb0, 0xd0,
+};
+
+static struct event_constraint icx_uncore_iio_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x02, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x03, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x83, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0xc0, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0xc5, 0xc),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type icx_uncore_iio = {
+ .name = "iio",
+ .num_counters = 4,
+ .num_boxes = 6,
+ .perf_ctr_bits = 48,
+ .event_ctl = ICX_IIO_MSR_PMON_CTL0,
+ .perf_ctr = ICX_IIO_MSR_PMON_CTR0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .event_mask_ext = SNR_IIO_PMON_RAW_EVENT_MASK_EXT,
+ .box_ctl = ICX_IIO_MSR_PMON_BOX_CTL,
+ .msr_offsets = icx_msr_offsets,
+ .constraints = icx_uncore_iio_constraints,
+ .ops = &skx_uncore_iio_ops,
+ .format_group = &snr_uncore_iio_format_group,
+};
+
+static struct intel_uncore_type icx_uncore_irp = {
+ .name = "irp",
+ .num_counters = 2,
+ .num_boxes = 6,
+ .perf_ctr_bits = 48,
+ .event_ctl = ICX_IRP0_MSR_PMON_CTL0,
+ .perf_ctr = ICX_IRP0_MSR_PMON_CTR0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = ICX_IRP0_MSR_PMON_BOX_CTL,
+ .msr_offsets = icx_msr_offsets,
+ .ops = &ivbep_uncore_msr_ops,
+ .format_group = &ivbep_uncore_format_group,
+};
+
+static struct event_constraint icx_uncore_m2pcie_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x14, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type icx_uncore_m2pcie = {
+ .name = "m2pcie",
+ .num_counters = 4,
+ .num_boxes = 6,
+ .perf_ctr_bits = 48,
+ .event_ctl = ICX_M2PCIE_MSR_PMON_CTL0,
+ .perf_ctr = ICX_M2PCIE_MSR_PMON_CTR0,
+ .box_ctl = ICX_M2PCIE_MSR_PMON_BOX_CTL,
+ .msr_offsets = icx_msr_offsets,
+ .constraints = icx_uncore_m2pcie_constraints,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .ops = &ivbep_uncore_msr_ops,
+ .format_group = &ivbep_uncore_format_group,
+};
+
+enum perf_uncore_icx_iio_freerunning_type_id {
+ ICX_IIO_MSR_IOCLK,
+ ICX_IIO_MSR_BW_IN,
+
+ ICX_IIO_FREERUNNING_TYPE_MAX,
+};
+
+static unsigned icx_iio_clk_freerunning_box_offsets[] = {
+ 0x0, 0x20, 0x40, 0x90, 0xb0, 0xd0,
+};
+
+static unsigned icx_iio_bw_freerunning_box_offsets[] = {
+ 0x0, 0x10, 0x20, 0x90, 0xa0, 0xb0,
+};
+
+static struct freerunning_counters icx_iio_freerunning[] = {
+ [ICX_IIO_MSR_IOCLK] = { 0xa55, 0x1, 0x20, 1, 48, icx_iio_clk_freerunning_box_offsets },
+ [ICX_IIO_MSR_BW_IN] = { 0xaa0, 0x1, 0x10, 8, 48, icx_iio_bw_freerunning_box_offsets },
+};
+
+static struct uncore_event_desc icx_uncore_iio_freerunning_events[] = {
+ /* Free-Running IIO CLOCKS Counter */
+ INTEL_UNCORE_EVENT_DESC(ioclk, "event=0xff,umask=0x10"),
+ /* Free-Running IIO BANDWIDTH IN Counters */
+ INTEL_UNCORE_EVENT_DESC(bw_in_port0, "event=0xff,umask=0x20"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port0.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port0.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port1, "event=0xff,umask=0x21"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port1.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port1.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port2, "event=0xff,umask=0x22"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port2.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port2.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port3, "event=0xff,umask=0x23"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port3.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port3.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port4, "event=0xff,umask=0x24"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port4.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port4.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port5, "event=0xff,umask=0x25"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port5.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port5.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port6, "event=0xff,umask=0x26"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port6.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port6.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port7, "event=0xff,umask=0x27"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port7.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port7.unit, "MiB"),
+ { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_type icx_uncore_iio_free_running = {
+ .name = "iio_free_running",
+ .num_counters = 9,
+ .num_boxes = 6,
+ .num_freerunning_types = ICX_IIO_FREERUNNING_TYPE_MAX,
+ .freerunning = icx_iio_freerunning,
+ .ops = &skx_uncore_iio_freerunning_ops,
+ .event_descs = icx_uncore_iio_freerunning_events,
+ .format_group = &skx_uncore_iio_freerunning_format_group,
+};
+
+static struct intel_uncore_type *icx_msr_uncores[] = {
+ &skx_uncore_ubox,
+ &icx_uncore_chabox,
+ &icx_uncore_iio,
+ &icx_uncore_irp,
+ &icx_uncore_m2pcie,
+ &skx_uncore_pcu,
+ &icx_uncore_iio_free_running,
+ NULL,
+};
+
+/*
+ * To determine the number of CHAs, it should read CAPID6(Low) and CAPID7 (High)
+ * registers which located at Device 30, Function 3
+ */
+#define ICX_CAPID6 0x9c
+#define ICX_CAPID7 0xa0
+
+static u64 icx_count_chabox(void)
+{
+ struct pci_dev *dev = NULL;
+ u64 caps = 0;
+
+ dev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x345b, dev);
+ if (!dev)
+ goto out;
+
+ pci_read_config_dword(dev, ICX_CAPID6, (u32 *)&caps);
+ pci_read_config_dword(dev, ICX_CAPID7, (u32 *)&caps + 1);
+out:
+ pci_dev_put(dev);
+ return hweight64(caps);
+}
+
+void icx_uncore_cpu_init(void)
+{
+ u64 num_boxes = icx_count_chabox();
+
+ if (WARN_ON(num_boxes > ARRAY_SIZE(icx_cha_msr_offsets)))
+ return;
+ icx_uncore_chabox.num_boxes = num_boxes;
+ uncore_msr_uncores = icx_msr_uncores;
+}
+
+static struct intel_uncore_type icx_uncore_m2m = {
+ .name = "m2m",
+ .num_counters = 4,
+ .num_boxes = 4,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNR_M2M_PCI_PMON_CTR0,
+ .event_ctl = SNR_M2M_PCI_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNR_M2M_PCI_PMON_BOX_CTL,
+ .ops = &snr_m2m_uncore_pci_ops,
+ .format_group = &skx_uncore_format_group,
+};
+
+static struct attribute *icx_upi_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask_ext4.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static const struct attribute_group icx_upi_uncore_format_group = {
+ .name = "format",
+ .attrs = icx_upi_uncore_formats_attr,
+};
+
+static struct intel_uncore_type icx_uncore_upi = {
+ .name = "upi",
+ .num_counters = 4,
+ .num_boxes = 3,
+ .perf_ctr_bits = 48,
+ .perf_ctr = ICX_UPI_PCI_PMON_CTR0,
+ .event_ctl = ICX_UPI_PCI_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .event_mask_ext = ICX_UPI_CTL_UMASK_EXT,
+ .box_ctl = ICX_UPI_PCI_PMON_BOX_CTL,
+ .ops = &skx_upi_uncore_pci_ops,
+ .format_group = &icx_upi_uncore_format_group,
+};
+
+static struct event_constraint icx_uncore_m3upi_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x1c, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x1d, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x1e, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x1f, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x40, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x4e, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x4f, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x50, 0x7),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type icx_uncore_m3upi = {
+ .name = "m3upi",
+ .num_counters = 4,
+ .num_boxes = 3,
+ .perf_ctr_bits = 48,
+ .perf_ctr = ICX_M3UPI_PCI_PMON_CTR0,
+ .event_ctl = ICX_M3UPI_PCI_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = ICX_M3UPI_PCI_PMON_BOX_CTL,
+ .constraints = icx_uncore_m3upi_constraints,
+ .ops = &ivbep_uncore_pci_ops,
+ .format_group = &skx_uncore_format_group,
+};
+
+enum {
+ ICX_PCI_UNCORE_M2M,
+ ICX_PCI_UNCORE_UPI,
+ ICX_PCI_UNCORE_M3UPI,
+};
+
+static struct intel_uncore_type *icx_pci_uncores[] = {
+ [ICX_PCI_UNCORE_M2M] = &icx_uncore_m2m,
+ [ICX_PCI_UNCORE_UPI] = &icx_uncore_upi,
+ [ICX_PCI_UNCORE_M3UPI] = &icx_uncore_m3upi,
+ NULL,
+};
+
+static const struct pci_device_id icx_uncore_pci_ids[] = {
+ { /* M2M 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x344a),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(12, 0, ICX_PCI_UNCORE_M2M, 0),
+ },
+ { /* M2M 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x344a),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(13, 0, ICX_PCI_UNCORE_M2M, 1),
+ },
+ { /* M2M 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x344a),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(14, 0, ICX_PCI_UNCORE_M2M, 2),
+ },
+ { /* M2M 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x344a),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(15, 0, ICX_PCI_UNCORE_M2M, 3),
+ },
+ { /* UPI Link 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3441),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(2, 1, ICX_PCI_UNCORE_UPI, 0),
+ },
+ { /* UPI Link 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3441),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(3, 1, ICX_PCI_UNCORE_UPI, 1),
+ },
+ { /* UPI Link 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3441),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(4, 1, ICX_PCI_UNCORE_UPI, 2),
+ },
+ { /* M3UPI Link 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3446),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(5, 1, ICX_PCI_UNCORE_M3UPI, 0),
+ },
+ { /* M3UPI Link 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3446),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(6, 1, ICX_PCI_UNCORE_M3UPI, 1),
+ },
+ { /* M3UPI Link 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3446),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(7, 1, ICX_PCI_UNCORE_M3UPI, 2),
+ },
+ { /* end: all zeroes */ }
+};
+
+static struct pci_driver icx_uncore_pci_driver = {
+ .name = "icx_uncore",
+ .id_table = icx_uncore_pci_ids,
+};
+
+int icx_uncore_pci_init(void)
+{
+ /* ICX UBOX DID */
+ int ret = snbep_pci2phy_map_init(0x3450, SKX_CPUNODEID,
+ SKX_GIDNIDMAP, true);
+
+ if (ret)
+ return ret;
+
+ uncore_pci_uncores = icx_pci_uncores;
+ uncore_pci_driver = &icx_uncore_pci_driver;
+ return 0;
+}
+
+static void icx_uncore_imc_init_box(struct intel_uncore_box *box)
+{
+ unsigned int box_ctl = box->pmu->type->box_ctl +
+ box->pmu->type->mmio_offset * (box->pmu->pmu_idx % ICX_NUMBER_IMC_CHN);
+ int mem_offset = (box->pmu->pmu_idx / ICX_NUMBER_IMC_CHN) * ICX_IMC_MEM_STRIDE +
+ SNR_IMC_MMIO_MEM0_OFFSET;
+
+ __snr_uncore_mmio_init_box(box, box_ctl, mem_offset);
+}
+
+static struct intel_uncore_ops icx_uncore_mmio_ops = {
+ .init_box = icx_uncore_imc_init_box,
+ .exit_box = uncore_mmio_exit_box,
+ .disable_box = snr_uncore_mmio_disable_box,
+ .enable_box = snr_uncore_mmio_enable_box,
+ .disable_event = snr_uncore_mmio_disable_event,
+ .enable_event = snr_uncore_mmio_enable_event,
+ .read_counter = uncore_mmio_read_counter,
+};
+
+static struct intel_uncore_type icx_uncore_imc = {
+ .name = "imc",
+ .num_counters = 4,
+ .num_boxes = 8,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR,
+ .fixed_ctl = SNR_IMC_MMIO_PMON_FIXED_CTL,
+ .event_descs = hswep_uncore_imc_events,
+ .perf_ctr = SNR_IMC_MMIO_PMON_CTR0,
+ .event_ctl = SNR_IMC_MMIO_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNR_IMC_MMIO_PMON_BOX_CTL,
+ .mmio_offset = SNR_IMC_MMIO_OFFSET,
+ .ops = &icx_uncore_mmio_ops,
+ .format_group = &skx_uncore_format_group,
+};
+
+enum perf_uncore_icx_imc_freerunning_type_id {
+ ICX_IMC_DCLK,
+ ICX_IMC_DDR,
+ ICX_IMC_DDRT,
+
+ ICX_IMC_FREERUNNING_TYPE_MAX,
+};
+
+static struct freerunning_counters icx_imc_freerunning[] = {
+ [ICX_IMC_DCLK] = { 0x22b0, 0x0, 0, 1, 48 },
+ [ICX_IMC_DDR] = { 0x2290, 0x8, 0, 2, 48 },
+ [ICX_IMC_DDRT] = { 0x22a0, 0x8, 0, 2, 48 },
+};
+
+static struct uncore_event_desc icx_uncore_imc_freerunning_events[] = {
+ INTEL_UNCORE_EVENT_DESC(dclk, "event=0xff,umask=0x10"),
+
+ INTEL_UNCORE_EVENT_DESC(read, "event=0xff,umask=0x20"),
+ INTEL_UNCORE_EVENT_DESC(read.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(read.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(write, "event=0xff,umask=0x21"),
+ INTEL_UNCORE_EVENT_DESC(write.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(write.unit, "MiB"),
+
+ INTEL_UNCORE_EVENT_DESC(ddrt_read, "event=0xff,umask=0x30"),
+ INTEL_UNCORE_EVENT_DESC(ddrt_read.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(ddrt_read.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(ddrt_write, "event=0xff,umask=0x31"),
+ INTEL_UNCORE_EVENT_DESC(ddrt_write.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(ddrt_write.unit, "MiB"),
+ { /* end: all zeroes */ },
+};
+
+static void icx_uncore_imc_freerunning_init_box(struct intel_uncore_box *box)
+{
+ int mem_offset = box->pmu->pmu_idx * ICX_IMC_MEM_STRIDE +
+ SNR_IMC_MMIO_MEM0_OFFSET;
+
+ __snr_uncore_mmio_init_box(box, uncore_mmio_box_ctl(box), mem_offset);
+}
+
+static struct intel_uncore_ops icx_uncore_imc_freerunning_ops = {
+ .init_box = icx_uncore_imc_freerunning_init_box,
+ .exit_box = uncore_mmio_exit_box,
+ .read_counter = uncore_mmio_read_counter,
+ .hw_config = uncore_freerunning_hw_config,
+};
+
+static struct intel_uncore_type icx_uncore_imc_free_running = {
+ .name = "imc_free_running",
+ .num_counters = 5,
+ .num_boxes = 4,
+ .num_freerunning_types = ICX_IMC_FREERUNNING_TYPE_MAX,
+ .freerunning = icx_imc_freerunning,
+ .ops = &icx_uncore_imc_freerunning_ops,
+ .event_descs = icx_uncore_imc_freerunning_events,
+ .format_group = &skx_uncore_iio_freerunning_format_group,
+};
+
+static struct intel_uncore_type *icx_mmio_uncores[] = {
+ &icx_uncore_imc,
+ &icx_uncore_imc_free_running,
+ NULL,
+};
+
+void icx_uncore_mmio_init(void)
+{
+ uncore_mmio_uncores = icx_mmio_uncores;
+}
+
+/* end of ICX uncore support */
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
index f71a0cce9373..430fca13bb56 100644
--- a/arch/x86/include/asm/dwarf2.h
+++ b/arch/x86/include/asm/dwarf2.h
@@ -6,15 +6,6 @@
#warning "asm/dwarf2.h should be only included in pure assembly files"
#endif
-/*
- * Macros for dwarf2 CFI unwind table entries.
- * See "as.info" for details on these pseudo ops. Unfortunately
- * they are only supported in very new binutils, so define them
- * away for older version.
- */
-
-#ifdef CONFIG_AS_CFI
-
#define CFI_STARTPROC .cfi_startproc
#define CFI_ENDPROC .cfi_endproc
#define CFI_DEF_CFA .cfi_def_cfa
@@ -30,13 +21,6 @@
#define CFI_UNDEFINED .cfi_undefined
#define CFI_ESCAPE .cfi_escape
-#ifdef CONFIG_AS_CFI_SIGNAL_FRAME
-#define CFI_SIGNAL_FRAME .cfi_signal_frame
-#else
-#define CFI_SIGNAL_FRAME
-#endif
-
-#if defined(CONFIG_AS_CFI_SECTIONS) && defined(__ASSEMBLY__)
#ifndef BUILD_VDSO
/*
* Emit CFI data in .debug_frame sections, not .eh_frame sections.
@@ -53,33 +37,5 @@
*/
.cfi_sections .eh_frame, .debug_frame
#endif
-#endif
-
-#else
-
-/*
- * Due to the structure of pre-exisiting code, don't use assembler line
- * comment character # to ignore the arguments. Instead, use a dummy macro.
- */
-.macro cfi_ignore a=0, b=0, c=0, d=0
-.endm
-
-#define CFI_STARTPROC cfi_ignore
-#define CFI_ENDPROC cfi_ignore
-#define CFI_DEF_CFA cfi_ignore
-#define CFI_DEF_CFA_REGISTER cfi_ignore
-#define CFI_DEF_CFA_OFFSET cfi_ignore
-#define CFI_ADJUST_CFA_OFFSET cfi_ignore
-#define CFI_OFFSET cfi_ignore
-#define CFI_REL_OFFSET cfi_ignore
-#define CFI_REGISTER cfi_ignore
-#define CFI_RESTORE cfi_ignore
-#define CFI_REMEMBER_STATE cfi_ignore
-#define CFI_RESTORE_STATE cfi_ignore
-#define CFI_UNDEFINED cfi_ignore
-#define CFI_ESCAPE cfi_ignore
-#define CFI_SIGNAL_FRAME cfi_ignore
-
-#endif
#endif /* _ASM_X86_DWARF2_H */
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index c85e15010f48..a506a411474d 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -35,9 +35,7 @@
#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
-#define VM_DATA_DEFAULT_FLAGS \
- (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
- VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC
#define __PHYSICAL_START ALIGN(CONFIG_PHYSICAL_START, \
CONFIG_PHYSICAL_ALIGN)
@@ -73,9 +71,6 @@ static inline phys_addr_t get_max_mapped(void)
bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn);
-extern unsigned long init_memory_mapping(unsigned long start,
- unsigned long end);
-
extern void initmem_init(void);
#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index afda66a6d325..4d02e64af1b3 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -25,6 +25,7 @@
#include <asm/x86_init.h>
#include <asm/fpu/xstate.h>
#include <asm/fpu/api.h>
+#include <asm-generic/pgtable_uffd.h>
extern pgd_t early_top_pgt[PTRS_PER_PGD];
int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
@@ -313,6 +314,23 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
return native_make_pte(v & ~clear);
}
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
+static inline int pte_uffd_wp(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_UFFD_WP;
+}
+
+static inline pte_t pte_mkuffd_wp(pte_t pte)
+{
+ return pte_set_flags(pte, _PAGE_UFFD_WP);
+}
+
+static inline pte_t pte_clear_uffd_wp(pte_t pte)
+{
+ return pte_clear_flags(pte, _PAGE_UFFD_WP);
+}
+#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
+
static inline pte_t pte_mkclean(pte_t pte)
{
return pte_clear_flags(pte, _PAGE_DIRTY);
@@ -392,6 +410,23 @@ static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
return native_make_pmd(v & ~clear);
}
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
+static inline int pmd_uffd_wp(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_UFFD_WP;
+}
+
+static inline pmd_t pmd_mkuffd_wp(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_UFFD_WP);
+}
+
+static inline pmd_t pmd_clear_uffd_wp(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_UFFD_WP);
+}
+#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
+
static inline pmd_t pmd_mkold(pmd_t pmd)
{
return pmd_clear_flags(pmd, _PAGE_ACCESSED);
@@ -825,7 +860,10 @@ static inline unsigned long pmd_index(unsigned long address)
*
* this function returns the index of the entry in the pte page which would
* control the given virtual address
+ *
+ * Also define macro so we can test if pte_index is defined for arch.
*/
+#define pte_index pte_index
static inline unsigned long pte_index(unsigned long address)
{
return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
@@ -1043,6 +1081,9 @@ static inline void __meminit init_trampoline_default(void)
void __init poking_init(void);
+unsigned long init_memory_mapping(unsigned long start,
+ unsigned long end, pgprot_t prot);
+
# ifdef CONFIG_RANDOMIZE_MEMORY
void __meminit init_trampoline(void);
# else
@@ -1374,6 +1415,38 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
#endif
#endif
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
+static inline pte_t pte_swp_mkuffd_wp(pte_t pte)
+{
+ return pte_set_flags(pte, _PAGE_SWP_UFFD_WP);
+}
+
+static inline int pte_swp_uffd_wp(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_SWP_UFFD_WP;
+}
+
+static inline pte_t pte_swp_clear_uffd_wp(pte_t pte)
+{
+ return pte_clear_flags(pte, _PAGE_SWP_UFFD_WP);
+}
+
+static inline pmd_t pmd_swp_mkuffd_wp(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_SWP_UFFD_WP);
+}
+
+static inline int pmd_swp_uffd_wp(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_SWP_UFFD_WP;
+}
+
+static inline pmd_t pmd_swp_clear_uffd_wp(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_SWP_UFFD_WP);
+}
+#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
+
#define PKRU_AD_BIT 0x1
#define PKRU_WD_BIT 0x2
#define PKRU_BITS_PER_PKEY 2
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 0b6c4042942a..df1373415f11 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -189,7 +189,7 @@ extern void sync_global_pgds(unsigned long start, unsigned long end);
*
* | ... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number
* | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names
- * | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| X|X|SD|0| <- swp entry
+ * | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| X|F|SD|0| <- swp entry
*
* G (8) is aliased and used as a PROT_NONE indicator for
* !present ptes. We need to start storing swap entries above
@@ -197,9 +197,15 @@ extern void sync_global_pgds(unsigned long start, unsigned long end);
* erratum where they can be incorrectly set by hardware on
* non-present PTEs.
*
+ * SD Bits 1-4 are not used in non-present format and available for
+ * special use described below:
+ *
* SD (1) in swp entry is used to store soft dirty bit, which helps us
* remember soft dirty over page migration
*
+ * F (2) in swp entry is used to record when a pagetable is
+ * writeprotected by userfaultfd WP support.
+ *
* Bit 7 in swp entry should be 0 because pmd_present checks not only P,
* but also L and G.
*
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 65c2ecd730c5..b6606fe6cfdf 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -32,6 +32,7 @@
#define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1
#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
+#define _PAGE_BIT_UFFD_WP _PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */
#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
#define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4
@@ -100,6 +101,14 @@
#define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0))
#endif
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
+#define _PAGE_UFFD_WP (_AT(pteval_t, 1) << _PAGE_BIT_UFFD_WP)
+#define _PAGE_SWP_UFFD_WP _PAGE_USER
+#else
+#define _PAGE_UFFD_WP (_AT(pteval_t, 0))
+#define _PAGE_SWP_UFFD_WP (_AT(pteval_t, 0))
+#endif
+
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
#define _PAGE_DEVMAP (_AT(u64, 1) << _PAGE_BIT_DEVMAP)
@@ -118,7 +127,8 @@
*/
#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
_PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \
- _PAGE_SOFT_DIRTY | _PAGE_DEVMAP | _PAGE_ENC)
+ _PAGE_SOFT_DIRTY | _PAGE_DEVMAP | _PAGE_ENC | \
+ _PAGE_UFFD_WP)
#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
/*
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index 950532ccbc4a..ec2c0a094b5d 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -34,6 +34,7 @@
* The caller is required to take care of these.
*/
+int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot);
int _set_memory_uc(unsigned long addr, int numpages);
int _set_memory_wc(unsigned long addr, int numpages);
int _set_memory_wt(unsigned long addr, int numpages);
diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h
index d61ddf3d052b..0c4e5b5e3852 100644
--- a/arch/x86/include/asm/xor_avx.h
+++ b/arch/x86/include/asm/xor_avx.h
@@ -11,8 +11,6 @@
* Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
*/
-#ifdef CONFIG_AS_AVX
-
#include <linux/compiler.h>
#include <asm/fpu/api.h>
@@ -170,11 +168,4 @@ do { \
#define AVX_SELECT(FASTEST) \
(boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST)
-#else
-
-#define AVX_XOR_SPEED {}
-
-#define AVX_SELECT(FASTEST) (FASTEST)
-
-#endif
#endif
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 1ae5439a9a85..683ed9e12e6b 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -45,7 +45,7 @@ EXPORT_SYMBOL(acpi_disabled);
#define PREFIX "ACPI: "
int acpi_noirq; /* skip ACPI IRQ initialization */
-int acpi_nobgrt; /* skip ACPI BGRT */
+static int acpi_nobgrt; /* skip ACPI BGRT */
int acpi_pci_disabled; /* skip ACPI PCI scan and IRQ initialization */
EXPORT_SYMBOL(acpi_pci_disabled);
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 4e5f50236048..16133819415c 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -744,7 +744,8 @@ int __init gart_iommu_init(void)
start_pfn = PFN_DOWN(aper_base);
if (!pfn_range_is_mapped(start_pfn, end_pfn))
- init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
+ init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT,
+ PAGE_KERNEL);
pr_info("PCI-DMA: using GART IOMMU.\n");
iommu_size = check_iommu_size(info.aper_base, aper_size);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index e6b545047f38..4b3fa6cd3106 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -16,6 +16,7 @@
#include <linux/pci.h>
#include <linux/root_dev.h>
#include <linux/sfi.h>
+#include <linux/hugetlb.h>
#include <linux/tboot.h>
#include <linux/usb/xhci-dbgp.h>
@@ -1157,6 +1158,9 @@ void __init setup_arch(char **cmdline_p)
initmem_init();
dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT);
+ if (boot_cpu_has(X86_FEATURE_GBPAGES))
+ hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT);
+
/*
* Reserve memory for crash kernel after SRAT is parsed so that it
* won't consume hotpluggable memory.
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 9fea0757db92..d8154e0684b6 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -107,8 +107,4 @@ config KVM_MMU_AUDIT
This option adds a R/W kVM module parameter 'mmu_audit', which allows
auditing of KVM MMU events at runtime.
-# OK, it's a little counter-intuitive to do this, but it puts it neatly under
-# the virtualization menu.
-source "drivers/vhost/Kconfig"
-
endif # VIRTUALIZATION
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index e553f0fdd87d..a789759b7261 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -14,7 +14,7 @@ kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \
hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o
kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o vmx/nested.o
-kvm-amd-y += svm.o pmu_amd.o
+kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o
obj-$(CONFIG_KVM) += kvm.o
obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index ca80daf8f878..9af25c97612a 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -59,9 +59,6 @@
#define MAX_APIC_VECTOR 256
#define APIC_VECTORS_PER_REG 32
-#define APIC_BROADCAST 0xFF
-#define X2APIC_BROADCAST 0xFFFFFFFFul
-
static bool lapic_timer_advance_dynamic __read_mostly;
#define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100 /* clock cycles */
#define LAPIC_TIMER_ADVANCE_ADJUST_MAX 10000 /* clock cycles */
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 40ed6ed22751..a0ffb4331418 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -17,6 +17,9 @@
#define APIC_BUS_CYCLE_NS 1
#define APIC_BUS_FREQUENCY (1000000000ULL / APIC_BUS_CYCLE_NS)
+#define APIC_BROADCAST 0xFF
+#define X2APIC_BROADCAST 0xFFFFFFFFul
+
enum lapic_mode {
LAPIC_MODE_DISABLED = 0,
LAPIC_MODE_INVALID = X2APIC_ENABLE,
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
new file mode 100644
index 000000000000..e80daa98682f
--- /dev/null
+++ b/arch/x86/kvm/svm/avic.c
@@ -0,0 +1,1027 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * AMD SVM support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ */
+
+#define pr_fmt(fmt) "SVM: " fmt
+
+#include <linux/kvm_types.h>
+#include <linux/hashtable.h>
+#include <linux/amd-iommu.h>
+#include <linux/kvm_host.h>
+
+#include <asm/irq_remapping.h>
+
+#include "trace.h"
+#include "lapic.h"
+#include "x86.h"
+#include "irq.h"
+#include "svm.h"
+
+/* enable / disable AVIC */
+int avic;
+#ifdef CONFIG_X86_LOCAL_APIC
+module_param(avic, int, S_IRUGO);
+#endif
+
+#define SVM_AVIC_DOORBELL 0xc001011b
+
+#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF)
+
+/*
+ * 0xff is broadcast, so the max index allowed for physical APIC ID
+ * table is 0xfe. APIC IDs above 0xff are reserved.
+ */
+#define AVIC_MAX_PHYSICAL_ID_COUNT 255
+
+#define AVIC_UNACCEL_ACCESS_WRITE_MASK 1
+#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0
+#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF
+
+/* AVIC GATAG is encoded using VM and VCPU IDs */
+#define AVIC_VCPU_ID_BITS 8
+#define AVIC_VCPU_ID_MASK ((1 << AVIC_VCPU_ID_BITS) - 1)
+
+#define AVIC_VM_ID_BITS 24
+#define AVIC_VM_ID_NR (1 << AVIC_VM_ID_BITS)
+#define AVIC_VM_ID_MASK ((1 << AVIC_VM_ID_BITS) - 1)
+
+#define AVIC_GATAG(x, y) (((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \
+ (y & AVIC_VCPU_ID_MASK))
+#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK)
+#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK)
+
+/* Note:
+ * This hash table is used to map VM_ID to a struct kvm_svm,
+ * when handling AMD IOMMU GALOG notification to schedule in
+ * a particular vCPU.
+ */
+#define SVM_VM_DATA_HASH_BITS 8
+static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
+static u32 next_vm_id = 0;
+static bool next_vm_id_wrapped = 0;
+static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
+
+/*
+ * This is a wrapper of struct amd_iommu_ir_data.
+ */
+struct amd_svm_iommu_ir {
+ struct list_head node; /* Used by SVM for per-vcpu ir_list */
+ void *data; /* Storing pointer to struct amd_ir_data */
+};
+
+enum avic_ipi_failure_cause {
+ AVIC_IPI_FAILURE_INVALID_INT_TYPE,
+ AVIC_IPI_FAILURE_TARGET_NOT_RUNNING,
+ AVIC_IPI_FAILURE_INVALID_TARGET,
+ AVIC_IPI_FAILURE_INVALID_BACKING_PAGE,
+};
+
+/* Note:
+ * This function is called from IOMMU driver to notify
+ * SVM to schedule in a particular vCPU of a particular VM.
+ */
+int avic_ga_log_notifier(u32 ga_tag)
+{
+ unsigned long flags;
+ struct kvm_svm *kvm_svm;
+ struct kvm_vcpu *vcpu = NULL;
+ u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
+ u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
+
+ pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
+ trace_kvm_avic_ga_log(vm_id, vcpu_id);
+
+ spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+ hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
+ if (kvm_svm->avic_vm_id != vm_id)
+ continue;
+ vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id);
+ break;
+ }
+ spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+
+ /* Note:
+ * At this point, the IOMMU should have already set the pending
+ * bit in the vAPIC backing page. So, we just need to schedule
+ * in the vcpu.
+ */
+ if (vcpu)
+ kvm_vcpu_wake_up(vcpu);
+
+ return 0;
+}
+
+void avic_vm_destroy(struct kvm *kvm)
+{
+ unsigned long flags;
+ struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+
+ if (!avic)
+ return;
+
+ if (kvm_svm->avic_logical_id_table_page)
+ __free_page(kvm_svm->avic_logical_id_table_page);
+ if (kvm_svm->avic_physical_id_table_page)
+ __free_page(kvm_svm->avic_physical_id_table_page);
+
+ spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+ hash_del(&kvm_svm->hnode);
+ spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+}
+
+int avic_vm_init(struct kvm *kvm)
+{
+ unsigned long flags;
+ int err = -ENOMEM;
+ struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+ struct kvm_svm *k2;
+ struct page *p_page;
+ struct page *l_page;
+ u32 vm_id;
+
+ if (!avic)
+ return 0;
+
+ /* Allocating physical APIC ID table (4KB) */
+ p_page = alloc_page(GFP_KERNEL_ACCOUNT);
+ if (!p_page)
+ goto free_avic;
+
+ kvm_svm->avic_physical_id_table_page = p_page;
+ clear_page(page_address(p_page));
+
+ /* Allocating logical APIC ID table (4KB) */
+ l_page = alloc_page(GFP_KERNEL_ACCOUNT);
+ if (!l_page)
+ goto free_avic;
+
+ kvm_svm->avic_logical_id_table_page = l_page;
+ clear_page(page_address(l_page));
+
+ spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+ again:
+ vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
+ if (vm_id == 0) { /* id is 1-based, zero is not okay */
+ next_vm_id_wrapped = 1;
+ goto again;
+ }
+ /* Is it still in use? Only possible if wrapped at least once */
+ if (next_vm_id_wrapped) {
+ hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) {
+ if (k2->avic_vm_id == vm_id)
+ goto again;
+ }
+ }
+ kvm_svm->avic_vm_id = vm_id;
+ hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
+ spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+
+ return 0;
+
+free_avic:
+ avic_vm_destroy(kvm);
+ return err;
+}
+
+void avic_init_vmcb(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = svm->vmcb;
+ struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
+ phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
+ phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
+ phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page));
+
+ vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
+ vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
+ vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
+ vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT;
+ if (kvm_apicv_activated(svm->vcpu.kvm))
+ vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
+ else
+ vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
+}
+
+static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
+ unsigned int index)
+{
+ u64 *avic_physical_id_table;
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
+
+ if (index >= AVIC_MAX_PHYSICAL_ID_COUNT)
+ return NULL;
+
+ avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);
+
+ return &avic_physical_id_table[index];
+}
+
+/**
+ * Note:
+ * AVIC hardware walks the nested page table to check permissions,
+ * but does not use the SPA address specified in the leaf page
+ * table entry since it uses address in the AVIC_BACKING_PAGE pointer
+ * field of the VMCB. Therefore, we set up the
+ * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
+ */
+static int avic_update_access_page(struct kvm *kvm, bool activate)
+{
+ int ret = 0;
+
+ mutex_lock(&kvm->slots_lock);
+ /*
+ * During kvm_destroy_vm(), kvm_pit_set_reinject() could trigger
+ * APICv mode change, which update APIC_ACCESS_PAGE_PRIVATE_MEMSLOT
+ * memory region. So, we need to ensure that kvm->mm == current->mm.
+ */
+ if ((kvm->arch.apic_access_page_done == activate) ||
+ (kvm->mm != current->mm))
+ goto out;
+
+ ret = __x86_set_memory_region(kvm,
+ APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
+ APIC_DEFAULT_PHYS_BASE,
+ activate ? PAGE_SIZE : 0);
+ if (ret)
+ goto out;
+
+ kvm->arch.apic_access_page_done = activate;
+out:
+ mutex_unlock(&kvm->slots_lock);
+ return ret;
+}
+
+static int avic_init_backing_page(struct kvm_vcpu *vcpu)
+{
+ u64 *entry, new_entry;
+ int id = vcpu->vcpu_id;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (id >= AVIC_MAX_PHYSICAL_ID_COUNT)
+ return -EINVAL;
+
+ if (!svm->vcpu.arch.apic->regs)
+ return -EINVAL;
+
+ if (kvm_apicv_activated(vcpu->kvm)) {
+ int ret;
+
+ ret = avic_update_access_page(vcpu->kvm, true);
+ if (ret)
+ return ret;
+ }
+
+ svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs);
+
+ /* Setting AVIC backing page address in the phy APIC ID table */
+ entry = avic_get_physical_id_entry(vcpu, id);
+ if (!entry)
+ return -EINVAL;
+
+ new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
+ AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
+ AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
+ WRITE_ONCE(*entry, new_entry);
+
+ svm->avic_physical_id_cache = entry;
+
+ return 0;
+}
+
+int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
+{
+ u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
+ u32 icrl = svm->vmcb->control.exit_info_1;
+ u32 id = svm->vmcb->control.exit_info_2 >> 32;
+ u32 index = svm->vmcb->control.exit_info_2 & 0xFF;
+ struct kvm_lapic *apic = svm->vcpu.arch.apic;
+
+ trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index);
+
+ switch (id) {
+ case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
+ /*
+ * AVIC hardware handles the generation of
+ * IPIs when the specified Message Type is Fixed
+ * (also known as fixed delivery mode) and
+ * the Trigger Mode is edge-triggered. The hardware
+ * also supports self and broadcast delivery modes
+ * specified via the Destination Shorthand(DSH)
+ * field of the ICRL. Logical and physical APIC ID
+ * formats are supported. All other IPI types cause
+ * a #VMEXIT, which needs to emulated.
+ */
+ kvm_lapic_reg_write(apic, APIC_ICR2, icrh);
+ kvm_lapic_reg_write(apic, APIC_ICR, icrl);
+ break;
+ case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: {
+ int i;
+ struct kvm_vcpu *vcpu;
+ struct kvm *kvm = svm->vcpu.kvm;
+ struct kvm_lapic *apic = svm->vcpu.arch.apic;
+
+ /*
+ * At this point, we expect that the AVIC HW has already
+ * set the appropriate IRR bits on the valid target
+ * vcpus. So, we just need to kick the appropriate vcpu.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ bool m = kvm_apic_match_dest(vcpu, apic,
+ icrl & APIC_SHORT_MASK,
+ GET_APIC_DEST_FIELD(icrh),
+ icrl & APIC_DEST_MASK);
+
+ if (m && !avic_vcpu_is_running(vcpu))
+ kvm_vcpu_wake_up(vcpu);
+ }
+ break;
+ }
+ case AVIC_IPI_FAILURE_INVALID_TARGET:
+ WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n",
+ index, svm->vcpu.vcpu_id, icrh, icrl);
+ break;
+ case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
+ WARN_ONCE(1, "Invalid backing page\n");
+ break;
+ default:
+ pr_err("Unknown IPI interception\n");
+ }
+
+ return 1;
+}
+
+static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
+{
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
+ int index;
+ u32 *logical_apic_id_table;
+ int dlid = GET_APIC_LOGICAL_ID(ldr);
+
+ if (!dlid)
+ return NULL;
+
+ if (flat) { /* flat */
+ index = ffs(dlid) - 1;
+ if (index > 7)
+ return NULL;
+ } else { /* cluster */
+ int cluster = (dlid & 0xf0) >> 4;
+ int apic = ffs(dlid & 0x0f) - 1;
+
+ if ((apic < 0) || (apic > 7) ||
+ (cluster >= 0xf))
+ return NULL;
+ index = (cluster << 2) + apic;
+ }
+
+ logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page);
+
+ return &logical_apic_id_table[index];
+}
+
+static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
+{
+ bool flat;
+ u32 *entry, new_entry;
+
+ flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
+ entry = avic_get_logical_id_entry(vcpu, ldr, flat);
+ if (!entry)
+ return -EINVAL;
+
+ new_entry = READ_ONCE(*entry);
+ new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
+ new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
+ new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
+ WRITE_ONCE(*entry, new_entry);
+
+ return 0;
+}
+
+static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ bool flat = svm->dfr_reg == APIC_DFR_FLAT;
+ u32 *entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
+
+ if (entry)
+ clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry);
+}
+
+static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
+{
+ int ret = 0;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
+ u32 id = kvm_xapic_id(vcpu->arch.apic);
+
+ if (ldr == svm->ldr_reg)
+ return 0;
+
+ avic_invalidate_logical_id_entry(vcpu);
+
+ if (ldr)
+ ret = avic_ldr_write(vcpu, id, ldr);
+
+ if (!ret)
+ svm->ldr_reg = ldr;
+
+ return ret;
+}
+
+static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu)
+{
+ u64 *old, *new;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 id = kvm_xapic_id(vcpu->arch.apic);
+
+ if (vcpu->vcpu_id == id)
+ return 0;
+
+ old = avic_get_physical_id_entry(vcpu, vcpu->vcpu_id);
+ new = avic_get_physical_id_entry(vcpu, id);
+ if (!new || !old)
+ return 1;
+
+ /* We need to move physical_id_entry to new offset */
+ *new = *old;
+ *old = 0ULL;
+ to_svm(vcpu)->avic_physical_id_cache = new;
+
+ /*
+ * Also update the guest physical APIC ID in the logical
+ * APIC ID table entry if already setup the LDR.
+ */
+ if (svm->ldr_reg)
+ avic_handle_ldr_update(vcpu);
+
+ return 0;
+}
+
+static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
+
+ if (svm->dfr_reg == dfr)
+ return;
+
+ avic_invalidate_logical_id_entry(vcpu);
+ svm->dfr_reg = dfr;
+}
+
+static int avic_unaccel_trap_write(struct vcpu_svm *svm)
+{
+ struct kvm_lapic *apic = svm->vcpu.arch.apic;
+ u32 offset = svm->vmcb->control.exit_info_1 &
+ AVIC_UNACCEL_ACCESS_OFFSET_MASK;
+
+ switch (offset) {
+ case APIC_ID:
+ if (avic_handle_apic_id_update(&svm->vcpu))
+ return 0;
+ break;
+ case APIC_LDR:
+ if (avic_handle_ldr_update(&svm->vcpu))
+ return 0;
+ break;
+ case APIC_DFR:
+ avic_handle_dfr_update(&svm->vcpu);
+ break;
+ default:
+ break;
+ }
+
+ kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
+
+ return 1;
+}
+
+static bool is_avic_unaccelerated_access_trap(u32 offset)
+{
+ bool ret = false;
+
+ switch (offset) {
+ case APIC_ID:
+ case APIC_EOI:
+ case APIC_RRR:
+ case APIC_LDR:
+ case APIC_DFR:
+ case APIC_SPIV:
+ case APIC_ESR:
+ case APIC_ICR:
+ case APIC_LVTT:
+ case APIC_LVTTHMR:
+ case APIC_LVTPC:
+ case APIC_LVT0:
+ case APIC_LVT1:
+ case APIC_LVTERR:
+ case APIC_TMICT:
+ case APIC_TDCR:
+ ret = true;
+ break;
+ default:
+ break;
+ }
+ return ret;
+}
+
+int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
+{
+ int ret = 0;
+ u32 offset = svm->vmcb->control.exit_info_1 &
+ AVIC_UNACCEL_ACCESS_OFFSET_MASK;
+ u32 vector = svm->vmcb->control.exit_info_2 &
+ AVIC_UNACCEL_ACCESS_VECTOR_MASK;
+ bool write = (svm->vmcb->control.exit_info_1 >> 32) &
+ AVIC_UNACCEL_ACCESS_WRITE_MASK;
+ bool trap = is_avic_unaccelerated_access_trap(offset);
+
+ trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset,
+ trap, write, vector);
+ if (trap) {
+ /* Handling Trap */
+ WARN_ONCE(!write, "svm: Handling trap read.\n");
+ ret = avic_unaccel_trap_write(svm);
+ } else {
+ /* Handling Fault */
+ ret = kvm_emulate_instruction(&svm->vcpu, 0);
+ }
+
+ return ret;
+}
+
+int avic_init_vcpu(struct vcpu_svm *svm)
+{
+ int ret;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ if (!avic || !irqchip_in_kernel(vcpu->kvm))
+ return 0;
+
+ ret = avic_init_backing_page(&svm->vcpu);
+ if (ret)
+ return ret;
+
+ INIT_LIST_HEAD(&svm->ir_list);
+ spin_lock_init(&svm->ir_list_lock);
+ svm->dfr_reg = APIC_DFR_FLAT;
+
+ return ret;
+}
+
+void avic_post_state_restore(struct kvm_vcpu *vcpu)
+{
+ if (avic_handle_apic_id_update(vcpu) != 0)
+ return;
+ avic_handle_dfr_update(vcpu);
+ avic_handle_ldr_update(vcpu);
+}
+
+void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate)
+{
+ if (!avic || !lapic_in_kernel(vcpu))
+ return;
+
+ srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+ kvm_request_apicv_update(vcpu->kvm, activate,
+ APICV_INHIBIT_REASON_IRQWIN);
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+}
+
+void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
+{
+ return;
+}
+
+void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
+{
+}
+
+void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
+{
+}
+
+static int svm_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
+{
+ int ret = 0;
+ unsigned long flags;
+ struct amd_svm_iommu_ir *ir;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm))
+ return 0;
+
+ /*
+ * Here, we go through the per-vcpu ir_list to update all existing
+ * interrupt remapping table entry targeting this vcpu.
+ */
+ spin_lock_irqsave(&svm->ir_list_lock, flags);
+
+ if (list_empty(&svm->ir_list))
+ goto out;
+
+ list_for_each_entry(ir, &svm->ir_list, node) {
+ if (activate)
+ ret = amd_iommu_activate_guest_mode(ir->data);
+ else
+ ret = amd_iommu_deactivate_guest_mode(ir->data);
+ if (ret)
+ break;
+ }
+out:
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+ return ret;
+}
+
+void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
+ bool activated = kvm_vcpu_apicv_active(vcpu);
+
+ if (!avic)
+ return;
+
+ if (activated) {
+ /**
+ * During AVIC temporary deactivation, guest could update
+ * APIC ID, DFR and LDR registers, which would not be trapped
+ * by avic_unaccelerated_access_interception(). In this case,
+ * we need to check and update the AVIC logical APIC ID table
+ * accordingly before re-activating.
+ */
+ avic_post_state_restore(vcpu);
+ vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
+ } else {
+ vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
+ }
+ mark_dirty(vmcb, VMCB_AVIC);
+
+ svm_set_pi_irte_mode(vcpu, activated);
+}
+
+void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+{
+ return;
+}
+
+int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
+{
+ if (!vcpu->arch.apicv_active)
+ return -1;
+
+ kvm_lapic_set_irr(vec, vcpu->arch.apic);
+ smp_mb__after_atomic();
+
+ if (avic_vcpu_is_running(vcpu)) {
+ int cpuid = vcpu->cpu;
+
+ if (cpuid != get_cpu())
+ wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpuid));
+ put_cpu();
+ } else
+ kvm_vcpu_wake_up(vcpu);
+
+ return 0;
+}
+
+bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+
+static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+{
+ unsigned long flags;
+ struct amd_svm_iommu_ir *cur;
+
+ spin_lock_irqsave(&svm->ir_list_lock, flags);
+ list_for_each_entry(cur, &svm->ir_list, node) {
+ if (cur->data != pi->ir_data)
+ continue;
+ list_del(&cur->node);
+ kfree(cur);
+ break;
+ }
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+}
+
+static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+{
+ int ret = 0;
+ unsigned long flags;
+ struct amd_svm_iommu_ir *ir;
+
+ /**
+ * In some cases, the existing irte is updaed and re-set,
+ * so we need to check here if it's already been * added
+ * to the ir_list.
+ */
+ if (pi->ir_data && (pi->prev_ga_tag != 0)) {
+ struct kvm *kvm = svm->vcpu.kvm;
+ u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
+ struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
+ struct vcpu_svm *prev_svm;
+
+ if (!prev_vcpu) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ prev_svm = to_svm(prev_vcpu);
+ svm_ir_list_del(prev_svm, pi);
+ }
+
+ /**
+ * Allocating new amd_iommu_pi_data, which will get
+ * add to the per-vcpu ir_list.
+ */
+ ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT);
+ if (!ir) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ir->data = pi->ir_data;
+
+ spin_lock_irqsave(&svm->ir_list_lock, flags);
+ list_add(&ir->node, &svm->ir_list);
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+out:
+ return ret;
+}
+
+/**
+ * Note:
+ * The HW cannot support posting multicast/broadcast
+ * interrupts to a vCPU. So, we still use legacy interrupt
+ * remapping for these kind of interrupts.
+ *
+ * For lowest-priority interrupts, we only support
+ * those with single CPU as the destination, e.g. user
+ * configures the interrupts via /proc/irq or uses
+ * irqbalance to make the interrupts single-CPU.
+ */
+static int
+get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
+ struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
+{
+ struct kvm_lapic_irq irq;
+ struct kvm_vcpu *vcpu = NULL;
+
+ kvm_set_msi_irq(kvm, e, &irq);
+
+ if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
+ !kvm_irq_is_postable(&irq)) {
+ pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
+ __func__, irq.vector);
+ return -1;
+ }
+
+ pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
+ irq.vector);
+ *svm = to_svm(vcpu);
+ vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page));
+ vcpu_info->vector = irq.vector;
+
+ return 0;
+}
+
+/*
+ * svm_update_pi_irte - set IRTE for Posted-Interrupts
+ *
+ * @kvm: kvm
+ * @host_irq: host irq of the interrupt
+ * @guest_irq: gsi of the interrupt
+ * @set: set or unset PI
+ * returns 0 on success, < 0 on failure
+ */
+int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set)
+{
+ struct kvm_kernel_irq_routing_entry *e;
+ struct kvm_irq_routing_table *irq_rt;
+ int idx, ret = -EINVAL;
+
+ if (!kvm_arch_has_assigned_device(kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP))
+ return 0;
+
+ pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
+ __func__, host_irq, guest_irq, set);
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+ WARN_ON(guest_irq >= irq_rt->nr_rt_entries);
+
+ hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+ struct vcpu_data vcpu_info;
+ struct vcpu_svm *svm = NULL;
+
+ if (e->type != KVM_IRQ_ROUTING_MSI)
+ continue;
+
+ /**
+ * Here, we setup with legacy mode in the following cases:
+ * 1. When cannot target interrupt to a specific vcpu.
+ * 2. Unsetting posted interrupt.
+ * 3. APIC virtialization is disabled for the vcpu.
+ * 4. IRQ has incompatible delivery mode (SMI, INIT, etc)
+ */
+ if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
+ kvm_vcpu_apicv_active(&svm->vcpu)) {
+ struct amd_iommu_pi_data pi;
+
+ /* Try to enable guest_mode in IRTE */
+ pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
+ AVIC_HPA_MASK);
+ pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
+ svm->vcpu.vcpu_id);
+ pi.is_guest_mode = true;
+ pi.vcpu_data = &vcpu_info;
+ ret = irq_set_vcpu_affinity(host_irq, &pi);
+
+ /**
+ * Here, we successfully setting up vcpu affinity in
+ * IOMMU guest mode. Now, we need to store the posted
+ * interrupt information in a per-vcpu ir_list so that
+ * we can reference to them directly when we update vcpu
+ * scheduling information in IOMMU irte.
+ */
+ if (!ret && pi.is_guest_mode)
+ svm_ir_list_add(svm, &pi);
+ } else {
+ /* Use legacy mode in IRTE */
+ struct amd_iommu_pi_data pi;
+
+ /**
+ * Here, pi is used to:
+ * - Tell IOMMU to use legacy mode for this interrupt.
+ * - Retrieve ga_tag of prior interrupt remapping data.
+ */
+ pi.is_guest_mode = false;
+ ret = irq_set_vcpu_affinity(host_irq, &pi);
+
+ /**
+ * Check if the posted interrupt was previously
+ * setup with the guest_mode by checking if the ga_tag
+ * was cached. If so, we need to clean up the per-vcpu
+ * ir_list.
+ */
+ if (!ret && pi.prev_ga_tag) {
+ int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
+ struct kvm_vcpu *vcpu;
+
+ vcpu = kvm_get_vcpu_by_id(kvm, id);
+ if (vcpu)
+ svm_ir_list_del(to_svm(vcpu), &pi);
+ }
+ }
+
+ if (!ret && svm) {
+ trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id,
+ e->gsi, vcpu_info.vector,
+ vcpu_info.pi_desc_addr, set);
+ }
+
+ if (ret < 0) {
+ pr_err("%s: failed to update PI IRTE\n", __func__);
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+ return ret;
+}
+
+bool svm_check_apicv_inhibit_reasons(ulong bit)
+{
+ ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
+ BIT(APICV_INHIBIT_REASON_HYPERV) |
+ BIT(APICV_INHIBIT_REASON_NESTED) |
+ BIT(APICV_INHIBIT_REASON_IRQWIN) |
+ BIT(APICV_INHIBIT_REASON_PIT_REINJ) |
+ BIT(APICV_INHIBIT_REASON_X2APIC);
+
+ return supported & BIT(bit);
+}
+
+void svm_pre_update_apicv_exec_ctrl(struct kvm *kvm, bool activate)
+{
+ avic_update_access_page(kvm, activate);
+}
+
+static inline int
+avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
+{
+ int ret = 0;
+ unsigned long flags;
+ struct amd_svm_iommu_ir *ir;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm))
+ return 0;
+
+ /*
+ * Here, we go through the per-vcpu ir_list to update all existing
+ * interrupt remapping table entry targeting this vcpu.
+ */
+ spin_lock_irqsave(&svm->ir_list_lock, flags);
+
+ if (list_empty(&svm->ir_list))
+ goto out;
+
+ list_for_each_entry(ir, &svm->ir_list, node) {
+ ret = amd_iommu_update_ga(cpu, r, ir->data);
+ if (ret)
+ break;
+ }
+out:
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+ return ret;
+}
+
+void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ u64 entry;
+ /* ID = 0xff (broadcast), ID > 0xff (reserved) */
+ int h_physical_id = kvm_cpu_get_apicid(cpu);
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ /*
+ * Since the host physical APIC id is 8 bits,
+ * we can support host APIC ID upto 255.
+ */
+ if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
+ return;
+
+ entry = READ_ONCE(*(svm->avic_physical_id_cache));
+ WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
+
+ entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
+ entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
+
+ entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+ if (svm->avic_is_running)
+ entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+
+ WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+ avic_update_iommu_vcpu_affinity(vcpu, h_physical_id,
+ svm->avic_is_running);
+}
+
+void avic_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ u64 entry;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ entry = READ_ONCE(*(svm->avic_physical_id_cache));
+ if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
+ avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
+
+ entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+ WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+}
+
+/**
+ * This function is called during VCPU halt/unhalt.
+ */
+static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->avic_is_running = is_run;
+ if (is_run)
+ avic_vcpu_load(vcpu, vcpu->cpu);
+ else
+ avic_vcpu_put(vcpu);
+}
+
+void svm_vcpu_blocking(struct kvm_vcpu *vcpu)
+{
+ avic_set_running(vcpu, false);
+}
+
+void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
+{
+ if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu))
+ kvm_vcpu_update_apicv(vcpu);
+ avic_set_running(vcpu, true);
+}
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
new file mode 100644
index 000000000000..90a1ca939627
--- /dev/null
+++ b/arch/x86/kvm/svm/nested.c
@@ -0,0 +1,823 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * AMD SVM support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ */
+
+#define pr_fmt(fmt) "SVM: " fmt
+
+#include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
+#include <linux/kernel.h>
+
+#include <asm/msr-index.h>
+
+#include "kvm_emulate.h"
+#include "trace.h"
+#include "mmu.h"
+#include "x86.h"
+#include "svm.h"
+
+static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) {
+ /*
+ * TODO: track the cause of the nested page fault, and
+ * correctly fill in the high bits of exit_info_1.
+ */
+ svm->vmcb->control.exit_code = SVM_EXIT_NPF;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = (1ULL << 32);
+ svm->vmcb->control.exit_info_2 = fault->address;
+ }
+
+ svm->vmcb->control.exit_info_1 &= ~0xffffffffULL;
+ svm->vmcb->control.exit_info_1 |= fault->error_code;
+
+ /*
+ * The present bit is always zero for page structure faults on real
+ * hardware.
+ */
+ if (svm->vmcb->control.exit_info_1 & (2ULL << 32))
+ svm->vmcb->control.exit_info_1 &= ~1;
+
+ nested_svm_vmexit(svm);
+}
+
+static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 cr3 = svm->nested.nested_cr3;
+ u64 pdpte;
+ int ret;
+
+ ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(__sme_clr(cr3)), &pdpte,
+ offset_in_page(cr3) + index * 8, 8);
+ if (ret)
+ return 0;
+ return pdpte;
+}
+
+static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return svm->nested.nested_cr3;
+}
+
+static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
+{
+ WARN_ON(mmu_is_nested(vcpu));
+
+ vcpu->arch.mmu = &vcpu->arch.guest_mmu;
+ kvm_init_shadow_mmu(vcpu);
+ vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3;
+ vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr;
+ vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
+ vcpu->arch.mmu->shadow_root_level = kvm_x86_ops.get_tdp_level(vcpu);
+ reset_shadow_zero_bits_mask(vcpu, vcpu->arch.mmu);
+ vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
+}
+
+static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.mmu = &vcpu->arch.root_mmu;
+ vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
+}
+
+void recalc_intercepts(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *c, *h;
+ struct nested_state *g;
+
+ mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+
+ if (!is_guest_mode(&svm->vcpu))
+ return;
+
+ c = &svm->vmcb->control;
+ h = &svm->nested.hsave->control;
+ g = &svm->nested;
+
+ c->intercept_cr = h->intercept_cr;
+ c->intercept_dr = h->intercept_dr;
+ c->intercept_exceptions = h->intercept_exceptions;
+ c->intercept = h->intercept;
+
+ if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
+ /* We only want the cr8 intercept bits of L1 */
+ c->intercept_cr &= ~(1U << INTERCEPT_CR8_READ);
+ c->intercept_cr &= ~(1U << INTERCEPT_CR8_WRITE);
+
+ /*
+ * Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not
+ * affect any interrupt we may want to inject; therefore,
+ * interrupt window vmexits are irrelevant to L0.
+ */
+ c->intercept &= ~(1ULL << INTERCEPT_VINTR);
+ }
+
+ /* We don't want to see VMMCALLs from a nested guest */
+ c->intercept &= ~(1ULL << INTERCEPT_VMMCALL);
+
+ c->intercept_cr |= g->intercept_cr;
+ c->intercept_dr |= g->intercept_dr;
+ c->intercept_exceptions |= g->intercept_exceptions;
+ c->intercept |= g->intercept;
+}
+
+static void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb)
+{
+ struct vmcb_control_area *dst = &dst_vmcb->control;
+ struct vmcb_control_area *from = &from_vmcb->control;
+
+ dst->intercept_cr = from->intercept_cr;
+ dst->intercept_dr = from->intercept_dr;
+ dst->intercept_exceptions = from->intercept_exceptions;
+ dst->intercept = from->intercept;
+ dst->iopm_base_pa = from->iopm_base_pa;
+ dst->msrpm_base_pa = from->msrpm_base_pa;
+ dst->tsc_offset = from->tsc_offset;
+ dst->asid = from->asid;
+ dst->tlb_ctl = from->tlb_ctl;
+ dst->int_ctl = from->int_ctl;
+ dst->int_vector = from->int_vector;
+ dst->int_state = from->int_state;
+ dst->exit_code = from->exit_code;
+ dst->exit_code_hi = from->exit_code_hi;
+ dst->exit_info_1 = from->exit_info_1;
+ dst->exit_info_2 = from->exit_info_2;
+ dst->exit_int_info = from->exit_int_info;
+ dst->exit_int_info_err = from->exit_int_info_err;
+ dst->nested_ctl = from->nested_ctl;
+ dst->event_inj = from->event_inj;
+ dst->event_inj_err = from->event_inj_err;
+ dst->nested_cr3 = from->nested_cr3;
+ dst->virt_ext = from->virt_ext;
+ dst->pause_filter_count = from->pause_filter_count;
+ dst->pause_filter_thresh = from->pause_filter_thresh;
+}
+
+static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
+{
+ /*
+ * This function merges the msr permission bitmaps of kvm and the
+ * nested vmcb. It is optimized in that it only merges the parts where
+ * the kvm msr permission bitmap may contain zero bits
+ */
+ int i;
+
+ if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
+ return true;
+
+ for (i = 0; i < MSRPM_OFFSETS; i++) {
+ u32 value, p;
+ u64 offset;
+
+ if (msrpm_offsets[i] == 0xffffffff)
+ break;
+
+ p = msrpm_offsets[i];
+ offset = svm->nested.vmcb_msrpm + (p * 4);
+
+ if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))
+ return false;
+
+ svm->nested.msrpm[p] = svm->msrpm[p] | value;
+ }
+
+ svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm));
+
+ return true;
+}
+
+static bool nested_vmcb_checks(struct vmcb *vmcb)
+{
+ if ((vmcb->save.efer & EFER_SVME) == 0)
+ return false;
+
+ if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
+ return false;
+
+ if (vmcb->control.asid == 0)
+ return false;
+
+ if ((vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) &&
+ !npt_enabled)
+ return false;
+
+ return true;
+}
+
+void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
+ struct vmcb *nested_vmcb, struct kvm_host_map *map)
+{
+ bool evaluate_pending_interrupts =
+ is_intercept(svm, INTERCEPT_VINTR) ||
+ is_intercept(svm, INTERCEPT_IRET);
+
+ if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
+ svm->vcpu.arch.hflags |= HF_HIF_MASK;
+ else
+ svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
+
+ if (nested_vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) {
+ svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
+ nested_svm_init_mmu_context(&svm->vcpu);
+ }
+
+ /* Load the nested guest state */
+ svm->vmcb->save.es = nested_vmcb->save.es;
+ svm->vmcb->save.cs = nested_vmcb->save.cs;
+ svm->vmcb->save.ss = nested_vmcb->save.ss;
+ svm->vmcb->save.ds = nested_vmcb->save.ds;
+ svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
+ svm->vmcb->save.idtr = nested_vmcb->save.idtr;
+ kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags);
+ svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
+ svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
+ svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
+ if (npt_enabled) {
+ svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
+ svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
+ } else
+ (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
+
+ /* Guest paging mode is active - reset mmu */
+ kvm_mmu_reset_context(&svm->vcpu);
+
+ svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
+ kvm_rax_write(&svm->vcpu, nested_vmcb->save.rax);
+ kvm_rsp_write(&svm->vcpu, nested_vmcb->save.rsp);
+ kvm_rip_write(&svm->vcpu, nested_vmcb->save.rip);
+
+ /* In case we don't even reach vcpu_run, the fields are not updated */
+ svm->vmcb->save.rax = nested_vmcb->save.rax;
+ svm->vmcb->save.rsp = nested_vmcb->save.rsp;
+ svm->vmcb->save.rip = nested_vmcb->save.rip;
+ svm->vmcb->save.dr7 = nested_vmcb->save.dr7;
+ svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
+ svm->vmcb->save.cpl = nested_vmcb->save.cpl;
+
+ svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
+ svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL;
+
+ /* cache intercepts */
+ svm->nested.intercept_cr = nested_vmcb->control.intercept_cr;
+ svm->nested.intercept_dr = nested_vmcb->control.intercept_dr;
+ svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
+ svm->nested.intercept = nested_vmcb->control.intercept;
+
+ svm_flush_tlb(&svm->vcpu, true);
+ svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
+ if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
+ svm->vcpu.arch.hflags |= HF_VINTR_MASK;
+ else
+ svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
+
+ svm->vcpu.arch.tsc_offset += nested_vmcb->control.tsc_offset;
+ svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset;
+
+ svm->vmcb->control.virt_ext = nested_vmcb->control.virt_ext;
+ svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
+ svm->vmcb->control.int_state = nested_vmcb->control.int_state;
+ svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
+ svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
+
+ svm->vmcb->control.pause_filter_count =
+ nested_vmcb->control.pause_filter_count;
+ svm->vmcb->control.pause_filter_thresh =
+ nested_vmcb->control.pause_filter_thresh;
+
+ kvm_vcpu_unmap(&svm->vcpu, map, true);
+
+ /* Enter Guest-Mode */
+ enter_guest_mode(&svm->vcpu);
+
+ /*
+ * Merge guest and host intercepts - must be called with vcpu in
+ * guest-mode to take affect here
+ */
+ recalc_intercepts(svm);
+
+ svm->nested.vmcb = vmcb_gpa;
+
+ /*
+ * If L1 had a pending IRQ/NMI before executing VMRUN,
+ * which wasn't delivered because it was disallowed (e.g.
+ * interrupts disabled), L0 needs to evaluate if this pending
+ * event should cause an exit from L2 to L1 or be delivered
+ * directly to L2.
+ *
+ * Usually this would be handled by the processor noticing an
+ * IRQ/NMI window request. However, VMRUN can unblock interrupts
+ * by implicitly setting GIF, so force L0 to perform pending event
+ * evaluation by requesting a KVM_REQ_EVENT.
+ */
+ enable_gif(svm);
+ if (unlikely(evaluate_pending_interrupts))
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+
+ mark_all_dirty(svm->vmcb);
+}
+
+int nested_svm_vmrun(struct vcpu_svm *svm)
+{
+ int ret;
+ struct vmcb *nested_vmcb;
+ struct vmcb *hsave = svm->nested.hsave;
+ struct vmcb *vmcb = svm->vmcb;
+ struct kvm_host_map map;
+ u64 vmcb_gpa;
+
+ vmcb_gpa = svm->vmcb->save.rax;
+
+ ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map);
+ if (ret == -EINVAL) {
+ kvm_inject_gp(&svm->vcpu, 0);
+ return 1;
+ } else if (ret) {
+ return kvm_skip_emulated_instruction(&svm->vcpu);
+ }
+
+ ret = kvm_skip_emulated_instruction(&svm->vcpu);
+
+ nested_vmcb = map.hva;
+
+ if (!nested_vmcb_checks(nested_vmcb)) {
+ nested_vmcb->control.exit_code = SVM_EXIT_ERR;
+ nested_vmcb->control.exit_code_hi = 0;
+ nested_vmcb->control.exit_info_1 = 0;
+ nested_vmcb->control.exit_info_2 = 0;
+
+ kvm_vcpu_unmap(&svm->vcpu, &map, true);
+
+ return ret;
+ }
+
+ trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
+ nested_vmcb->save.rip,
+ nested_vmcb->control.int_ctl,
+ nested_vmcb->control.event_inj,
+ nested_vmcb->control.nested_ctl);
+
+ trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
+ nested_vmcb->control.intercept_cr >> 16,
+ nested_vmcb->control.intercept_exceptions,
+ nested_vmcb->control.intercept);
+
+ /* Clear internal status */
+ kvm_clear_exception_queue(&svm->vcpu);
+ kvm_clear_interrupt_queue(&svm->vcpu);
+
+ /*
+ * Save the old vmcb, so we don't need to pick what we save, but can
+ * restore everything when a VMEXIT occurs
+ */
+ hsave->save.es = vmcb->save.es;
+ hsave->save.cs = vmcb->save.cs;
+ hsave->save.ss = vmcb->save.ss;
+ hsave->save.ds = vmcb->save.ds;
+ hsave->save.gdtr = vmcb->save.gdtr;
+ hsave->save.idtr = vmcb->save.idtr;
+ hsave->save.efer = svm->vcpu.arch.efer;
+ hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
+ hsave->save.cr4 = svm->vcpu.arch.cr4;
+ hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
+ hsave->save.rip = kvm_rip_read(&svm->vcpu);
+ hsave->save.rsp = vmcb->save.rsp;
+ hsave->save.rax = vmcb->save.rax;
+ if (npt_enabled)
+ hsave->save.cr3 = vmcb->save.cr3;
+ else
+ hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
+
+ copy_vmcb_control_area(hsave, vmcb);
+
+ enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, &map);
+
+ if (!nested_svm_vmrun_msrpm(svm)) {
+ svm->vmcb->control.exit_code = SVM_EXIT_ERR;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = 0;
+ svm->vmcb->control.exit_info_2 = 0;
+
+ nested_svm_vmexit(svm);
+ }
+
+ return ret;
+}
+
+void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
+{
+ to_vmcb->save.fs = from_vmcb->save.fs;
+ to_vmcb->save.gs = from_vmcb->save.gs;
+ to_vmcb->save.tr = from_vmcb->save.tr;
+ to_vmcb->save.ldtr = from_vmcb->save.ldtr;
+ to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
+ to_vmcb->save.star = from_vmcb->save.star;
+ to_vmcb->save.lstar = from_vmcb->save.lstar;
+ to_vmcb->save.cstar = from_vmcb->save.cstar;
+ to_vmcb->save.sfmask = from_vmcb->save.sfmask;
+ to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
+ to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
+ to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
+}
+
+int nested_svm_vmexit(struct vcpu_svm *svm)
+{
+ int rc;
+ struct vmcb *nested_vmcb;
+ struct vmcb *hsave = svm->nested.hsave;
+ struct vmcb *vmcb = svm->vmcb;
+ struct kvm_host_map map;
+
+ trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
+ vmcb->control.exit_info_1,
+ vmcb->control.exit_info_2,
+ vmcb->control.exit_int_info,
+ vmcb->control.exit_int_info_err,
+ KVM_ISA_SVM);
+
+ rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb), &map);
+ if (rc) {
+ if (rc == -EINVAL)
+ kvm_inject_gp(&svm->vcpu, 0);
+ return 1;
+ }
+
+ nested_vmcb = map.hva;
+
+ /* Exit Guest-Mode */
+ leave_guest_mode(&svm->vcpu);
+ svm->nested.vmcb = 0;
+
+ /* Give the current vmcb to the guest */
+ disable_gif(svm);
+
+ nested_vmcb->save.es = vmcb->save.es;
+ nested_vmcb->save.cs = vmcb->save.cs;
+ nested_vmcb->save.ss = vmcb->save.ss;
+ nested_vmcb->save.ds = vmcb->save.ds;
+ nested_vmcb->save.gdtr = vmcb->save.gdtr;
+ nested_vmcb->save.idtr = vmcb->save.idtr;
+ nested_vmcb->save.efer = svm->vcpu.arch.efer;
+ nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
+ nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu);
+ nested_vmcb->save.cr2 = vmcb->save.cr2;
+ nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
+ nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu);
+ nested_vmcb->save.rip = vmcb->save.rip;
+ nested_vmcb->save.rsp = vmcb->save.rsp;
+ nested_vmcb->save.rax = vmcb->save.rax;
+ nested_vmcb->save.dr7 = vmcb->save.dr7;
+ nested_vmcb->save.dr6 = vmcb->save.dr6;
+ nested_vmcb->save.cpl = vmcb->save.cpl;
+
+ nested_vmcb->control.int_ctl = vmcb->control.int_ctl;
+ nested_vmcb->control.int_vector = vmcb->control.int_vector;
+ nested_vmcb->control.int_state = vmcb->control.int_state;
+ nested_vmcb->control.exit_code = vmcb->control.exit_code;
+ nested_vmcb->control.exit_code_hi = vmcb->control.exit_code_hi;
+ nested_vmcb->control.exit_info_1 = vmcb->control.exit_info_1;
+ nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
+ nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
+ nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
+
+ if (svm->nrips_enabled)
+ nested_vmcb->control.next_rip = vmcb->control.next_rip;
+
+ /*
+ * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
+ * to make sure that we do not lose injected events. So check event_inj
+ * here and copy it to exit_int_info if it is valid.
+ * Exit_int_info and event_inj can't be both valid because the case
+ * below only happens on a VMRUN instruction intercept which has
+ * no valid exit_int_info set.
+ */
+ if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
+ struct vmcb_control_area *nc = &nested_vmcb->control;
+
+ nc->exit_int_info = vmcb->control.event_inj;
+ nc->exit_int_info_err = vmcb->control.event_inj_err;
+ }
+
+ nested_vmcb->control.tlb_ctl = 0;
+ nested_vmcb->control.event_inj = 0;
+ nested_vmcb->control.event_inj_err = 0;
+
+ nested_vmcb->control.pause_filter_count =
+ svm->vmcb->control.pause_filter_count;
+ nested_vmcb->control.pause_filter_thresh =
+ svm->vmcb->control.pause_filter_thresh;
+
+ /* We always set V_INTR_MASKING and remember the old value in hflags */
+ if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
+ nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
+
+ /* Restore the original control entries */
+ copy_vmcb_control_area(vmcb, hsave);
+
+ svm->vcpu.arch.tsc_offset = svm->vmcb->control.tsc_offset;
+ kvm_clear_exception_queue(&svm->vcpu);
+ kvm_clear_interrupt_queue(&svm->vcpu);
+
+ svm->nested.nested_cr3 = 0;
+
+ /* Restore selected save entries */
+ svm->vmcb->save.es = hsave->save.es;
+ svm->vmcb->save.cs = hsave->save.cs;
+ svm->vmcb->save.ss = hsave->save.ss;
+ svm->vmcb->save.ds = hsave->save.ds;
+ svm->vmcb->save.gdtr = hsave->save.gdtr;
+ svm->vmcb->save.idtr = hsave->save.idtr;
+ kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
+ svm_set_efer(&svm->vcpu, hsave->save.efer);
+ svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
+ svm_set_cr4(&svm->vcpu, hsave->save.cr4);
+ if (npt_enabled) {
+ svm->vmcb->save.cr3 = hsave->save.cr3;
+ svm->vcpu.arch.cr3 = hsave->save.cr3;
+ } else {
+ (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
+ }
+ kvm_rax_write(&svm->vcpu, hsave->save.rax);
+ kvm_rsp_write(&svm->vcpu, hsave->save.rsp);
+ kvm_rip_write(&svm->vcpu, hsave->save.rip);
+ svm->vmcb->save.dr7 = 0;
+ svm->vmcb->save.cpl = 0;
+ svm->vmcb->control.exit_int_info = 0;
+
+ mark_all_dirty(svm->vmcb);
+
+ kvm_vcpu_unmap(&svm->vcpu, &map, true);
+
+ nested_svm_uninit_mmu_context(&svm->vcpu);
+ kvm_mmu_reset_context(&svm->vcpu);
+ kvm_mmu_load(&svm->vcpu);
+
+ /*
+ * Drop what we picked up for L2 via svm_complete_interrupts() so it
+ * doesn't end up in L1.
+ */
+ svm->vcpu.arch.nmi_injected = false;
+ kvm_clear_exception_queue(&svm->vcpu);
+ kvm_clear_interrupt_queue(&svm->vcpu);
+
+ return 0;
+}
+
+static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
+{
+ u32 offset, msr, value;
+ int write, mask;
+
+ if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
+ return NESTED_EXIT_HOST;
+
+ msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+ offset = svm_msrpm_offset(msr);
+ write = svm->vmcb->control.exit_info_1 & 1;
+ mask = 1 << ((2 * (msr & 0xf)) + write);
+
+ if (offset == MSR_INVALID)
+ return NESTED_EXIT_DONE;
+
+ /* Offset is in 32 bit units but need in 8 bit units */
+ offset *= 4;
+
+ if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.vmcb_msrpm + offset, &value, 4))
+ return NESTED_EXIT_DONE;
+
+ return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
+}
+
+/* DB exceptions for our internal use must not cause vmexit */
+static int nested_svm_intercept_db(struct vcpu_svm *svm)
+{
+ unsigned long dr6;
+
+ /* if we're not singlestepping, it's not ours */
+ if (!svm->nmi_singlestep)
+ return NESTED_EXIT_DONE;
+
+ /* if it's not a singlestep exception, it's not ours */
+ if (kvm_get_dr(&svm->vcpu, 6, &dr6))
+ return NESTED_EXIT_DONE;
+ if (!(dr6 & DR6_BS))
+ return NESTED_EXIT_DONE;
+
+ /* if the guest is singlestepping, it should get the vmexit */
+ if (svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF) {
+ disable_nmi_singlestep(svm);
+ return NESTED_EXIT_DONE;
+ }
+
+ /* it's ours, the nested hypervisor must not see this one */
+ return NESTED_EXIT_HOST;
+}
+
+static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
+{
+ unsigned port, size, iopm_len;
+ u16 val, mask;
+ u8 start_bit;
+ u64 gpa;
+
+ if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
+ return NESTED_EXIT_HOST;
+
+ port = svm->vmcb->control.exit_info_1 >> 16;
+ size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >>
+ SVM_IOIO_SIZE_SHIFT;
+ gpa = svm->nested.vmcb_iopm + (port / 8);
+ start_bit = port % 8;
+ iopm_len = (start_bit + size > 8) ? 2 : 1;
+ mask = (0xf >> (4 - size)) << start_bit;
+ val = 0;
+
+ if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len))
+ return NESTED_EXIT_DONE;
+
+ return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
+}
+
+static int nested_svm_intercept(struct vcpu_svm *svm)
+{
+ u32 exit_code = svm->vmcb->control.exit_code;
+ int vmexit = NESTED_EXIT_HOST;
+
+ switch (exit_code) {
+ case SVM_EXIT_MSR:
+ vmexit = nested_svm_exit_handled_msr(svm);
+ break;
+ case SVM_EXIT_IOIO:
+ vmexit = nested_svm_intercept_ioio(svm);
+ break;
+ case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
+ u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
+ if (svm->nested.intercept_cr & bit)
+ vmexit = NESTED_EXIT_DONE;
+ break;
+ }
+ case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
+ u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
+ if (svm->nested.intercept_dr & bit)
+ vmexit = NESTED_EXIT_DONE;
+ break;
+ }
+ case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
+ u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
+ if (svm->nested.intercept_exceptions & excp_bits) {
+ if (exit_code == SVM_EXIT_EXCP_BASE + DB_VECTOR)
+ vmexit = nested_svm_intercept_db(svm);
+ else
+ vmexit = NESTED_EXIT_DONE;
+ }
+ /* async page fault always cause vmexit */
+ else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
+ svm->vcpu.arch.exception.nested_apf != 0)
+ vmexit = NESTED_EXIT_DONE;
+ break;
+ }
+ case SVM_EXIT_ERR: {
+ vmexit = NESTED_EXIT_DONE;
+ break;
+ }
+ default: {
+ u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
+ if (svm->nested.intercept & exit_bits)
+ vmexit = NESTED_EXIT_DONE;
+ }
+ }
+
+ return vmexit;
+}
+
+int nested_svm_exit_handled(struct vcpu_svm *svm)
+{
+ int vmexit;
+
+ vmexit = nested_svm_intercept(svm);
+
+ if (vmexit == NESTED_EXIT_DONE)
+ nested_svm_vmexit(svm);
+
+ return vmexit;
+}
+
+int nested_svm_check_permissions(struct vcpu_svm *svm)
+{
+ if (!(svm->vcpu.arch.efer & EFER_SVME) ||
+ !is_paging(&svm->vcpu)) {
+ kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ if (svm->vmcb->save.cpl) {
+ kvm_inject_gp(&svm->vcpu, 0);
+ return 1;
+ }
+
+ return 0;
+}
+
+int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
+ bool has_error_code, u32 error_code)
+{
+ int vmexit;
+
+ if (!is_guest_mode(&svm->vcpu))
+ return 0;
+
+ vmexit = nested_svm_intercept(svm);
+ if (vmexit != NESTED_EXIT_DONE)
+ return 0;
+
+ svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = error_code;
+
+ /*
+ * EXITINFO2 is undefined for all exception intercepts other
+ * than #PF.
+ */
+ if (svm->vcpu.arch.exception.nested_apf)
+ svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
+ else if (svm->vcpu.arch.exception.has_payload)
+ svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload;
+ else
+ svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
+
+ svm->nested.exit_required = true;
+ return vmexit;
+}
+
+static void nested_svm_intr(struct vcpu_svm *svm)
+{
+ svm->vmcb->control.exit_code = SVM_EXIT_INTR;
+ svm->vmcb->control.exit_info_1 = 0;
+ svm->vmcb->control.exit_info_2 = 0;
+
+ /* nested_svm_vmexit this gets called afterwards from handle_exit */
+ svm->nested.exit_required = true;
+ trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
+}
+
+static bool nested_exit_on_intr(struct vcpu_svm *svm)
+{
+ return (svm->nested.intercept & 1ULL);
+}
+
+int svm_check_nested_events(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ bool block_nested_events =
+ kvm_event_needs_reinjection(vcpu) || svm->nested.exit_required;
+
+ if (kvm_cpu_has_interrupt(vcpu) && nested_exit_on_intr(svm)) {
+ if (block_nested_events)
+ return -EBUSY;
+ nested_svm_intr(svm);
+ return 0;
+ }
+
+ return 0;
+}
+
+int nested_svm_exit_special(struct vcpu_svm *svm)
+{
+ u32 exit_code = svm->vmcb->control.exit_code;
+
+ switch (exit_code) {
+ case SVM_EXIT_INTR:
+ case SVM_EXIT_NMI:
+ case SVM_EXIT_EXCP_BASE + MC_VECTOR:
+ return NESTED_EXIT_HOST;
+ case SVM_EXIT_NPF:
+ /* For now we are always handling NPFs when using them */
+ if (npt_enabled)
+ return NESTED_EXIT_HOST;
+ break;
+ case SVM_EXIT_EXCP_BASE + PF_VECTOR:
+ /* When we're shadowing, trap PFs, but not async PF */
+ if (!npt_enabled && svm->vcpu.arch.apf.host_apf_reason == 0)
+ return NESTED_EXIT_HOST;
+ break;
+ default:
+ break;
+ }
+
+ return NESTED_EXIT_CONTINUE;
+}
diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/svm/pmu.c
index ce0b10fe5e2b..ce0b10fe5e2b 100644
--- a/arch/x86/kvm/pmu_amd.c
+++ b/arch/x86/kvm/svm/pmu.c
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
new file mode 100644
index 000000000000..0e3fc311d7da
--- /dev/null
+++ b/arch/x86/kvm/svm/sev.c
@@ -0,0 +1,1187 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * AMD SVM-SEV support
+ *
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ */
+
+#include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
+#include <linux/kernel.h>
+#include <linux/highmem.h>
+#include <linux/psp-sev.h>
+#include <linux/swap.h>
+
+#include "x86.h"
+#include "svm.h"
+
+static int sev_flush_asids(void);
+static DECLARE_RWSEM(sev_deactivate_lock);
+static DEFINE_MUTEX(sev_bitmap_lock);
+unsigned int max_sev_asid;
+static unsigned int min_sev_asid;
+static unsigned long *sev_asid_bitmap;
+static unsigned long *sev_reclaim_asid_bitmap;
+#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
+
+struct enc_region {
+ struct list_head list;
+ unsigned long npages;
+ struct page **pages;
+ unsigned long uaddr;
+ unsigned long size;
+};
+
+static int sev_flush_asids(void)
+{
+ int ret, error = 0;
+
+ /*
+ * DEACTIVATE will clear the WBINVD indicator causing DF_FLUSH to fail,
+ * so it must be guarded.
+ */
+ down_write(&sev_deactivate_lock);
+
+ wbinvd_on_all_cpus();
+ ret = sev_guest_df_flush(&error);
+
+ up_write(&sev_deactivate_lock);
+
+ if (ret)
+ pr_err("SEV: DF_FLUSH failed, ret=%d, error=%#x\n", ret, error);
+
+ return ret;
+}
+
+/* Must be called with the sev_bitmap_lock held */
+static bool __sev_recycle_asids(void)
+{
+ int pos;
+
+ /* Check if there are any ASIDs to reclaim before performing a flush */
+ pos = find_next_bit(sev_reclaim_asid_bitmap,
+ max_sev_asid, min_sev_asid - 1);
+ if (pos >= max_sev_asid)
+ return false;
+
+ if (sev_flush_asids())
+ return false;
+
+ bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap,
+ max_sev_asid);
+ bitmap_zero(sev_reclaim_asid_bitmap, max_sev_asid);
+
+ return true;
+}
+
+static int sev_asid_new(void)
+{
+ bool retry = true;
+ int pos;
+
+ mutex_lock(&sev_bitmap_lock);
+
+ /*
+ * SEV-enabled guest must use asid from min_sev_asid to max_sev_asid.
+ */
+again:
+ pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_sev_asid - 1);
+ if (pos >= max_sev_asid) {
+ if (retry && __sev_recycle_asids()) {
+ retry = false;
+ goto again;
+ }
+ mutex_unlock(&sev_bitmap_lock);
+ return -EBUSY;
+ }
+
+ __set_bit(pos, sev_asid_bitmap);
+
+ mutex_unlock(&sev_bitmap_lock);
+
+ return pos + 1;
+}
+
+static int sev_get_asid(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+ return sev->asid;
+}
+
+static void sev_asid_free(int asid)
+{
+ struct svm_cpu_data *sd;
+ int cpu, pos;
+
+ mutex_lock(&sev_bitmap_lock);
+
+ pos = asid - 1;
+ __set_bit(pos, sev_reclaim_asid_bitmap);
+
+ for_each_possible_cpu(cpu) {
+ sd = per_cpu(svm_data, cpu);
+ sd->sev_vmcbs[pos] = NULL;
+ }
+
+ mutex_unlock(&sev_bitmap_lock);
+}
+
+static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
+{
+ struct sev_data_decommission *decommission;
+ struct sev_data_deactivate *data;
+
+ if (!handle)
+ return;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return;
+
+ /* deactivate handle */
+ data->handle = handle;
+
+ /* Guard DEACTIVATE against WBINVD/DF_FLUSH used in ASID recycling */
+ down_read(&sev_deactivate_lock);
+ sev_guest_deactivate(data, NULL);
+ up_read(&sev_deactivate_lock);
+
+ kfree(data);
+
+ decommission = kzalloc(sizeof(*decommission), GFP_KERNEL);
+ if (!decommission)
+ return;
+
+ /* decommission handle */
+ decommission->handle = handle;
+ sev_guest_decommission(decommission, NULL);
+
+ kfree(decommission);
+}
+
+static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ int asid, ret;
+
+ ret = -EBUSY;
+ if (unlikely(sev->active))
+ return ret;
+
+ asid = sev_asid_new();
+ if (asid < 0)
+ return ret;
+
+ ret = sev_platform_init(&argp->error);
+ if (ret)
+ goto e_free;
+
+ sev->active = true;
+ sev->asid = asid;
+ INIT_LIST_HEAD(&sev->regions_list);
+
+ return 0;
+
+e_free:
+ sev_asid_free(asid);
+ return ret;
+}
+
+static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
+{
+ struct sev_data_activate *data;
+ int asid = sev_get_asid(kvm);
+ int ret;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
+ if (!data)
+ return -ENOMEM;
+
+ /* activate ASID on the given handle */
+ data->handle = handle;
+ data->asid = asid;
+ ret = sev_guest_activate(data, error);
+ kfree(data);
+
+ return ret;
+}
+
+static int __sev_issue_cmd(int fd, int id, void *data, int *error)
+{
+ struct fd f;
+ int ret;
+
+ f = fdget(fd);
+ if (!f.file)
+ return -EBADF;
+
+ ret = sev_issue_cmd_external_user(f.file, id, data, error);
+
+ fdput(f);
+ return ret;
+}
+
+static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+ return __sev_issue_cmd(sev->fd, id, data, error);
+}
+
+static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_launch_start *start;
+ struct kvm_sev_launch_start params;
+ void *dh_blob, *session_blob;
+ int *error = &argp->error;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ return -EFAULT;
+
+ start = kzalloc(sizeof(*start), GFP_KERNEL_ACCOUNT);
+ if (!start)
+ return -ENOMEM;
+
+ dh_blob = NULL;
+ if (params.dh_uaddr) {
+ dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len);
+ if (IS_ERR(dh_blob)) {
+ ret = PTR_ERR(dh_blob);
+ goto e_free;
+ }
+
+ start->dh_cert_address = __sme_set(__pa(dh_blob));
+ start->dh_cert_len = params.dh_len;
+ }
+
+ session_blob = NULL;
+ if (params.session_uaddr) {
+ session_blob = psp_copy_user_blob(params.session_uaddr, params.session_len);
+ if (IS_ERR(session_blob)) {
+ ret = PTR_ERR(session_blob);
+ goto e_free_dh;
+ }
+
+ start->session_address = __sme_set(__pa(session_blob));
+ start->session_len = params.session_len;
+ }
+
+ start->handle = params.handle;
+ start->policy = params.policy;
+
+ /* create memory encryption context */
+ ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, start, error);
+ if (ret)
+ goto e_free_session;
+
+ /* Bind ASID to this guest */
+ ret = sev_bind_asid(kvm, start->handle, error);
+ if (ret)
+ goto e_free_session;
+
+ /* return handle to userspace */
+ params.handle = start->handle;
+ if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params))) {
+ sev_unbind_asid(kvm, start->handle);
+ ret = -EFAULT;
+ goto e_free_session;
+ }
+
+ sev->handle = start->handle;
+ sev->fd = argp->sev_fd;
+
+e_free_session:
+ kfree(session_blob);
+e_free_dh:
+ kfree(dh_blob);
+e_free:
+ kfree(start);
+ return ret;
+}
+
+static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
+ unsigned long ulen, unsigned long *n,
+ int write)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ unsigned long npages, npinned, size;
+ unsigned long locked, lock_limit;
+ struct page **pages;
+ unsigned long first, last;
+
+ if (ulen == 0 || uaddr + ulen < uaddr)
+ return NULL;
+
+ /* Calculate number of pages. */
+ first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
+ last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;
+ npages = (last - first + 1);
+
+ locked = sev->pages_locked + npages;
+ lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+ pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit);
+ return NULL;
+ }
+
+ /* Avoid using vmalloc for smaller buffers. */
+ size = npages * sizeof(struct page *);
+ if (size > PAGE_SIZE)
+ pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO,
+ PAGE_KERNEL);
+ else
+ pages = kmalloc(size, GFP_KERNEL_ACCOUNT);
+
+ if (!pages)
+ return NULL;
+
+ /* Pin the user virtual address. */
+ npinned = get_user_pages_fast(uaddr, npages, FOLL_WRITE, pages);
+ if (npinned != npages) {
+ pr_err("SEV: Failure locking %lu pages.\n", npages);
+ goto err;
+ }
+
+ *n = npages;
+ sev->pages_locked = locked;
+
+ return pages;
+
+err:
+ if (npinned > 0)
+ release_pages(pages, npinned);
+
+ kvfree(pages);
+ return NULL;
+}
+
+static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
+ unsigned long npages)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+ release_pages(pages, npages);
+ kvfree(pages);
+ sev->pages_locked -= npages;
+}
+
+static void sev_clflush_pages(struct page *pages[], unsigned long npages)
+{
+ uint8_t *page_virtual;
+ unsigned long i;
+
+ if (npages == 0 || pages == NULL)
+ return;
+
+ for (i = 0; i < npages; i++) {
+ page_virtual = kmap_atomic(pages[i]);
+ clflush_cache_range(page_virtual, PAGE_SIZE);
+ kunmap_atomic(page_virtual);
+ }
+}
+
+static unsigned long get_num_contig_pages(unsigned long idx,
+ struct page **inpages, unsigned long npages)
+{
+ unsigned long paddr, next_paddr;
+ unsigned long i = idx + 1, pages = 1;
+
+ /* find the number of contiguous pages starting from idx */
+ paddr = __sme_page_pa(inpages[idx]);
+ while (i < npages) {
+ next_paddr = __sme_page_pa(inpages[i++]);
+ if ((paddr + PAGE_SIZE) == next_paddr) {
+ pages++;
+ paddr = next_paddr;
+ continue;
+ }
+ break;
+ }
+
+ return pages;
+}
+
+static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_launch_update_data params;
+ struct sev_data_launch_update_data *data;
+ struct page **inpages;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ return -EFAULT;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
+ if (!data)
+ return -ENOMEM;
+
+ vaddr = params.uaddr;
+ size = params.len;
+ vaddr_end = vaddr + size;
+
+ /* Lock the user memory. */
+ inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1);
+ if (!inpages) {
+ ret = -ENOMEM;
+ goto e_free;
+ }
+
+ /*
+ * The LAUNCH_UPDATE command will perform in-place encryption of the
+ * memory content (i.e it will write the same memory region with C=1).
+ * It's possible that the cache may contain the data with C=0, i.e.,
+ * unencrypted so invalidate it first.
+ */
+ sev_clflush_pages(inpages, npages);
+
+ for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) {
+ int offset, len;
+
+ /*
+ * If the user buffer is not page-aligned, calculate the offset
+ * within the page.
+ */
+ offset = vaddr & (PAGE_SIZE - 1);
+
+ /* Calculate the number of pages that can be encrypted in one go. */
+ pages = get_num_contig_pages(i, inpages, npages);
+
+ len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size);
+
+ data->handle = sev->handle;
+ data->len = len;
+ data->address = __sme_page_pa(inpages[i]) + offset;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, data, &argp->error);
+ if (ret)
+ goto e_unpin;
+
+ size -= len;
+ next_vaddr = vaddr + len;
+ }
+
+e_unpin:
+ /* content of memory is updated, mark pages dirty */
+ for (i = 0; i < npages; i++) {
+ set_page_dirty_lock(inpages[i]);
+ mark_page_accessed(inpages[i]);
+ }
+ /* unlock the user pages */
+ sev_unpin_memory(kvm, inpages, npages);
+e_free:
+ kfree(data);
+ return ret;
+}
+
+static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ void __user *measure = (void __user *)(uintptr_t)argp->data;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_launch_measure *data;
+ struct kvm_sev_launch_measure params;
+ void __user *p = NULL;
+ void *blob = NULL;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, measure, sizeof(params)))
+ return -EFAULT;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
+ if (!data)
+ return -ENOMEM;
+
+ /* User wants to query the blob length */
+ if (!params.len)
+ goto cmd;
+
+ p = (void __user *)(uintptr_t)params.uaddr;
+ if (p) {
+ if (params.len > SEV_FW_BLOB_MAX_SIZE) {
+ ret = -EINVAL;
+ goto e_free;
+ }
+
+ ret = -ENOMEM;
+ blob = kmalloc(params.len, GFP_KERNEL);
+ if (!blob)
+ goto e_free;
+
+ data->address = __psp_pa(blob);
+ data->len = params.len;
+ }
+
+cmd:
+ data->handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, data, &argp->error);
+
+ /*
+ * If we query the session length, FW responded with expected data.
+ */
+ if (!params.len)
+ goto done;
+
+ if (ret)
+ goto e_free_blob;
+
+ if (blob) {
+ if (copy_to_user(p, blob, params.len))
+ ret = -EFAULT;
+ }
+
+done:
+ params.len = data->len;
+ if (copy_to_user(measure, &params, sizeof(params)))
+ ret = -EFAULT;
+e_free_blob:
+ kfree(blob);
+e_free:
+ kfree(data);
+ return ret;
+}
+
+static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_launch_finish *data;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
+ if (!data)
+ return -ENOMEM;
+
+ data->handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, data, &argp->error);
+
+ kfree(data);
+ return ret;
+}
+
+static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_guest_status params;
+ struct sev_data_guest_status *data;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
+ if (!data)
+ return -ENOMEM;
+
+ data->handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, data, &argp->error);
+ if (ret)
+ goto e_free;
+
+ params.policy = data->policy;
+ params.state = data->state;
+ params.handle = data->handle;
+
+ if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params)))
+ ret = -EFAULT;
+e_free:
+ kfree(data);
+ return ret;
+}
+
+static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
+ unsigned long dst, int size,
+ int *error, bool enc)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_dbg *data;
+ int ret;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
+ if (!data)
+ return -ENOMEM;
+
+ data->handle = sev->handle;
+ data->dst_addr = dst;
+ data->src_addr = src;
+ data->len = size;
+
+ ret = sev_issue_cmd(kvm,
+ enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
+ data, error);
+ kfree(data);
+ return ret;
+}
+
+static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
+ unsigned long dst_paddr, int sz, int *err)
+{
+ int offset;
+
+ /*
+ * Its safe to read more than we are asked, caller should ensure that
+ * destination has enough space.
+ */
+ src_paddr = round_down(src_paddr, 16);
+ offset = src_paddr & 15;
+ sz = round_up(sz + offset, 16);
+
+ return __sev_issue_dbg_cmd(kvm, src_paddr, dst_paddr, sz, err, false);
+}
+
+static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
+ unsigned long __user dst_uaddr,
+ unsigned long dst_paddr,
+ int size, int *err)
+{
+ struct page *tpage = NULL;
+ int ret, offset;
+
+ /* if inputs are not 16-byte then use intermediate buffer */
+ if (!IS_ALIGNED(dst_paddr, 16) ||
+ !IS_ALIGNED(paddr, 16) ||
+ !IS_ALIGNED(size, 16)) {
+ tpage = (void *)alloc_page(GFP_KERNEL);
+ if (!tpage)
+ return -ENOMEM;
+
+ dst_paddr = __sme_page_pa(tpage);
+ }
+
+ ret = __sev_dbg_decrypt(kvm, paddr, dst_paddr, size, err);
+ if (ret)
+ goto e_free;
+
+ if (tpage) {
+ offset = paddr & 15;
+ if (copy_to_user((void __user *)(uintptr_t)dst_uaddr,
+ page_address(tpage) + offset, size))
+ ret = -EFAULT;
+ }
+
+e_free:
+ if (tpage)
+ __free_page(tpage);
+
+ return ret;
+}
+
+static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
+ unsigned long __user vaddr,
+ unsigned long dst_paddr,
+ unsigned long __user dst_vaddr,
+ int size, int *error)
+{
+ struct page *src_tpage = NULL;
+ struct page *dst_tpage = NULL;
+ int ret, len = size;
+
+ /* If source buffer is not aligned then use an intermediate buffer */
+ if (!IS_ALIGNED(vaddr, 16)) {
+ src_tpage = alloc_page(GFP_KERNEL);
+ if (!src_tpage)
+ return -ENOMEM;
+
+ if (copy_from_user(page_address(src_tpage),
+ (void __user *)(uintptr_t)vaddr, size)) {
+ __free_page(src_tpage);
+ return -EFAULT;
+ }
+
+ paddr = __sme_page_pa(src_tpage);
+ }
+
+ /*
+ * If destination buffer or length is not aligned then do read-modify-write:
+ * - decrypt destination in an intermediate buffer
+ * - copy the source buffer in an intermediate buffer
+ * - use the intermediate buffer as source buffer
+ */
+ if (!IS_ALIGNED(dst_vaddr, 16) || !IS_ALIGNED(size, 16)) {
+ int dst_offset;
+
+ dst_tpage = alloc_page(GFP_KERNEL);
+ if (!dst_tpage) {
+ ret = -ENOMEM;
+ goto e_free;
+ }
+
+ ret = __sev_dbg_decrypt(kvm, dst_paddr,
+ __sme_page_pa(dst_tpage), size, error);
+ if (ret)
+ goto e_free;
+
+ /*
+ * If source is kernel buffer then use memcpy() otherwise
+ * copy_from_user().
+ */
+ dst_offset = dst_paddr & 15;
+
+ if (src_tpage)
+ memcpy(page_address(dst_tpage) + dst_offset,
+ page_address(src_tpage), size);
+ else {
+ if (copy_from_user(page_address(dst_tpage) + dst_offset,
+ (void __user *)(uintptr_t)vaddr, size)) {
+ ret = -EFAULT;
+ goto e_free;
+ }
+ }
+
+ paddr = __sme_page_pa(dst_tpage);
+ dst_paddr = round_down(dst_paddr, 16);
+ len = round_up(size, 16);
+ }
+
+ ret = __sev_issue_dbg_cmd(kvm, paddr, dst_paddr, len, error, true);
+
+e_free:
+ if (src_tpage)
+ __free_page(src_tpage);
+ if (dst_tpage)
+ __free_page(dst_tpage);
+ return ret;
+}
+
+static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
+{
+ unsigned long vaddr, vaddr_end, next_vaddr;
+ unsigned long dst_vaddr;
+ struct page **src_p, **dst_p;
+ struct kvm_sev_dbg debug;
+ unsigned long n;
+ unsigned int size;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&debug, (void __user *)(uintptr_t)argp->data, sizeof(debug)))
+ return -EFAULT;
+
+ if (!debug.len || debug.src_uaddr + debug.len < debug.src_uaddr)
+ return -EINVAL;
+ if (!debug.dst_uaddr)
+ return -EINVAL;
+
+ vaddr = debug.src_uaddr;
+ size = debug.len;
+ vaddr_end = vaddr + size;
+ dst_vaddr = debug.dst_uaddr;
+
+ for (; vaddr < vaddr_end; vaddr = next_vaddr) {
+ int len, s_off, d_off;
+
+ /* lock userspace source and destination page */
+ src_p = sev_pin_memory(kvm, vaddr & PAGE_MASK, PAGE_SIZE, &n, 0);
+ if (!src_p)
+ return -EFAULT;
+
+ dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, 1);
+ if (!dst_p) {
+ sev_unpin_memory(kvm, src_p, n);
+ return -EFAULT;
+ }
+
+ /*
+ * The DBG_{DE,EN}CRYPT commands will perform {dec,en}cryption of the
+ * memory content (i.e it will write the same memory region with C=1).
+ * It's possible that the cache may contain the data with C=0, i.e.,
+ * unencrypted so invalidate it first.
+ */
+ sev_clflush_pages(src_p, 1);
+ sev_clflush_pages(dst_p, 1);
+
+ /*
+ * Since user buffer may not be page aligned, calculate the
+ * offset within the page.
+ */
+ s_off = vaddr & ~PAGE_MASK;
+ d_off = dst_vaddr & ~PAGE_MASK;
+ len = min_t(size_t, (PAGE_SIZE - s_off), size);
+
+ if (dec)
+ ret = __sev_dbg_decrypt_user(kvm,
+ __sme_page_pa(src_p[0]) + s_off,
+ dst_vaddr,
+ __sme_page_pa(dst_p[0]) + d_off,
+ len, &argp->error);
+ else
+ ret = __sev_dbg_encrypt_user(kvm,
+ __sme_page_pa(src_p[0]) + s_off,
+ vaddr,
+ __sme_page_pa(dst_p[0]) + d_off,
+ dst_vaddr,
+ len, &argp->error);
+
+ sev_unpin_memory(kvm, src_p, n);
+ sev_unpin_memory(kvm, dst_p, n);
+
+ if (ret)
+ goto err;
+
+ next_vaddr = vaddr + len;
+ dst_vaddr = dst_vaddr + len;
+ size -= len;
+ }
+err:
+ return ret;
+}
+
+static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_launch_secret *data;
+ struct kvm_sev_launch_secret params;
+ struct page **pages;
+ void *blob, *hdr;
+ unsigned long n;
+ int ret, offset;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ return -EFAULT;
+
+ pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1);
+ if (!pages)
+ return -ENOMEM;
+
+ /*
+ * The secret must be copied into contiguous memory region, lets verify
+ * that userspace memory pages are contiguous before we issue command.
+ */
+ if (get_num_contig_pages(0, pages, n) != n) {
+ ret = -EINVAL;
+ goto e_unpin_memory;
+ }
+
+ ret = -ENOMEM;
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
+ if (!data)
+ goto e_unpin_memory;
+
+ offset = params.guest_uaddr & (PAGE_SIZE - 1);
+ data->guest_address = __sme_page_pa(pages[0]) + offset;
+ data->guest_len = params.guest_len;
+
+ blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
+ if (IS_ERR(blob)) {
+ ret = PTR_ERR(blob);
+ goto e_free;
+ }
+
+ data->trans_address = __psp_pa(blob);
+ data->trans_len = params.trans_len;
+
+ hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
+ if (IS_ERR(hdr)) {
+ ret = PTR_ERR(hdr);
+ goto e_free_blob;
+ }
+ data->hdr_address = __psp_pa(hdr);
+ data->hdr_len = params.hdr_len;
+
+ data->handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, data, &argp->error);
+
+ kfree(hdr);
+
+e_free_blob:
+ kfree(blob);
+e_free:
+ kfree(data);
+e_unpin_memory:
+ sev_unpin_memory(kvm, pages, n);
+ return ret;
+}
+
+int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
+{
+ struct kvm_sev_cmd sev_cmd;
+ int r;
+
+ if (!svm_sev_enabled())
+ return -ENOTTY;
+
+ if (!argp)
+ return 0;
+
+ if (copy_from_user(&sev_cmd, argp, sizeof(struct kvm_sev_cmd)))
+ return -EFAULT;
+
+ mutex_lock(&kvm->lock);
+
+ switch (sev_cmd.id) {
+ case KVM_SEV_INIT:
+ r = sev_guest_init(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_START:
+ r = sev_launch_start(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_UPDATE_DATA:
+ r = sev_launch_update_data(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_MEASURE:
+ r = sev_launch_measure(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_FINISH:
+ r = sev_launch_finish(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_GUEST_STATUS:
+ r = sev_guest_status(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_DBG_DECRYPT:
+ r = sev_dbg_crypt(kvm, &sev_cmd, true);
+ break;
+ case KVM_SEV_DBG_ENCRYPT:
+ r = sev_dbg_crypt(kvm, &sev_cmd, false);
+ break;
+ case KVM_SEV_LAUNCH_SECRET:
+ r = sev_launch_secret(kvm, &sev_cmd);
+ break;
+ default:
+ r = -EINVAL;
+ goto out;
+ }
+
+ if (copy_to_user(argp, &sev_cmd, sizeof(struct kvm_sev_cmd)))
+ r = -EFAULT;
+
+out:
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+int svm_register_enc_region(struct kvm *kvm,
+ struct kvm_enc_region *range)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct enc_region *region;
+ int ret = 0;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
+ return -EINVAL;
+
+ region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
+ if (!region)
+ return -ENOMEM;
+
+ region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
+ if (!region->pages) {
+ ret = -ENOMEM;
+ goto e_free;
+ }
+
+ /*
+ * The guest may change the memory encryption attribute from C=0 -> C=1
+ * or vice versa for this memory range. Lets make sure caches are
+ * flushed to ensure that guest data gets written into memory with
+ * correct C-bit.
+ */
+ sev_clflush_pages(region->pages, region->npages);
+
+ region->uaddr = range->addr;
+ region->size = range->size;
+
+ mutex_lock(&kvm->lock);
+ list_add_tail(&region->list, &sev->regions_list);
+ mutex_unlock(&kvm->lock);
+
+ return ret;
+
+e_free:
+ kfree(region);
+ return ret;
+}
+
+static struct enc_region *
+find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct list_head *head = &sev->regions_list;
+ struct enc_region *i;
+
+ list_for_each_entry(i, head, list) {
+ if (i->uaddr == range->addr &&
+ i->size == range->size)
+ return i;
+ }
+
+ return NULL;
+}
+
+static void __unregister_enc_region_locked(struct kvm *kvm,
+ struct enc_region *region)
+{
+ sev_unpin_memory(kvm, region->pages, region->npages);
+ list_del(&region->list);
+ kfree(region);
+}
+
+int svm_unregister_enc_region(struct kvm *kvm,
+ struct kvm_enc_region *range)
+{
+ struct enc_region *region;
+ int ret;
+
+ mutex_lock(&kvm->lock);
+
+ if (!sev_guest(kvm)) {
+ ret = -ENOTTY;
+ goto failed;
+ }
+
+ region = find_enc_region(kvm, range);
+ if (!region) {
+ ret = -EINVAL;
+ goto failed;
+ }
+
+ /*
+ * Ensure that all guest tagged cache entries are flushed before
+ * releasing the pages back to the system for use. CLFLUSH will
+ * not do this, so issue a WBINVD.
+ */
+ wbinvd_on_all_cpus();
+
+ __unregister_enc_region_locked(kvm, region);
+
+ mutex_unlock(&kvm->lock);
+ return 0;
+
+failed:
+ mutex_unlock(&kvm->lock);
+ return ret;
+}
+
+void sev_vm_destroy(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct list_head *head = &sev->regions_list;
+ struct list_head *pos, *q;
+
+ if (!sev_guest(kvm))
+ return;
+
+ mutex_lock(&kvm->lock);
+
+ /*
+ * Ensure that all guest tagged cache entries are flushed before
+ * releasing the pages back to the system for use. CLFLUSH will
+ * not do this, so issue a WBINVD.
+ */
+ wbinvd_on_all_cpus();
+
+ /*
+ * if userspace was terminated before unregistering the memory regions
+ * then lets unpin all the registered memory.
+ */
+ if (!list_empty(head)) {
+ list_for_each_safe(pos, q, head) {
+ __unregister_enc_region_locked(kvm,
+ list_entry(pos, struct enc_region, list));
+ }
+ }
+
+ mutex_unlock(&kvm->lock);
+
+ sev_unbind_asid(kvm, sev->handle);
+ sev_asid_free(sev->asid);
+}
+
+int __init sev_hardware_setup(void)
+{
+ struct sev_user_data_status *status;
+ int rc;
+
+ /* Maximum number of encrypted guests supported simultaneously */
+ max_sev_asid = cpuid_ecx(0x8000001F);
+
+ if (!max_sev_asid)
+ return 1;
+
+ /* Minimum ASID value that should be used for SEV guest */
+ min_sev_asid = cpuid_edx(0x8000001F);
+
+ /* Initialize SEV ASID bitmaps */
+ sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
+ if (!sev_asid_bitmap)
+ return 1;
+
+ sev_reclaim_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
+ if (!sev_reclaim_asid_bitmap)
+ return 1;
+
+ status = kmalloc(sizeof(*status), GFP_KERNEL);
+ if (!status)
+ return 1;
+
+ /*
+ * Check SEV platform status.
+ *
+ * PLATFORM_STATUS can be called in any state, if we failed to query
+ * the PLATFORM status then either PSP firmware does not support SEV
+ * feature or SEV firmware is dead.
+ */
+ rc = sev_platform_status(status, NULL);
+ if (rc)
+ goto err;
+
+ pr_info("SEV supported\n");
+
+err:
+ kfree(status);
+ return rc;
+}
+
+void sev_hardware_teardown(void)
+{
+ bitmap_free(sev_asid_bitmap);
+ bitmap_free(sev_reclaim_asid_bitmap);
+
+ sev_flush_asids();
+}
+
+void pre_sev_run(struct vcpu_svm *svm, int cpu)
+{
+ struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+ int asid = sev_get_asid(svm->vcpu.kvm);
+
+ /* Assign the asid allocated with this SEV guest */
+ svm->vmcb->control.asid = asid;
+
+ /*
+ * Flush guest TLB:
+ *
+ * 1) when different VMCB for the same ASID is to be run on the same host CPU.
+ * 2) or this VMCB was executed on different host CPU in previous VMRUNs.
+ */
+ if (sd->sev_vmcbs[asid] == svm->vmcb &&
+ svm->last_cpu == cpu)
+ return;
+
+ svm->last_cpu = cpu;
+ sd->sev_vmcbs[asid] = svm->vmcb;
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
+ mark_dirty(svm->vmcb, VMCB_ASID);
+}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm/svm.c
index 851e9cc79930..2be5bbae3a40 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1,17 +1,3 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * AMD SVM support
- *
- * Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
- *
- * Authors:
- * Yaniv Kamay <yaniv@qumranet.com>
- * Avi Kivity <avi@qumranet.com>
- */
-
#define pr_fmt(fmt) "SVM: " fmt
#include <linux/kvm_host.h>
@@ -28,10 +14,10 @@
#include <linux/kernel.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
+#include <linux/amd-iommu.h>
#include <linux/sched.h>
#include <linux/trace_events.h>
#include <linux/slab.h>
-#include <linux/amd-iommu.h>
#include <linux/hashtable.h>
#include <linux/frame.h>
#include <linux/psp-sev.h>
@@ -53,6 +39,8 @@
#include <asm/virtext.h>
#include "trace.h"
+#include "svm.h"
+
#define __ex(x) __kvm_handle_fault_on_reboot(x)
MODULE_AUTHOR("Qumranet");
@@ -80,107 +68,15 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
#define SVM_FEATURE_DECODE_ASSIST (1 << 7)
#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
-#define SVM_AVIC_DOORBELL 0xc001011b
-
-#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
-#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
-#define NESTED_EXIT_CONTINUE 2 /* Further checks needed */
-
#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
#define TSC_RATIO_RSVD 0xffffff0000000000ULL
#define TSC_RATIO_MIN 0x0000000000000001ULL
#define TSC_RATIO_MAX 0x000000ffffffffffULL
-#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF)
-
-/*
- * 0xff is broadcast, so the max index allowed for physical APIC ID
- * table is 0xfe. APIC IDs above 0xff are reserved.
- */
-#define AVIC_MAX_PHYSICAL_ID_COUNT 255
-
-#define AVIC_UNACCEL_ACCESS_WRITE_MASK 1
-#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0
-#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF
-
-/* AVIC GATAG is encoded using VM and VCPU IDs */
-#define AVIC_VCPU_ID_BITS 8
-#define AVIC_VCPU_ID_MASK ((1 << AVIC_VCPU_ID_BITS) - 1)
-
-#define AVIC_VM_ID_BITS 24
-#define AVIC_VM_ID_NR (1 << AVIC_VM_ID_BITS)
-#define AVIC_VM_ID_MASK ((1 << AVIC_VM_ID_BITS) - 1)
-
-#define AVIC_GATAG(x, y) (((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \
- (y & AVIC_VCPU_ID_MASK))
-#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK)
-#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK)
-
static bool erratum_383_found __read_mostly;
-static const u32 host_save_user_msrs[] = {
-#ifdef CONFIG_X86_64
- MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
- MSR_FS_BASE,
-#endif
- MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
- MSR_TSC_AUX,
-};
-
-#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
-
-struct kvm_sev_info {
- bool active; /* SEV enabled guest */
- unsigned int asid; /* ASID used for this guest */
- unsigned int handle; /* SEV firmware handle */
- int fd; /* SEV device fd */
- unsigned long pages_locked; /* Number of pages locked */
- struct list_head regions_list; /* List of registered regions */
-};
-
-struct kvm_svm {
- struct kvm kvm;
-
- /* Struct members for AVIC */
- u32 avic_vm_id;
- struct page *avic_logical_id_table_page;
- struct page *avic_physical_id_table_page;
- struct hlist_node hnode;
-
- struct kvm_sev_info sev_info;
-};
-
-struct kvm_vcpu;
-
-struct nested_state {
- struct vmcb *hsave;
- u64 hsave_msr;
- u64 vm_cr_msr;
- u64 vmcb;
-
- /* These are the merged vectors */
- u32 *msrpm;
-
- /* gpa pointers to the real vectors */
- u64 vmcb_msrpm;
- u64 vmcb_iopm;
-
- /* A VMEXIT is required but not yet emulated */
- bool exit_required;
-
- /* cache for intercepts of the guest */
- u32 intercept_cr;
- u32 intercept_dr;
- u32 intercept_exceptions;
- u64 intercept;
-
- /* Nested Paging related state */
- u64 nested_cr3;
-};
-
-#define MSRPM_OFFSETS 16
-static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
+u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
/*
* Set osvw_len to higher value when updated Revision Guides
@@ -188,92 +84,9 @@ static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
*/
static uint64_t osvw_len = 4, osvw_status;
-struct vcpu_svm {
- struct kvm_vcpu vcpu;
- struct vmcb *vmcb;
- unsigned long vmcb_pa;
- struct svm_cpu_data *svm_data;
- uint64_t asid_generation;
- uint64_t sysenter_esp;
- uint64_t sysenter_eip;
- uint64_t tsc_aux;
-
- u64 msr_decfg;
-
- u64 next_rip;
-
- u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
- struct {
- u16 fs;
- u16 gs;
- u16 ldt;
- u64 gs_base;
- } host;
-
- u64 spec_ctrl;
- /*
- * Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be
- * translated into the appropriate L2_CFG bits on the host to
- * perform speculative control.
- */
- u64 virt_spec_ctrl;
-
- u32 *msrpm;
-
- ulong nmi_iret_rip;
-
- struct nested_state nested;
-
- bool nmi_singlestep;
- u64 nmi_singlestep_guest_rflags;
-
- unsigned int3_injected;
- unsigned long int3_rip;
-
- /* cached guest cpuid flags for faster access */
- bool nrips_enabled : 1;
-
- u32 ldr_reg;
- u32 dfr_reg;
- struct page *avic_backing_page;
- u64 *avic_physical_id_cache;
- bool avic_is_running;
-
- /*
- * Per-vcpu list of struct amd_svm_iommu_ir:
- * This is used mainly to store interrupt remapping information used
- * when update the vcpu affinity. This avoids the need to scan for
- * IRTE and try to match ga_tag in the IOMMU driver.
- */
- struct list_head ir_list;
- spinlock_t ir_list_lock;
-
- /* which host CPU was used for running this vcpu */
- unsigned int last_cpu;
-};
-
-/*
- * This is a wrapper of struct amd_iommu_ir_data.
- */
-struct amd_svm_iommu_ir {
- struct list_head node; /* Used by SVM for per-vcpu ir_list */
- void *data; /* Storing pointer to struct amd_ir_data */
-};
-
-#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF)
-#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31
-#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
-
-#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL)
-#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12)
-#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
-#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)
-
static DEFINE_PER_CPU(u64, current_tsc_ratio);
#define TSC_RATIO_DEFAULT 0x0100000000ULL
-#define MSR_INVALID 0xffffffffU
-
static const struct svm_direct_access_msrs {
u32 index; /* Index of the MSR */
bool always; /* True if intercept is always on */
@@ -299,9 +112,9 @@ static const struct svm_direct_access_msrs {
/* enable NPT for AMD64 and X86 with PAE */
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
-static bool npt_enabled = true;
+bool npt_enabled = true;
#else
-static bool npt_enabled;
+bool npt_enabled;
#endif
/*
@@ -360,12 +173,6 @@ module_param(npt, int, S_IRUGO);
static int nested = true;
module_param(nested, int, S_IRUGO);
-/* enable / disable AVIC */
-static int avic;
-#ifdef CONFIG_X86_LOCAL_APIC
-module_param(avic, int, S_IRUGO);
-#endif
-
/* enable/disable Next RIP Save */
static int nrips = true;
module_param(nrips, int, 0444);
@@ -387,303 +194,7 @@ module_param(dump_invalid_vmcb, bool, 0644);
static u8 rsm_ins_bytes[] = "\x0f\xaa";
-static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
-static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa);
static void svm_complete_interrupts(struct vcpu_svm *svm);
-static void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate);
-static inline void avic_post_state_restore(struct kvm_vcpu *vcpu);
-
-static int nested_svm_exit_handled(struct vcpu_svm *svm);
-static int nested_svm_intercept(struct vcpu_svm *svm);
-static int nested_svm_vmexit(struct vcpu_svm *svm);
-static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
- bool has_error_code, u32 error_code);
-
-enum {
- VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
- pause filter count */
- VMCB_PERM_MAP, /* IOPM Base and MSRPM Base */
- VMCB_ASID, /* ASID */
- VMCB_INTR, /* int_ctl, int_vector */
- VMCB_NPT, /* npt_en, nCR3, gPAT */
- VMCB_CR, /* CR0, CR3, CR4, EFER */
- VMCB_DR, /* DR6, DR7 */
- VMCB_DT, /* GDT, IDT */
- VMCB_SEG, /* CS, DS, SS, ES, CPL */
- VMCB_CR2, /* CR2 only */
- VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
- VMCB_AVIC, /* AVIC APIC_BAR, AVIC APIC_BACKING_PAGE,
- * AVIC PHYSICAL_TABLE pointer,
- * AVIC LOGICAL_TABLE pointer
- */
- VMCB_DIRTY_MAX,
-};
-
-/* TPR and CR2 are always written before VMRUN */
-#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2))
-
-#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
-
-static int sev_flush_asids(void);
-static DECLARE_RWSEM(sev_deactivate_lock);
-static DEFINE_MUTEX(sev_bitmap_lock);
-static unsigned int max_sev_asid;
-static unsigned int min_sev_asid;
-static unsigned long *sev_asid_bitmap;
-static unsigned long *sev_reclaim_asid_bitmap;
-#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
-
-struct enc_region {
- struct list_head list;
- unsigned long npages;
- struct page **pages;
- unsigned long uaddr;
- unsigned long size;
-};
-
-
-static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm)
-{
- return container_of(kvm, struct kvm_svm, kvm);
-}
-
-static inline bool svm_sev_enabled(void)
-{
- return IS_ENABLED(CONFIG_KVM_AMD_SEV) ? max_sev_asid : 0;
-}
-
-static inline bool sev_guest(struct kvm *kvm)
-{
-#ifdef CONFIG_KVM_AMD_SEV
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-
- return sev->active;
-#else
- return false;
-#endif
-}
-
-static inline int sev_get_asid(struct kvm *kvm)
-{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-
- return sev->asid;
-}
-
-static inline void mark_all_dirty(struct vmcb *vmcb)
-{
- vmcb->control.clean = 0;
-}
-
-static inline void mark_all_clean(struct vmcb *vmcb)
-{
- vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
- & ~VMCB_ALWAYS_DIRTY_MASK;
-}
-
-static inline void mark_dirty(struct vmcb *vmcb, int bit)
-{
- vmcb->control.clean &= ~(1 << bit);
-}
-
-static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
-{
- return container_of(vcpu, struct vcpu_svm, vcpu);
-}
-
-static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data)
-{
- svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK;
- mark_dirty(svm->vmcb, VMCB_AVIC);
-}
-
-static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- u64 *entry = svm->avic_physical_id_cache;
-
- if (!entry)
- return false;
-
- return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
-}
-
-static void recalc_intercepts(struct vcpu_svm *svm)
-{
- struct vmcb_control_area *c, *h;
- struct nested_state *g;
-
- mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
-
- if (!is_guest_mode(&svm->vcpu))
- return;
-
- c = &svm->vmcb->control;
- h = &svm->nested.hsave->control;
- g = &svm->nested;
-
- c->intercept_cr = h->intercept_cr;
- c->intercept_dr = h->intercept_dr;
- c->intercept_exceptions = h->intercept_exceptions;
- c->intercept = h->intercept;
-
- if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
- /* We only want the cr8 intercept bits of L1 */
- c->intercept_cr &= ~(1U << INTERCEPT_CR8_READ);
- c->intercept_cr &= ~(1U << INTERCEPT_CR8_WRITE);
-
- /*
- * Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not
- * affect any interrupt we may want to inject; therefore,
- * interrupt window vmexits are irrelevant to L0.
- */
- c->intercept &= ~(1ULL << INTERCEPT_VINTR);
- }
-
- /* We don't want to see VMMCALLs from a nested guest */
- c->intercept &= ~(1ULL << INTERCEPT_VMMCALL);
-
- c->intercept_cr |= g->intercept_cr;
- c->intercept_dr |= g->intercept_dr;
- c->intercept_exceptions |= g->intercept_exceptions;
- c->intercept |= g->intercept;
-}
-
-static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
-{
- if (is_guest_mode(&svm->vcpu))
- return svm->nested.hsave;
- else
- return svm->vmcb;
-}
-
-static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
-{
- struct vmcb *vmcb = get_host_vmcb(svm);
-
- vmcb->control.intercept_cr |= (1U << bit);
-
- recalc_intercepts(svm);
-}
-
-static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
-{
- struct vmcb *vmcb = get_host_vmcb(svm);
-
- vmcb->control.intercept_cr &= ~(1U << bit);
-
- recalc_intercepts(svm);
-}
-
-static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
-{
- struct vmcb *vmcb = get_host_vmcb(svm);
-
- return vmcb->control.intercept_cr & (1U << bit);
-}
-
-static inline void set_dr_intercepts(struct vcpu_svm *svm)
-{
- struct vmcb *vmcb = get_host_vmcb(svm);
-
- vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ)
- | (1 << INTERCEPT_DR1_READ)
- | (1 << INTERCEPT_DR2_READ)
- | (1 << INTERCEPT_DR3_READ)
- | (1 << INTERCEPT_DR4_READ)
- | (1 << INTERCEPT_DR5_READ)
- | (1 << INTERCEPT_DR6_READ)
- | (1 << INTERCEPT_DR7_READ)
- | (1 << INTERCEPT_DR0_WRITE)
- | (1 << INTERCEPT_DR1_WRITE)
- | (1 << INTERCEPT_DR2_WRITE)
- | (1 << INTERCEPT_DR3_WRITE)
- | (1 << INTERCEPT_DR4_WRITE)
- | (1 << INTERCEPT_DR5_WRITE)
- | (1 << INTERCEPT_DR6_WRITE)
- | (1 << INTERCEPT_DR7_WRITE);
-
- recalc_intercepts(svm);
-}
-
-static inline void clr_dr_intercepts(struct vcpu_svm *svm)
-{
- struct vmcb *vmcb = get_host_vmcb(svm);
-
- vmcb->control.intercept_dr = 0;
-
- recalc_intercepts(svm);
-}
-
-static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
-{
- struct vmcb *vmcb = get_host_vmcb(svm);
-
- vmcb->control.intercept_exceptions |= (1U << bit);
-
- recalc_intercepts(svm);
-}
-
-static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
-{
- struct vmcb *vmcb = get_host_vmcb(svm);
-
- vmcb->control.intercept_exceptions &= ~(1U << bit);
-
- recalc_intercepts(svm);
-}
-
-static inline void set_intercept(struct vcpu_svm *svm, int bit)
-{
- struct vmcb *vmcb = get_host_vmcb(svm);
-
- vmcb->control.intercept |= (1ULL << bit);
-
- recalc_intercepts(svm);
-}
-
-static inline void clr_intercept(struct vcpu_svm *svm, int bit)
-{
- struct vmcb *vmcb = get_host_vmcb(svm);
-
- vmcb->control.intercept &= ~(1ULL << bit);
-
- recalc_intercepts(svm);
-}
-
-static inline bool is_intercept(struct vcpu_svm *svm, int bit)
-{
- return (svm->vmcb->control.intercept & (1ULL << bit)) != 0;
-}
-
-static inline bool vgif_enabled(struct vcpu_svm *svm)
-{
- return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK);
-}
-
-static inline void enable_gif(struct vcpu_svm *svm)
-{
- if (vgif_enabled(svm))
- svm->vmcb->control.int_ctl |= V_GIF_MASK;
- else
- svm->vcpu.arch.hflags |= HF_GIF_MASK;
-}
-
-static inline void disable_gif(struct vcpu_svm *svm)
-{
- if (vgif_enabled(svm))
- svm->vmcb->control.int_ctl &= ~V_GIF_MASK;
- else
- svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
-}
-
-static inline bool gif_set(struct vcpu_svm *svm)
-{
- if (vgif_enabled(svm))
- return !!(svm->vmcb->control.int_ctl & V_GIF_MASK);
- else
- return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
-}
static unsigned long iopm_base;
@@ -696,23 +207,7 @@ struct kvm_ldttss_desc {
u32 zero1;
} __attribute__((packed));
-struct svm_cpu_data {
- int cpu;
-
- u64 asid_generation;
- u32 max_asid;
- u32 next_asid;
- u32 min_asid;
- struct kvm_ldttss_desc *tss_desc;
-
- struct page *save_area;
- struct vmcb *current_vmcb;
-
- /* index = sev_asid, value = vmcb pointer */
- struct vmcb **sev_vmcbs;
-};
-
-static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
+DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
@@ -720,7 +215,7 @@ static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
#define MSRS_RANGE_SIZE 2048
#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
-static u32 svm_msrpm_offset(u32 msr)
+u32 svm_msrpm_offset(u32 msr)
{
u32 offset;
int i;
@@ -767,7 +262,7 @@ static int get_npt_level(struct kvm_vcpu *vcpu)
#endif
}
-static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
+void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
vcpu->arch.efer = efer;
@@ -1198,7 +693,7 @@ static void svm_disable_lbrv(struct vcpu_svm *svm)
set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
}
-static void disable_nmi_singlestep(struct vcpu_svm *svm)
+void disable_nmi_singlestep(struct vcpu_svm *svm)
{
svm->nmi_singlestep = false;
@@ -1211,97 +706,6 @@ static void disable_nmi_singlestep(struct vcpu_svm *svm)
}
}
-/* Note:
- * This hash table is used to map VM_ID to a struct kvm_svm,
- * when handling AMD IOMMU GALOG notification to schedule in
- * a particular vCPU.
- */
-#define SVM_VM_DATA_HASH_BITS 8
-static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
-static u32 next_vm_id = 0;
-static bool next_vm_id_wrapped = 0;
-static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
-
-/* Note:
- * This function is called from IOMMU driver to notify
- * SVM to schedule in a particular vCPU of a particular VM.
- */
-static int avic_ga_log_notifier(u32 ga_tag)
-{
- unsigned long flags;
- struct kvm_svm *kvm_svm;
- struct kvm_vcpu *vcpu = NULL;
- u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
- u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
-
- pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
- trace_kvm_avic_ga_log(vm_id, vcpu_id);
-
- spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
- hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
- if (kvm_svm->avic_vm_id != vm_id)
- continue;
- vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id);
- break;
- }
- spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
-
- /* Note:
- * At this point, the IOMMU should have already set the pending
- * bit in the vAPIC backing page. So, we just need to schedule
- * in the vcpu.
- */
- if (vcpu)
- kvm_vcpu_wake_up(vcpu);
-
- return 0;
-}
-
-static __init int sev_hardware_setup(void)
-{
- struct sev_user_data_status *status;
- int rc;
-
- /* Maximum number of encrypted guests supported simultaneously */
- max_sev_asid = cpuid_ecx(0x8000001F);
-
- if (!max_sev_asid)
- return 1;
-
- /* Minimum ASID value that should be used for SEV guest */
- min_sev_asid = cpuid_edx(0x8000001F);
-
- /* Initialize SEV ASID bitmaps */
- sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
- if (!sev_asid_bitmap)
- return 1;
-
- sev_reclaim_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
- if (!sev_reclaim_asid_bitmap)
- return 1;
-
- status = kmalloc(sizeof(*status), GFP_KERNEL);
- if (!status)
- return 1;
-
- /*
- * Check SEV platform status.
- *
- * PLATFORM_STATUS can be called in any state, if we failed to query
- * the PLATFORM status then either PSP firmware does not support SEV
- * feature or SEV firmware is dead.
- */
- rc = sev_platform_status(status, NULL);
- if (rc)
- goto err;
-
- pr_info("SEV supported\n");
-
-err:
- kfree(status);
- return rc;
-}
-
static void grow_ple_window(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1383,12 +787,8 @@ static void svm_hardware_teardown(void)
{
int cpu;
- if (svm_sev_enabled()) {
- bitmap_free(sev_asid_bitmap);
- bitmap_free(sev_reclaim_asid_bitmap);
-
- sev_flush_asids();
- }
+ if (svm_sev_enabled())
+ sev_hardware_teardown();
for_each_possible_cpu(cpu)
svm_cpu_uninit(cpu);
@@ -1585,24 +985,6 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
return svm->vmcb->control.tsc_offset;
}
-static void avic_init_vmcb(struct vcpu_svm *svm)
-{
- struct vmcb *vmcb = svm->vmcb;
- struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
- phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
- phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
- phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page));
-
- vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
- vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
- vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
- vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT;
- if (kvm_apicv_activated(svm->vcpu.kvm))
- vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
- else
- vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
-}
-
static void init_vmcb(struct vcpu_svm *svm)
{
struct vmcb_control_area *control = &svm->vmcb->control;
@@ -1762,449 +1144,6 @@ static void init_vmcb(struct vcpu_svm *svm)
}
-static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
- unsigned int index)
-{
- u64 *avic_physical_id_table;
- struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
-
- if (index >= AVIC_MAX_PHYSICAL_ID_COUNT)
- return NULL;
-
- avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);
-
- return &avic_physical_id_table[index];
-}
-
-/**
- * Note:
- * AVIC hardware walks the nested page table to check permissions,
- * but does not use the SPA address specified in the leaf page
- * table entry since it uses address in the AVIC_BACKING_PAGE pointer
- * field of the VMCB. Therefore, we set up the
- * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
- */
-static int avic_update_access_page(struct kvm *kvm, bool activate)
-{
- int ret = 0;
-
- mutex_lock(&kvm->slots_lock);
- /*
- * During kvm_destroy_vm(), kvm_pit_set_reinject() could trigger
- * APICv mode change, which update APIC_ACCESS_PAGE_PRIVATE_MEMSLOT
- * memory region. So, we need to ensure that kvm->mm == current->mm.
- */
- if ((kvm->arch.apic_access_page_done == activate) ||
- (kvm->mm != current->mm))
- goto out;
-
- ret = __x86_set_memory_region(kvm,
- APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
- APIC_DEFAULT_PHYS_BASE,
- activate ? PAGE_SIZE : 0);
- if (ret)
- goto out;
-
- kvm->arch.apic_access_page_done = activate;
-out:
- mutex_unlock(&kvm->slots_lock);
- return ret;
-}
-
-static int avic_init_backing_page(struct kvm_vcpu *vcpu)
-{
- u64 *entry, new_entry;
- int id = vcpu->vcpu_id;
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (id >= AVIC_MAX_PHYSICAL_ID_COUNT)
- return -EINVAL;
-
- if (!svm->vcpu.arch.apic->regs)
- return -EINVAL;
-
- if (kvm_apicv_activated(vcpu->kvm)) {
- int ret;
-
- ret = avic_update_access_page(vcpu->kvm, true);
- if (ret)
- return ret;
- }
-
- svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs);
-
- /* Setting AVIC backing page address in the phy APIC ID table */
- entry = avic_get_physical_id_entry(vcpu, id);
- if (!entry)
- return -EINVAL;
-
- new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
- AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
- AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
- WRITE_ONCE(*entry, new_entry);
-
- svm->avic_physical_id_cache = entry;
-
- return 0;
-}
-
-static void sev_asid_free(int asid)
-{
- struct svm_cpu_data *sd;
- int cpu, pos;
-
- mutex_lock(&sev_bitmap_lock);
-
- pos = asid - 1;
- __set_bit(pos, sev_reclaim_asid_bitmap);
-
- for_each_possible_cpu(cpu) {
- sd = per_cpu(svm_data, cpu);
- sd->sev_vmcbs[pos] = NULL;
- }
-
- mutex_unlock(&sev_bitmap_lock);
-}
-
-static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
-{
- struct sev_data_decommission *decommission;
- struct sev_data_deactivate *data;
-
- if (!handle)
- return;
-
- data = kzalloc(sizeof(*data), GFP_KERNEL);
- if (!data)
- return;
-
- /* deactivate handle */
- data->handle = handle;
-
- /* Guard DEACTIVATE against WBINVD/DF_FLUSH used in ASID recycling */
- down_read(&sev_deactivate_lock);
- sev_guest_deactivate(data, NULL);
- up_read(&sev_deactivate_lock);
-
- kfree(data);
-
- decommission = kzalloc(sizeof(*decommission), GFP_KERNEL);
- if (!decommission)
- return;
-
- /* decommission handle */
- decommission->handle = handle;
- sev_guest_decommission(decommission, NULL);
-
- kfree(decommission);
-}
-
-static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
- unsigned long ulen, unsigned long *n,
- int write)
-{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- unsigned long npages, npinned, size;
- unsigned long locked, lock_limit;
- struct page **pages;
- unsigned long first, last;
-
- if (ulen == 0 || uaddr + ulen < uaddr)
- return NULL;
-
- /* Calculate number of pages. */
- first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
- last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;
- npages = (last - first + 1);
-
- locked = sev->pages_locked + npages;
- lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
- pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit);
- return NULL;
- }
-
- /* Avoid using vmalloc for smaller buffers. */
- size = npages * sizeof(struct page *);
- if (size > PAGE_SIZE)
- pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO,
- PAGE_KERNEL);
- else
- pages = kmalloc(size, GFP_KERNEL_ACCOUNT);
-
- if (!pages)
- return NULL;
-
- /* Pin the user virtual address. */
- npinned = get_user_pages_fast(uaddr, npages, FOLL_WRITE, pages);
- if (npinned != npages) {
- pr_err("SEV: Failure locking %lu pages.\n", npages);
- goto err;
- }
-
- *n = npages;
- sev->pages_locked = locked;
-
- return pages;
-
-err:
- if (npinned > 0)
- release_pages(pages, npinned);
-
- kvfree(pages);
- return NULL;
-}
-
-static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
- unsigned long npages)
-{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-
- release_pages(pages, npages);
- kvfree(pages);
- sev->pages_locked -= npages;
-}
-
-static void sev_clflush_pages(struct page *pages[], unsigned long npages)
-{
- uint8_t *page_virtual;
- unsigned long i;
-
- if (npages == 0 || pages == NULL)
- return;
-
- for (i = 0; i < npages; i++) {
- page_virtual = kmap_atomic(pages[i]);
- clflush_cache_range(page_virtual, PAGE_SIZE);
- kunmap_atomic(page_virtual);
- }
-}
-
-static void __unregister_enc_region_locked(struct kvm *kvm,
- struct enc_region *region)
-{
- sev_unpin_memory(kvm, region->pages, region->npages);
- list_del(&region->list);
- kfree(region);
-}
-
-static void sev_vm_destroy(struct kvm *kvm)
-{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct list_head *head = &sev->regions_list;
- struct list_head *pos, *q;
-
- if (!sev_guest(kvm))
- return;
-
- mutex_lock(&kvm->lock);
-
- /*
- * Ensure that all guest tagged cache entries are flushed before
- * releasing the pages back to the system for use. CLFLUSH will
- * not do this, so issue a WBINVD.
- */
- wbinvd_on_all_cpus();
-
- /*
- * if userspace was terminated before unregistering the memory regions
- * then lets unpin all the registered memory.
- */
- if (!list_empty(head)) {
- list_for_each_safe(pos, q, head) {
- __unregister_enc_region_locked(kvm,
- list_entry(pos, struct enc_region, list));
- }
- }
-
- mutex_unlock(&kvm->lock);
-
- sev_unbind_asid(kvm, sev->handle);
- sev_asid_free(sev->asid);
-}
-
-static void avic_vm_destroy(struct kvm *kvm)
-{
- unsigned long flags;
- struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
-
- if (!avic)
- return;
-
- if (kvm_svm->avic_logical_id_table_page)
- __free_page(kvm_svm->avic_logical_id_table_page);
- if (kvm_svm->avic_physical_id_table_page)
- __free_page(kvm_svm->avic_physical_id_table_page);
-
- spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
- hash_del(&kvm_svm->hnode);
- spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
-}
-
-static void svm_vm_destroy(struct kvm *kvm)
-{
- avic_vm_destroy(kvm);
- sev_vm_destroy(kvm);
-}
-
-static int avic_vm_init(struct kvm *kvm)
-{
- unsigned long flags;
- int err = -ENOMEM;
- struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
- struct kvm_svm *k2;
- struct page *p_page;
- struct page *l_page;
- u32 vm_id;
-
- if (!avic)
- return 0;
-
- /* Allocating physical APIC ID table (4KB) */
- p_page = alloc_page(GFP_KERNEL_ACCOUNT);
- if (!p_page)
- goto free_avic;
-
- kvm_svm->avic_physical_id_table_page = p_page;
- clear_page(page_address(p_page));
-
- /* Allocating logical APIC ID table (4KB) */
- l_page = alloc_page(GFP_KERNEL_ACCOUNT);
- if (!l_page)
- goto free_avic;
-
- kvm_svm->avic_logical_id_table_page = l_page;
- clear_page(page_address(l_page));
-
- spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
- again:
- vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
- if (vm_id == 0) { /* id is 1-based, zero is not okay */
- next_vm_id_wrapped = 1;
- goto again;
- }
- /* Is it still in use? Only possible if wrapped at least once */
- if (next_vm_id_wrapped) {
- hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) {
- if (k2->avic_vm_id == vm_id)
- goto again;
- }
- }
- kvm_svm->avic_vm_id = vm_id;
- hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
- spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
-
- return 0;
-
-free_avic:
- avic_vm_destroy(kvm);
- return err;
-}
-
-static int svm_vm_init(struct kvm *kvm)
-{
- if (avic) {
- int ret = avic_vm_init(kvm);
- if (ret)
- return ret;
- }
-
- kvm_apicv_init(kvm, avic);
- return 0;
-}
-
-static inline int
-avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
-{
- int ret = 0;
- unsigned long flags;
- struct amd_svm_iommu_ir *ir;
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (!kvm_arch_has_assigned_device(vcpu->kvm))
- return 0;
-
- /*
- * Here, we go through the per-vcpu ir_list to update all existing
- * interrupt remapping table entry targeting this vcpu.
- */
- spin_lock_irqsave(&svm->ir_list_lock, flags);
-
- if (list_empty(&svm->ir_list))
- goto out;
-
- list_for_each_entry(ir, &svm->ir_list, node) {
- ret = amd_iommu_update_ga(cpu, r, ir->data);
- if (ret)
- break;
- }
-out:
- spin_unlock_irqrestore(&svm->ir_list_lock, flags);
- return ret;
-}
-
-static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
-{
- u64 entry;
- /* ID = 0xff (broadcast), ID > 0xff (reserved) */
- int h_physical_id = kvm_cpu_get_apicid(cpu);
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (!kvm_vcpu_apicv_active(vcpu))
- return;
-
- /*
- * Since the host physical APIC id is 8 bits,
- * we can support host APIC ID upto 255.
- */
- if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
- return;
-
- entry = READ_ONCE(*(svm->avic_physical_id_cache));
- WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
-
- entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
- entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
-
- entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
- if (svm->avic_is_running)
- entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
-
- WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
- avic_update_iommu_vcpu_affinity(vcpu, h_physical_id,
- svm->avic_is_running);
-}
-
-static void avic_vcpu_put(struct kvm_vcpu *vcpu)
-{
- u64 entry;
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (!kvm_vcpu_apicv_active(vcpu))
- return;
-
- entry = READ_ONCE(*(svm->avic_physical_id_cache));
- if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
- avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
-
- entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
- WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
-}
-
-/**
- * This function is called during VCPU halt/unhalt.
- */
-static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->avic_is_running = is_run;
- if (is_run)
- avic_vcpu_load(vcpu, vcpu->cpu);
- else
- avic_vcpu_put(vcpu);
-}
-
static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -2229,25 +1168,6 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
}
-static int avic_init_vcpu(struct vcpu_svm *svm)
-{
- int ret;
- struct kvm_vcpu *vcpu = &svm->vcpu;
-
- if (!avic || !irqchip_in_kernel(vcpu->kvm))
- return 0;
-
- ret = avic_init_backing_page(&svm->vcpu);
- if (ret)
- return ret;
-
- INIT_LIST_HEAD(&svm->ir_list);
- spin_lock_init(&svm->ir_list_lock);
- svm->dfr_reg = APIC_DFR_FLAT;
-
- return ret;
-}
-
static int svm_create_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm;
@@ -2404,18 +1324,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
}
-static void svm_vcpu_blocking(struct kvm_vcpu *vcpu)
-{
- avic_set_running(vcpu, false);
-}
-
-static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
-{
- if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu))
- kvm_vcpu_update_apicv(vcpu);
- avic_set_running(vcpu, true);
-}
-
static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -2652,7 +1560,7 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
}
}
-static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -2686,7 +1594,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
update_cr0_intercept(svm);
}
-static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
@@ -3022,776 +1930,6 @@ static int vmmcall_interception(struct vcpu_svm *svm)
return kvm_emulate_hypercall(&svm->vcpu);
}
-static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- return svm->nested.nested_cr3;
-}
-
-static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- u64 cr3 = svm->nested.nested_cr3;
- u64 pdpte;
- int ret;
-
- ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(__sme_clr(cr3)), &pdpte,
- offset_in_page(cr3) + index * 8, 8);
- if (ret)
- return 0;
- return pdpte;
-}
-
-static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
- struct x86_exception *fault)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) {
- /*
- * TODO: track the cause of the nested page fault, and
- * correctly fill in the high bits of exit_info_1.
- */
- svm->vmcb->control.exit_code = SVM_EXIT_NPF;
- svm->vmcb->control.exit_code_hi = 0;
- svm->vmcb->control.exit_info_1 = (1ULL << 32);
- svm->vmcb->control.exit_info_2 = fault->address;
- }
-
- svm->vmcb->control.exit_info_1 &= ~0xffffffffULL;
- svm->vmcb->control.exit_info_1 |= fault->error_code;
-
- /*
- * The present bit is always zero for page structure faults on real
- * hardware.
- */
- if (svm->vmcb->control.exit_info_1 & (2ULL << 32))
- svm->vmcb->control.exit_info_1 &= ~1;
-
- nested_svm_vmexit(svm);
-}
-
-static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
-{
- WARN_ON(mmu_is_nested(vcpu));
-
- vcpu->arch.mmu = &vcpu->arch.guest_mmu;
- kvm_init_shadow_mmu(vcpu);
- vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3;
- vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr;
- vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
- vcpu->arch.mmu->shadow_root_level = get_npt_level(vcpu);
- reset_shadow_zero_bits_mask(vcpu, vcpu->arch.mmu);
- vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
-}
-
-static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.mmu = &vcpu->arch.root_mmu;
- vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
-}
-
-static int nested_svm_check_permissions(struct vcpu_svm *svm)
-{
- if (!(svm->vcpu.arch.efer & EFER_SVME) ||
- !is_paging(&svm->vcpu)) {
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
- return 1;
- }
-
- if (svm->vmcb->save.cpl) {
- kvm_inject_gp(&svm->vcpu, 0);
- return 1;
- }
-
- return 0;
-}
-
-static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
- bool has_error_code, u32 error_code)
-{
- int vmexit;
-
- if (!is_guest_mode(&svm->vcpu))
- return 0;
-
- vmexit = nested_svm_intercept(svm);
- if (vmexit != NESTED_EXIT_DONE)
- return 0;
-
- svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
- svm->vmcb->control.exit_code_hi = 0;
- svm->vmcb->control.exit_info_1 = error_code;
-
- /*
- * EXITINFO2 is undefined for all exception intercepts other
- * than #PF.
- */
- if (svm->vcpu.arch.exception.nested_apf)
- svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
- else if (svm->vcpu.arch.exception.has_payload)
- svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload;
- else
- svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
-
- svm->nested.exit_required = true;
- return vmexit;
-}
-
-static void nested_svm_intr(struct vcpu_svm *svm)
-{
- svm->vmcb->control.exit_code = SVM_EXIT_INTR;
- svm->vmcb->control.exit_info_1 = 0;
- svm->vmcb->control.exit_info_2 = 0;
-
- /* nested_svm_vmexit this gets called afterwards from handle_exit */
- svm->nested.exit_required = true;
- trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
-}
-
-static bool nested_exit_on_intr(struct vcpu_svm *svm)
-{
- return (svm->nested.intercept & 1ULL);
-}
-
-static int svm_check_nested_events(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- bool block_nested_events =
- kvm_event_needs_reinjection(vcpu) || svm->nested.exit_required;
-
- if (kvm_cpu_has_interrupt(vcpu) && nested_exit_on_intr(svm)) {
- if (block_nested_events)
- return -EBUSY;
- nested_svm_intr(svm);
- return 0;
- }
-
- return 0;
-}
-
-/* This function returns true if it is save to enable the nmi window */
-static inline bool nested_svm_nmi(struct vcpu_svm *svm)
-{
- if (!is_guest_mode(&svm->vcpu))
- return true;
-
- if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
- return true;
-
- svm->vmcb->control.exit_code = SVM_EXIT_NMI;
- svm->nested.exit_required = true;
-
- return false;
-}
-
-static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
-{
- unsigned port, size, iopm_len;
- u16 val, mask;
- u8 start_bit;
- u64 gpa;
-
- if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
- return NESTED_EXIT_HOST;
-
- port = svm->vmcb->control.exit_info_1 >> 16;
- size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >>
- SVM_IOIO_SIZE_SHIFT;
- gpa = svm->nested.vmcb_iopm + (port / 8);
- start_bit = port % 8;
- iopm_len = (start_bit + size > 8) ? 2 : 1;
- mask = (0xf >> (4 - size)) << start_bit;
- val = 0;
-
- if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len))
- return NESTED_EXIT_DONE;
-
- return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
-}
-
-static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
-{
- u32 offset, msr, value;
- int write, mask;
-
- if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
- return NESTED_EXIT_HOST;
-
- msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
- offset = svm_msrpm_offset(msr);
- write = svm->vmcb->control.exit_info_1 & 1;
- mask = 1 << ((2 * (msr & 0xf)) + write);
-
- if (offset == MSR_INVALID)
- return NESTED_EXIT_DONE;
-
- /* Offset is in 32 bit units but need in 8 bit units */
- offset *= 4;
-
- if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.vmcb_msrpm + offset, &value, 4))
- return NESTED_EXIT_DONE;
-
- return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
-}
-
-/* DB exceptions for our internal use must not cause vmexit */
-static int nested_svm_intercept_db(struct vcpu_svm *svm)
-{
- unsigned long dr6;
-
- /* if we're not singlestepping, it's not ours */
- if (!svm->nmi_singlestep)
- return NESTED_EXIT_DONE;
-
- /* if it's not a singlestep exception, it's not ours */
- if (kvm_get_dr(&svm->vcpu, 6, &dr6))
- return NESTED_EXIT_DONE;
- if (!(dr6 & DR6_BS))
- return NESTED_EXIT_DONE;
-
- /* if the guest is singlestepping, it should get the vmexit */
- if (svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF) {
- disable_nmi_singlestep(svm);
- return NESTED_EXIT_DONE;
- }
-
- /* it's ours, the nested hypervisor must not see this one */
- return NESTED_EXIT_HOST;
-}
-
-static int nested_svm_exit_special(struct vcpu_svm *svm)
-{
- u32 exit_code = svm->vmcb->control.exit_code;
-
- switch (exit_code) {
- case SVM_EXIT_INTR:
- case SVM_EXIT_NMI:
- case SVM_EXIT_EXCP_BASE + MC_VECTOR:
- return NESTED_EXIT_HOST;
- case SVM_EXIT_NPF:
- /* For now we are always handling NPFs when using them */
- if (npt_enabled)
- return NESTED_EXIT_HOST;
- break;
- case SVM_EXIT_EXCP_BASE + PF_VECTOR:
- /* When we're shadowing, trap PFs, but not async PF */
- if (!npt_enabled && svm->vcpu.arch.apf.host_apf_reason == 0)
- return NESTED_EXIT_HOST;
- break;
- default:
- break;
- }
-
- return NESTED_EXIT_CONTINUE;
-}
-
-static int nested_svm_intercept(struct vcpu_svm *svm)
-{
- u32 exit_code = svm->vmcb->control.exit_code;
- int vmexit = NESTED_EXIT_HOST;
-
- switch (exit_code) {
- case SVM_EXIT_MSR:
- vmexit = nested_svm_exit_handled_msr(svm);
- break;
- case SVM_EXIT_IOIO:
- vmexit = nested_svm_intercept_ioio(svm);
- break;
- case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
- u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
- if (svm->nested.intercept_cr & bit)
- vmexit = NESTED_EXIT_DONE;
- break;
- }
- case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
- u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
- if (svm->nested.intercept_dr & bit)
- vmexit = NESTED_EXIT_DONE;
- break;
- }
- case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
- u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
- if (svm->nested.intercept_exceptions & excp_bits) {
- if (exit_code == SVM_EXIT_EXCP_BASE + DB_VECTOR)
- vmexit = nested_svm_intercept_db(svm);
- else
- vmexit = NESTED_EXIT_DONE;
- }
- /* async page fault always cause vmexit */
- else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
- svm->vcpu.arch.exception.nested_apf != 0)
- vmexit = NESTED_EXIT_DONE;
- break;
- }
- case SVM_EXIT_ERR: {
- vmexit = NESTED_EXIT_DONE;
- break;
- }
- default: {
- u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
- if (svm->nested.intercept & exit_bits)
- vmexit = NESTED_EXIT_DONE;
- }
- }
-
- return vmexit;
-}
-
-static int nested_svm_exit_handled(struct vcpu_svm *svm)
-{
- int vmexit;
-
- vmexit = nested_svm_intercept(svm);
-
- if (vmexit == NESTED_EXIT_DONE)
- nested_svm_vmexit(svm);
-
- return vmexit;
-}
-
-static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb)
-{
- struct vmcb_control_area *dst = &dst_vmcb->control;
- struct vmcb_control_area *from = &from_vmcb->control;
-
- dst->intercept_cr = from->intercept_cr;
- dst->intercept_dr = from->intercept_dr;
- dst->intercept_exceptions = from->intercept_exceptions;
- dst->intercept = from->intercept;
- dst->iopm_base_pa = from->iopm_base_pa;
- dst->msrpm_base_pa = from->msrpm_base_pa;
- dst->tsc_offset = from->tsc_offset;
- dst->asid = from->asid;
- dst->tlb_ctl = from->tlb_ctl;
- dst->int_ctl = from->int_ctl;
- dst->int_vector = from->int_vector;
- dst->int_state = from->int_state;
- dst->exit_code = from->exit_code;
- dst->exit_code_hi = from->exit_code_hi;
- dst->exit_info_1 = from->exit_info_1;
- dst->exit_info_2 = from->exit_info_2;
- dst->exit_int_info = from->exit_int_info;
- dst->exit_int_info_err = from->exit_int_info_err;
- dst->nested_ctl = from->nested_ctl;
- dst->event_inj = from->event_inj;
- dst->event_inj_err = from->event_inj_err;
- dst->nested_cr3 = from->nested_cr3;
- dst->virt_ext = from->virt_ext;
- dst->pause_filter_count = from->pause_filter_count;
- dst->pause_filter_thresh = from->pause_filter_thresh;
-}
-
-static int nested_svm_vmexit(struct vcpu_svm *svm)
-{
- int rc;
- struct vmcb *nested_vmcb;
- struct vmcb *hsave = svm->nested.hsave;
- struct vmcb *vmcb = svm->vmcb;
- struct kvm_host_map map;
-
- trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
- vmcb->control.exit_info_1,
- vmcb->control.exit_info_2,
- vmcb->control.exit_int_info,
- vmcb->control.exit_int_info_err,
- KVM_ISA_SVM);
-
- rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb), &map);
- if (rc) {
- if (rc == -EINVAL)
- kvm_inject_gp(&svm->vcpu, 0);
- return 1;
- }
-
- nested_vmcb = map.hva;
-
- /* Exit Guest-Mode */
- leave_guest_mode(&svm->vcpu);
- svm->nested.vmcb = 0;
-
- /* Give the current vmcb to the guest */
- disable_gif(svm);
-
- nested_vmcb->save.es = vmcb->save.es;
- nested_vmcb->save.cs = vmcb->save.cs;
- nested_vmcb->save.ss = vmcb->save.ss;
- nested_vmcb->save.ds = vmcb->save.ds;
- nested_vmcb->save.gdtr = vmcb->save.gdtr;
- nested_vmcb->save.idtr = vmcb->save.idtr;
- nested_vmcb->save.efer = svm->vcpu.arch.efer;
- nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
- nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu);
- nested_vmcb->save.cr2 = vmcb->save.cr2;
- nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
- nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu);
- nested_vmcb->save.rip = vmcb->save.rip;
- nested_vmcb->save.rsp = vmcb->save.rsp;
- nested_vmcb->save.rax = vmcb->save.rax;
- nested_vmcb->save.dr7 = vmcb->save.dr7;
- nested_vmcb->save.dr6 = vmcb->save.dr6;
- nested_vmcb->save.cpl = vmcb->save.cpl;
-
- nested_vmcb->control.int_ctl = vmcb->control.int_ctl;
- nested_vmcb->control.int_vector = vmcb->control.int_vector;
- nested_vmcb->control.int_state = vmcb->control.int_state;
- nested_vmcb->control.exit_code = vmcb->control.exit_code;
- nested_vmcb->control.exit_code_hi = vmcb->control.exit_code_hi;
- nested_vmcb->control.exit_info_1 = vmcb->control.exit_info_1;
- nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
- nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
- nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
-
- if (svm->nrips_enabled)
- nested_vmcb->control.next_rip = vmcb->control.next_rip;
-
- /*
- * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
- * to make sure that we do not lose injected events. So check event_inj
- * here and copy it to exit_int_info if it is valid.
- * Exit_int_info and event_inj can't be both valid because the case
- * below only happens on a VMRUN instruction intercept which has
- * no valid exit_int_info set.
- */
- if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
- struct vmcb_control_area *nc = &nested_vmcb->control;
-
- nc->exit_int_info = vmcb->control.event_inj;
- nc->exit_int_info_err = vmcb->control.event_inj_err;
- }
-
- nested_vmcb->control.tlb_ctl = 0;
- nested_vmcb->control.event_inj = 0;
- nested_vmcb->control.event_inj_err = 0;
-
- nested_vmcb->control.pause_filter_count =
- svm->vmcb->control.pause_filter_count;
- nested_vmcb->control.pause_filter_thresh =
- svm->vmcb->control.pause_filter_thresh;
-
- /* We always set V_INTR_MASKING and remember the old value in hflags */
- if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
- nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
-
- /* Restore the original control entries */
- copy_vmcb_control_area(vmcb, hsave);
-
- svm->vcpu.arch.tsc_offset = svm->vmcb->control.tsc_offset;
- kvm_clear_exception_queue(&svm->vcpu);
- kvm_clear_interrupt_queue(&svm->vcpu);
-
- svm->nested.nested_cr3 = 0;
-
- /* Restore selected save entries */
- svm->vmcb->save.es = hsave->save.es;
- svm->vmcb->save.cs = hsave->save.cs;
- svm->vmcb->save.ss = hsave->save.ss;
- svm->vmcb->save.ds = hsave->save.ds;
- svm->vmcb->save.gdtr = hsave->save.gdtr;
- svm->vmcb->save.idtr = hsave->save.idtr;
- kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
- svm_set_efer(&svm->vcpu, hsave->save.efer);
- svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
- svm_set_cr4(&svm->vcpu, hsave->save.cr4);
- if (npt_enabled) {
- svm->vmcb->save.cr3 = hsave->save.cr3;
- svm->vcpu.arch.cr3 = hsave->save.cr3;
- } else {
- (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
- }
- kvm_rax_write(&svm->vcpu, hsave->save.rax);
- kvm_rsp_write(&svm->vcpu, hsave->save.rsp);
- kvm_rip_write(&svm->vcpu, hsave->save.rip);
- svm->vmcb->save.dr7 = 0;
- svm->vmcb->save.cpl = 0;
- svm->vmcb->control.exit_int_info = 0;
-
- mark_all_dirty(svm->vmcb);
-
- kvm_vcpu_unmap(&svm->vcpu, &map, true);
-
- nested_svm_uninit_mmu_context(&svm->vcpu);
- kvm_mmu_reset_context(&svm->vcpu);
- kvm_mmu_load(&svm->vcpu);
-
- /*
- * Drop what we picked up for L2 via svm_complete_interrupts() so it
- * doesn't end up in L1.
- */
- svm->vcpu.arch.nmi_injected = false;
- kvm_clear_exception_queue(&svm->vcpu);
- kvm_clear_interrupt_queue(&svm->vcpu);
-
- return 0;
-}
-
-static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
-{
- /*
- * This function merges the msr permission bitmaps of kvm and the
- * nested vmcb. It is optimized in that it only merges the parts where
- * the kvm msr permission bitmap may contain zero bits
- */
- int i;
-
- if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
- return true;
-
- for (i = 0; i < MSRPM_OFFSETS; i++) {
- u32 value, p;
- u64 offset;
-
- if (msrpm_offsets[i] == 0xffffffff)
- break;
-
- p = msrpm_offsets[i];
- offset = svm->nested.vmcb_msrpm + (p * 4);
-
- if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))
- return false;
-
- svm->nested.msrpm[p] = svm->msrpm[p] | value;
- }
-
- svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm));
-
- return true;
-}
-
-static bool nested_vmcb_checks(struct vmcb *vmcb)
-{
- if ((vmcb->save.efer & EFER_SVME) == 0)
- return false;
-
- if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
- return false;
-
- if (vmcb->control.asid == 0)
- return false;
-
- if ((vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) &&
- !npt_enabled)
- return false;
-
- return true;
-}
-
-static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
- struct vmcb *nested_vmcb, struct kvm_host_map *map)
-{
- bool evaluate_pending_interrupts =
- is_intercept(svm, INTERCEPT_VINTR) ||
- is_intercept(svm, INTERCEPT_IRET);
-
- if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
- svm->vcpu.arch.hflags |= HF_HIF_MASK;
- else
- svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
-
- if (nested_vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) {
- svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
- nested_svm_init_mmu_context(&svm->vcpu);
- }
-
- /* Load the nested guest state */
- svm->vmcb->save.es = nested_vmcb->save.es;
- svm->vmcb->save.cs = nested_vmcb->save.cs;
- svm->vmcb->save.ss = nested_vmcb->save.ss;
- svm->vmcb->save.ds = nested_vmcb->save.ds;
- svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
- svm->vmcb->save.idtr = nested_vmcb->save.idtr;
- kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags);
- svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
- svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
- svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
- if (npt_enabled) {
- svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
- svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
- } else
- (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
-
- /* Guest paging mode is active - reset mmu */
- kvm_mmu_reset_context(&svm->vcpu);
-
- svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
- kvm_rax_write(&svm->vcpu, nested_vmcb->save.rax);
- kvm_rsp_write(&svm->vcpu, nested_vmcb->save.rsp);
- kvm_rip_write(&svm->vcpu, nested_vmcb->save.rip);
-
- /* In case we don't even reach vcpu_run, the fields are not updated */
- svm->vmcb->save.rax = nested_vmcb->save.rax;
- svm->vmcb->save.rsp = nested_vmcb->save.rsp;
- svm->vmcb->save.rip = nested_vmcb->save.rip;
- svm->vmcb->save.dr7 = nested_vmcb->save.dr7;
- svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
- svm->vmcb->save.cpl = nested_vmcb->save.cpl;
-
- svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
- svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL;
-
- /* cache intercepts */
- svm->nested.intercept_cr = nested_vmcb->control.intercept_cr;
- svm->nested.intercept_dr = nested_vmcb->control.intercept_dr;
- svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
- svm->nested.intercept = nested_vmcb->control.intercept;
-
- svm_flush_tlb(&svm->vcpu, true);
- svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
- if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
- svm->vcpu.arch.hflags |= HF_VINTR_MASK;
- else
- svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
-
- svm->vcpu.arch.tsc_offset += nested_vmcb->control.tsc_offset;
- svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset;
-
- svm->vmcb->control.virt_ext = nested_vmcb->control.virt_ext;
- svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
- svm->vmcb->control.int_state = nested_vmcb->control.int_state;
- svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
- svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
-
- svm->vmcb->control.pause_filter_count =
- nested_vmcb->control.pause_filter_count;
- svm->vmcb->control.pause_filter_thresh =
- nested_vmcb->control.pause_filter_thresh;
-
- kvm_vcpu_unmap(&svm->vcpu, map, true);
-
- /* Enter Guest-Mode */
- enter_guest_mode(&svm->vcpu);
-
- /*
- * Merge guest and host intercepts - must be called with vcpu in
- * guest-mode to take affect here
- */
- recalc_intercepts(svm);
-
- svm->nested.vmcb = vmcb_gpa;
-
- /*
- * If L1 had a pending IRQ/NMI before executing VMRUN,
- * which wasn't delivered because it was disallowed (e.g.
- * interrupts disabled), L0 needs to evaluate if this pending
- * event should cause an exit from L2 to L1 or be delivered
- * directly to L2.
- *
- * Usually this would be handled by the processor noticing an
- * IRQ/NMI window request. However, VMRUN can unblock interrupts
- * by implicitly setting GIF, so force L0 to perform pending event
- * evaluation by requesting a KVM_REQ_EVENT.
- */
- enable_gif(svm);
- if (unlikely(evaluate_pending_interrupts))
- kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
-
- mark_all_dirty(svm->vmcb);
-}
-
-static int nested_svm_vmrun(struct vcpu_svm *svm)
-{
- int ret;
- struct vmcb *nested_vmcb;
- struct vmcb *hsave = svm->nested.hsave;
- struct vmcb *vmcb = svm->vmcb;
- struct kvm_host_map map;
- u64 vmcb_gpa;
-
- vmcb_gpa = svm->vmcb->save.rax;
-
- ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map);
- if (ret == -EINVAL) {
- kvm_inject_gp(&svm->vcpu, 0);
- return 1;
- } else if (ret) {
- return kvm_skip_emulated_instruction(&svm->vcpu);
- }
-
- ret = kvm_skip_emulated_instruction(&svm->vcpu);
-
- nested_vmcb = map.hva;
-
- if (!nested_vmcb_checks(nested_vmcb)) {
- nested_vmcb->control.exit_code = SVM_EXIT_ERR;
- nested_vmcb->control.exit_code_hi = 0;
- nested_vmcb->control.exit_info_1 = 0;
- nested_vmcb->control.exit_info_2 = 0;
-
- kvm_vcpu_unmap(&svm->vcpu, &map, true);
-
- return ret;
- }
-
- trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
- nested_vmcb->save.rip,
- nested_vmcb->control.int_ctl,
- nested_vmcb->control.event_inj,
- nested_vmcb->control.nested_ctl);
-
- trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
- nested_vmcb->control.intercept_cr >> 16,
- nested_vmcb->control.intercept_exceptions,
- nested_vmcb->control.intercept);
-
- /* Clear internal status */
- kvm_clear_exception_queue(&svm->vcpu);
- kvm_clear_interrupt_queue(&svm->vcpu);
-
- /*
- * Save the old vmcb, so we don't need to pick what we save, but can
- * restore everything when a VMEXIT occurs
- */
- hsave->save.es = vmcb->save.es;
- hsave->save.cs = vmcb->save.cs;
- hsave->save.ss = vmcb->save.ss;
- hsave->save.ds = vmcb->save.ds;
- hsave->save.gdtr = vmcb->save.gdtr;
- hsave->save.idtr = vmcb->save.idtr;
- hsave->save.efer = svm->vcpu.arch.efer;
- hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
- hsave->save.cr4 = svm->vcpu.arch.cr4;
- hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
- hsave->save.rip = kvm_rip_read(&svm->vcpu);
- hsave->save.rsp = vmcb->save.rsp;
- hsave->save.rax = vmcb->save.rax;
- if (npt_enabled)
- hsave->save.cr3 = vmcb->save.cr3;
- else
- hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
-
- copy_vmcb_control_area(hsave, vmcb);
-
- enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, &map);
-
- if (!nested_svm_vmrun_msrpm(svm)) {
- svm->vmcb->control.exit_code = SVM_EXIT_ERR;
- svm->vmcb->control.exit_code_hi = 0;
- svm->vmcb->control.exit_info_1 = 0;
- svm->vmcb->control.exit_info_2 = 0;
-
- nested_svm_vmexit(svm);
- }
-
- return ret;
-}
-
-static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
-{
- to_vmcb->save.fs = from_vmcb->save.fs;
- to_vmcb->save.gs = from_vmcb->save.gs;
- to_vmcb->save.tr = from_vmcb->save.tr;
- to_vmcb->save.ldtr = from_vmcb->save.ldtr;
- to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
- to_vmcb->save.star = from_vmcb->save.star;
- to_vmcb->save.lstar = from_vmcb->save.lstar;
- to_vmcb->save.cstar = from_vmcb->save.cstar;
- to_vmcb->save.sfmask = from_vmcb->save.sfmask;
- to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
- to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
- to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
-}
-
static int vmload_interception(struct vcpu_svm *svm)
{
struct vmcb *nested_vmcb;
@@ -4565,276 +2703,6 @@ static int mwait_interception(struct vcpu_svm *svm)
return nop_interception(svm);
}
-enum avic_ipi_failure_cause {
- AVIC_IPI_FAILURE_INVALID_INT_TYPE,
- AVIC_IPI_FAILURE_TARGET_NOT_RUNNING,
- AVIC_IPI_FAILURE_INVALID_TARGET,
- AVIC_IPI_FAILURE_INVALID_BACKING_PAGE,
-};
-
-static int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
-{
- u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
- u32 icrl = svm->vmcb->control.exit_info_1;
- u32 id = svm->vmcb->control.exit_info_2 >> 32;
- u32 index = svm->vmcb->control.exit_info_2 & 0xFF;
- struct kvm_lapic *apic = svm->vcpu.arch.apic;
-
- trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index);
-
- switch (id) {
- case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
- /*
- * AVIC hardware handles the generation of
- * IPIs when the specified Message Type is Fixed
- * (also known as fixed delivery mode) and
- * the Trigger Mode is edge-triggered. The hardware
- * also supports self and broadcast delivery modes
- * specified via the Destination Shorthand(DSH)
- * field of the ICRL. Logical and physical APIC ID
- * formats are supported. All other IPI types cause
- * a #VMEXIT, which needs to emulated.
- */
- kvm_lapic_reg_write(apic, APIC_ICR2, icrh);
- kvm_lapic_reg_write(apic, APIC_ICR, icrl);
- break;
- case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: {
- int i;
- struct kvm_vcpu *vcpu;
- struct kvm *kvm = svm->vcpu.kvm;
- struct kvm_lapic *apic = svm->vcpu.arch.apic;
-
- /*
- * At this point, we expect that the AVIC HW has already
- * set the appropriate IRR bits on the valid target
- * vcpus. So, we just need to kick the appropriate vcpu.
- */
- kvm_for_each_vcpu(i, vcpu, kvm) {
- bool m = kvm_apic_match_dest(vcpu, apic,
- icrl & APIC_SHORT_MASK,
- GET_APIC_DEST_FIELD(icrh),
- icrl & APIC_DEST_MASK);
-
- if (m && !avic_vcpu_is_running(vcpu))
- kvm_vcpu_wake_up(vcpu);
- }
- break;
- }
- case AVIC_IPI_FAILURE_INVALID_TARGET:
- WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n",
- index, svm->vcpu.vcpu_id, icrh, icrl);
- break;
- case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
- WARN_ONCE(1, "Invalid backing page\n");
- break;
- default:
- pr_err("Unknown IPI interception\n");
- }
-
- return 1;
-}
-
-static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
-{
- struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
- int index;
- u32 *logical_apic_id_table;
- int dlid = GET_APIC_LOGICAL_ID(ldr);
-
- if (!dlid)
- return NULL;
-
- if (flat) { /* flat */
- index = ffs(dlid) - 1;
- if (index > 7)
- return NULL;
- } else { /* cluster */
- int cluster = (dlid & 0xf0) >> 4;
- int apic = ffs(dlid & 0x0f) - 1;
-
- if ((apic < 0) || (apic > 7) ||
- (cluster >= 0xf))
- return NULL;
- index = (cluster << 2) + apic;
- }
-
- logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page);
-
- return &logical_apic_id_table[index];
-}
-
-static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
-{
- bool flat;
- u32 *entry, new_entry;
-
- flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
- entry = avic_get_logical_id_entry(vcpu, ldr, flat);
- if (!entry)
- return -EINVAL;
-
- new_entry = READ_ONCE(*entry);
- new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
- new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
- new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
- WRITE_ONCE(*entry, new_entry);
-
- return 0;
-}
-
-static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- bool flat = svm->dfr_reg == APIC_DFR_FLAT;
- u32 *entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
-
- if (entry)
- clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry);
-}
-
-static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
-{
- int ret = 0;
- struct vcpu_svm *svm = to_svm(vcpu);
- u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
- u32 id = kvm_xapic_id(vcpu->arch.apic);
-
- if (ldr == svm->ldr_reg)
- return 0;
-
- avic_invalidate_logical_id_entry(vcpu);
-
- if (ldr)
- ret = avic_ldr_write(vcpu, id, ldr);
-
- if (!ret)
- svm->ldr_reg = ldr;
-
- return ret;
-}
-
-static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu)
-{
- u64 *old, *new;
- struct vcpu_svm *svm = to_svm(vcpu);
- u32 id = kvm_xapic_id(vcpu->arch.apic);
-
- if (vcpu->vcpu_id == id)
- return 0;
-
- old = avic_get_physical_id_entry(vcpu, vcpu->vcpu_id);
- new = avic_get_physical_id_entry(vcpu, id);
- if (!new || !old)
- return 1;
-
- /* We need to move physical_id_entry to new offset */
- *new = *old;
- *old = 0ULL;
- to_svm(vcpu)->avic_physical_id_cache = new;
-
- /*
- * Also update the guest physical APIC ID in the logical
- * APIC ID table entry if already setup the LDR.
- */
- if (svm->ldr_reg)
- avic_handle_ldr_update(vcpu);
-
- return 0;
-}
-
-static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
-
- if (svm->dfr_reg == dfr)
- return;
-
- avic_invalidate_logical_id_entry(vcpu);
- svm->dfr_reg = dfr;
-}
-
-static int avic_unaccel_trap_write(struct vcpu_svm *svm)
-{
- struct kvm_lapic *apic = svm->vcpu.arch.apic;
- u32 offset = svm->vmcb->control.exit_info_1 &
- AVIC_UNACCEL_ACCESS_OFFSET_MASK;
-
- switch (offset) {
- case APIC_ID:
- if (avic_handle_apic_id_update(&svm->vcpu))
- return 0;
- break;
- case APIC_LDR:
- if (avic_handle_ldr_update(&svm->vcpu))
- return 0;
- break;
- case APIC_DFR:
- avic_handle_dfr_update(&svm->vcpu);
- break;
- default:
- break;
- }
-
- kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
-
- return 1;
-}
-
-static bool is_avic_unaccelerated_access_trap(u32 offset)
-{
- bool ret = false;
-
- switch (offset) {
- case APIC_ID:
- case APIC_EOI:
- case APIC_RRR:
- case APIC_LDR:
- case APIC_DFR:
- case APIC_SPIV:
- case APIC_ESR:
- case APIC_ICR:
- case APIC_LVTT:
- case APIC_LVTTHMR:
- case APIC_LVTPC:
- case APIC_LVT0:
- case APIC_LVT1:
- case APIC_LVTERR:
- case APIC_TMICT:
- case APIC_TDCR:
- ret = true;
- break;
- default:
- break;
- }
- return ret;
-}
-
-static int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
-{
- int ret = 0;
- u32 offset = svm->vmcb->control.exit_info_1 &
- AVIC_UNACCEL_ACCESS_OFFSET_MASK;
- u32 vector = svm->vmcb->control.exit_info_2 &
- AVIC_UNACCEL_ACCESS_VECTOR_MASK;
- bool write = (svm->vmcb->control.exit_info_1 >> 32) &
- AVIC_UNACCEL_ACCESS_WRITE_MASK;
- bool trap = is_avic_unaccelerated_access_trap(offset);
-
- trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset,
- trap, write, vector);
- if (trap) {
- /* Handling Trap */
- WARN_ONCE(!write, "svm: Handling trap read.\n");
- ret = avic_unaccel_trap_write(svm);
- } else {
- /* Handling Fault */
- ret = kvm_emulate_instruction(&svm->vcpu, 0);
- }
-
- return ret;
-}
-
static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_READ_CR0] = cr_interception,
[SVM_EXIT_READ_CR3] = cr_interception,
@@ -5125,30 +2993,6 @@ static void reload_tss(struct kvm_vcpu *vcpu)
load_TR_desc();
}
-static void pre_sev_run(struct vcpu_svm *svm, int cpu)
-{
- struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
- int asid = sev_get_asid(svm->vcpu.kvm);
-
- /* Assign the asid allocated with this SEV guest */
- svm->vmcb->control.asid = asid;
-
- /*
- * Flush guest TLB:
- *
- * 1) when different VMCB for the same ASID is to be run on the same host CPU.
- * 2) or this VMCB was executed on different host CPU in previous VMRUNs.
- */
- if (sd->sev_vmcbs[asid] == svm->vmcb &&
- svm->last_cpu == cpu)
- return;
-
- svm->last_cpu = cpu;
- sd->sev_vmcbs[asid] = svm->vmcb;
- svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
- mark_dirty(svm->vmcb, VMCB_ASID);
-}
-
static void pre_svm_run(struct vcpu_svm *svm)
{
int cpu = raw_smp_processor_id();
@@ -5186,11 +3030,6 @@ static void svm_set_irq(struct kvm_vcpu *vcpu)
SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
}
-static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu)
-{
- return is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK);
-}
-
static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -5207,324 +3046,6 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
}
-static void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
-{
- return;
-}
-
-static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
-{
-}
-
-static void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
-{
-}
-
-static void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate)
-{
- if (!avic || !lapic_in_kernel(vcpu))
- return;
-
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
- kvm_request_apicv_update(vcpu->kvm, activate,
- APICV_INHIBIT_REASON_IRQWIN);
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-}
-
-static int svm_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
-{
- int ret = 0;
- unsigned long flags;
- struct amd_svm_iommu_ir *ir;
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (!kvm_arch_has_assigned_device(vcpu->kvm))
- return 0;
-
- /*
- * Here, we go through the per-vcpu ir_list to update all existing
- * interrupt remapping table entry targeting this vcpu.
- */
- spin_lock_irqsave(&svm->ir_list_lock, flags);
-
- if (list_empty(&svm->ir_list))
- goto out;
-
- list_for_each_entry(ir, &svm->ir_list, node) {
- if (activate)
- ret = amd_iommu_activate_guest_mode(ir->data);
- else
- ret = amd_iommu_deactivate_guest_mode(ir->data);
- if (ret)
- break;
- }
-out:
- spin_unlock_irqrestore(&svm->ir_list_lock, flags);
- return ret;
-}
-
-static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb *vmcb = svm->vmcb;
- bool activated = kvm_vcpu_apicv_active(vcpu);
-
- if (!avic)
- return;
-
- if (activated) {
- /**
- * During AVIC temporary deactivation, guest could update
- * APIC ID, DFR and LDR registers, which would not be trapped
- * by avic_unaccelerated_access_interception(). In this case,
- * we need to check and update the AVIC logical APIC ID table
- * accordingly before re-activating.
- */
- avic_post_state_restore(vcpu);
- vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
- } else {
- vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
- }
- mark_dirty(vmcb, VMCB_AVIC);
-
- svm_set_pi_irte_mode(vcpu, activated);
-}
-
-static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
-{
- return;
-}
-
-static int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
-{
- if (!vcpu->arch.apicv_active)
- return -1;
-
- kvm_lapic_set_irr(vec, vcpu->arch.apic);
- smp_mb__after_atomic();
-
- if (avic_vcpu_is_running(vcpu)) {
- int cpuid = vcpu->cpu;
-
- if (cpuid != get_cpu())
- wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpuid));
- put_cpu();
- } else
- kvm_vcpu_wake_up(vcpu);
-
- return 0;
-}
-
-static bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
-{
- return false;
-}
-
-static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
-{
- unsigned long flags;
- struct amd_svm_iommu_ir *cur;
-
- spin_lock_irqsave(&svm->ir_list_lock, flags);
- list_for_each_entry(cur, &svm->ir_list, node) {
- if (cur->data != pi->ir_data)
- continue;
- list_del(&cur->node);
- kfree(cur);
- break;
- }
- spin_unlock_irqrestore(&svm->ir_list_lock, flags);
-}
-
-static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
-{
- int ret = 0;
- unsigned long flags;
- struct amd_svm_iommu_ir *ir;
-
- /**
- * In some cases, the existing irte is updaed and re-set,
- * so we need to check here if it's already been * added
- * to the ir_list.
- */
- if (pi->ir_data && (pi->prev_ga_tag != 0)) {
- struct kvm *kvm = svm->vcpu.kvm;
- u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
- struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
- struct vcpu_svm *prev_svm;
-
- if (!prev_vcpu) {
- ret = -EINVAL;
- goto out;
- }
-
- prev_svm = to_svm(prev_vcpu);
- svm_ir_list_del(prev_svm, pi);
- }
-
- /**
- * Allocating new amd_iommu_pi_data, which will get
- * add to the per-vcpu ir_list.
- */
- ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT);
- if (!ir) {
- ret = -ENOMEM;
- goto out;
- }
- ir->data = pi->ir_data;
-
- spin_lock_irqsave(&svm->ir_list_lock, flags);
- list_add(&ir->node, &svm->ir_list);
- spin_unlock_irqrestore(&svm->ir_list_lock, flags);
-out:
- return ret;
-}
-
-/**
- * Note:
- * The HW cannot support posting multicast/broadcast
- * interrupts to a vCPU. So, we still use legacy interrupt
- * remapping for these kind of interrupts.
- *
- * For lowest-priority interrupts, we only support
- * those with single CPU as the destination, e.g. user
- * configures the interrupts via /proc/irq or uses
- * irqbalance to make the interrupts single-CPU.
- */
-static int
-get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
- struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
-{
- struct kvm_lapic_irq irq;
- struct kvm_vcpu *vcpu = NULL;
-
- kvm_set_msi_irq(kvm, e, &irq);
-
- if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
- !kvm_irq_is_postable(&irq)) {
- pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
- __func__, irq.vector);
- return -1;
- }
-
- pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
- irq.vector);
- *svm = to_svm(vcpu);
- vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page));
- vcpu_info->vector = irq.vector;
-
- return 0;
-}
-
-/*
- * svm_update_pi_irte - set IRTE for Posted-Interrupts
- *
- * @kvm: kvm
- * @host_irq: host irq of the interrupt
- * @guest_irq: gsi of the interrupt
- * @set: set or unset PI
- * returns 0 on success, < 0 on failure
- */
-static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set)
-{
- struct kvm_kernel_irq_routing_entry *e;
- struct kvm_irq_routing_table *irq_rt;
- int idx, ret = -EINVAL;
-
- if (!kvm_arch_has_assigned_device(kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP))
- return 0;
-
- pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
- __func__, host_irq, guest_irq, set);
-
- idx = srcu_read_lock(&kvm->irq_srcu);
- irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
- WARN_ON(guest_irq >= irq_rt->nr_rt_entries);
-
- hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
- struct vcpu_data vcpu_info;
- struct vcpu_svm *svm = NULL;
-
- if (e->type != KVM_IRQ_ROUTING_MSI)
- continue;
-
- /**
- * Here, we setup with legacy mode in the following cases:
- * 1. When cannot target interrupt to a specific vcpu.
- * 2. Unsetting posted interrupt.
- * 3. APIC virtialization is disabled for the vcpu.
- * 4. IRQ has incompatible delivery mode (SMI, INIT, etc)
- */
- if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
- kvm_vcpu_apicv_active(&svm->vcpu)) {
- struct amd_iommu_pi_data pi;
-
- /* Try to enable guest_mode in IRTE */
- pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
- AVIC_HPA_MASK);
- pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
- svm->vcpu.vcpu_id);
- pi.is_guest_mode = true;
- pi.vcpu_data = &vcpu_info;
- ret = irq_set_vcpu_affinity(host_irq, &pi);
-
- /**
- * Here, we successfully setting up vcpu affinity in
- * IOMMU guest mode. Now, we need to store the posted
- * interrupt information in a per-vcpu ir_list so that
- * we can reference to them directly when we update vcpu
- * scheduling information in IOMMU irte.
- */
- if (!ret && pi.is_guest_mode)
- svm_ir_list_add(svm, &pi);
- } else {
- /* Use legacy mode in IRTE */
- struct amd_iommu_pi_data pi;
-
- /**
- * Here, pi is used to:
- * - Tell IOMMU to use legacy mode for this interrupt.
- * - Retrieve ga_tag of prior interrupt remapping data.
- */
- pi.is_guest_mode = false;
- ret = irq_set_vcpu_affinity(host_irq, &pi);
-
- /**
- * Check if the posted interrupt was previously
- * setup with the guest_mode by checking if the ga_tag
- * was cached. If so, we need to clean up the per-vcpu
- * ir_list.
- */
- if (!ret && pi.prev_ga_tag) {
- int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
- struct kvm_vcpu *vcpu;
-
- vcpu = kvm_get_vcpu_by_id(kvm, id);
- if (vcpu)
- svm_ir_list_del(to_svm(vcpu), &pi);
- }
- }
-
- if (!ret && svm) {
- trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id,
- e->gsi, vcpu_info.vector,
- vcpu_info.pi_desc_addr, set);
- }
-
- if (ret < 0) {
- pr_err("%s: failed to update PI IRTE\n", __func__);
- goto out;
- }
- }
-
- ret = 0;
-out:
- srcu_read_unlock(&kvm->irq_srcu, idx);
- return ret;
-}
-
static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -5632,7 +3153,7 @@ static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
return 0;
}
-static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
+void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -5755,6 +3276,8 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
svm_complete_interrupts(svm);
}
+bool __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs);
+
static void svm_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -5809,95 +3332,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
local_irq_enable();
- asm volatile (
- "push %%" _ASM_BP "; \n\t"
- "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
- "mov %c[rcx](%[svm]), %%" _ASM_CX " \n\t"
- "mov %c[rdx](%[svm]), %%" _ASM_DX " \n\t"
- "mov %c[rsi](%[svm]), %%" _ASM_SI " \n\t"
- "mov %c[rdi](%[svm]), %%" _ASM_DI " \n\t"
- "mov %c[rbp](%[svm]), %%" _ASM_BP " \n\t"
-#ifdef CONFIG_X86_64
- "mov %c[r8](%[svm]), %%r8 \n\t"
- "mov %c[r9](%[svm]), %%r9 \n\t"
- "mov %c[r10](%[svm]), %%r10 \n\t"
- "mov %c[r11](%[svm]), %%r11 \n\t"
- "mov %c[r12](%[svm]), %%r12 \n\t"
- "mov %c[r13](%[svm]), %%r13 \n\t"
- "mov %c[r14](%[svm]), %%r14 \n\t"
- "mov %c[r15](%[svm]), %%r15 \n\t"
-#endif
-
- /* Enter guest mode */
- "push %%" _ASM_AX " \n\t"
- "mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t"
- __ex("vmload %%" _ASM_AX) "\n\t"
- __ex("vmrun %%" _ASM_AX) "\n\t"
- __ex("vmsave %%" _ASM_AX) "\n\t"
- "pop %%" _ASM_AX " \n\t"
-
- /* Save guest registers, load host registers */
- "mov %%" _ASM_BX ", %c[rbx](%[svm]) \n\t"
- "mov %%" _ASM_CX ", %c[rcx](%[svm]) \n\t"
- "mov %%" _ASM_DX ", %c[rdx](%[svm]) \n\t"
- "mov %%" _ASM_SI ", %c[rsi](%[svm]) \n\t"
- "mov %%" _ASM_DI ", %c[rdi](%[svm]) \n\t"
- "mov %%" _ASM_BP ", %c[rbp](%[svm]) \n\t"
-#ifdef CONFIG_X86_64
- "mov %%r8, %c[r8](%[svm]) \n\t"
- "mov %%r9, %c[r9](%[svm]) \n\t"
- "mov %%r10, %c[r10](%[svm]) \n\t"
- "mov %%r11, %c[r11](%[svm]) \n\t"
- "mov %%r12, %c[r12](%[svm]) \n\t"
- "mov %%r13, %c[r13](%[svm]) \n\t"
- "mov %%r14, %c[r14](%[svm]) \n\t"
- "mov %%r15, %c[r15](%[svm]) \n\t"
- /*
- * Clear host registers marked as clobbered to prevent
- * speculative use.
- */
- "xor %%r8d, %%r8d \n\t"
- "xor %%r9d, %%r9d \n\t"
- "xor %%r10d, %%r10d \n\t"
- "xor %%r11d, %%r11d \n\t"
- "xor %%r12d, %%r12d \n\t"
- "xor %%r13d, %%r13d \n\t"
- "xor %%r14d, %%r14d \n\t"
- "xor %%r15d, %%r15d \n\t"
-#endif
- "xor %%ebx, %%ebx \n\t"
- "xor %%ecx, %%ecx \n\t"
- "xor %%edx, %%edx \n\t"
- "xor %%esi, %%esi \n\t"
- "xor %%edi, %%edi \n\t"
- "pop %%" _ASM_BP
- :
- : [svm]"a"(svm),
- [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
- [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
- [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
- [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
- [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
- [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
- [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
-#ifdef CONFIG_X86_64
- , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
- [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
- [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
- [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
- [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
- [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
- [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
- [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
-#endif
- : "cc", "memory"
-#ifdef CONFIG_X86_64
- , "rbx", "rcx", "rdx", "rsi", "rdi"
- , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
-#else
- , "ebx", "ecx", "edx", "esi", "edi"
-#endif
- );
+ __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);
/* Eliminate branch target predictions from guest mode */
vmexit_fill_RSB();
@@ -6292,14 +3727,6 @@ static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
shrink_ple_window(vcpu);
}
-static inline void avic_post_state_restore(struct kvm_vcpu *vcpu)
-{
- if (avic_handle_apic_id_update(vcpu) != 0)
- return;
- avic_handle_dfr_update(vcpu);
- avic_handle_ldr_update(vcpu);
-}
-
static void svm_setup_mce(struct kvm_vcpu *vcpu)
{
/* [63:9] are reserved. */
@@ -6380,900 +3807,6 @@ static int enable_smi_window(struct kvm_vcpu *vcpu)
return 0;
}
-static int sev_flush_asids(void)
-{
- int ret, error;
-
- /*
- * DEACTIVATE will clear the WBINVD indicator causing DF_FLUSH to fail,
- * so it must be guarded.
- */
- down_write(&sev_deactivate_lock);
-
- wbinvd_on_all_cpus();
- ret = sev_guest_df_flush(&error);
-
- up_write(&sev_deactivate_lock);
-
- if (ret)
- pr_err("SEV: DF_FLUSH failed, ret=%d, error=%#x\n", ret, error);
-
- return ret;
-}
-
-/* Must be called with the sev_bitmap_lock held */
-static bool __sev_recycle_asids(void)
-{
- int pos;
-
- /* Check if there are any ASIDs to reclaim before performing a flush */
- pos = find_next_bit(sev_reclaim_asid_bitmap,
- max_sev_asid, min_sev_asid - 1);
- if (pos >= max_sev_asid)
- return false;
-
- if (sev_flush_asids())
- return false;
-
- bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap,
- max_sev_asid);
- bitmap_zero(sev_reclaim_asid_bitmap, max_sev_asid);
-
- return true;
-}
-
-static int sev_asid_new(void)
-{
- bool retry = true;
- int pos;
-
- mutex_lock(&sev_bitmap_lock);
-
- /*
- * SEV-enabled guest must use asid from min_sev_asid to max_sev_asid.
- */
-again:
- pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_sev_asid - 1);
- if (pos >= max_sev_asid) {
- if (retry && __sev_recycle_asids()) {
- retry = false;
- goto again;
- }
- mutex_unlock(&sev_bitmap_lock);
- return -EBUSY;
- }
-
- __set_bit(pos, sev_asid_bitmap);
-
- mutex_unlock(&sev_bitmap_lock);
-
- return pos + 1;
-}
-
-static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
-{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- int asid, ret;
-
- ret = -EBUSY;
- if (unlikely(sev->active))
- return ret;
-
- asid = sev_asid_new();
- if (asid < 0)
- return ret;
-
- ret = sev_platform_init(&argp->error);
- if (ret)
- goto e_free;
-
- sev->active = true;
- sev->asid = asid;
- INIT_LIST_HEAD(&sev->regions_list);
-
- return 0;
-
-e_free:
- sev_asid_free(asid);
- return ret;
-}
-
-static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
-{
- struct sev_data_activate *data;
- int asid = sev_get_asid(kvm);
- int ret;
-
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- return -ENOMEM;
-
- /* activate ASID on the given handle */
- data->handle = handle;
- data->asid = asid;
- ret = sev_guest_activate(data, error);
- kfree(data);
-
- return ret;
-}
-
-static int __sev_issue_cmd(int fd, int id, void *data, int *error)
-{
- struct fd f;
- int ret;
-
- f = fdget(fd);
- if (!f.file)
- return -EBADF;
-
- ret = sev_issue_cmd_external_user(f.file, id, data, error);
-
- fdput(f);
- return ret;
-}
-
-static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error)
-{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-
- return __sev_issue_cmd(sev->fd, id, data, error);
-}
-
-static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
-{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct sev_data_launch_start *start;
- struct kvm_sev_launch_start params;
- void *dh_blob, *session_blob;
- int *error = &argp->error;
- int ret;
-
- if (!sev_guest(kvm))
- return -ENOTTY;
-
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
- return -EFAULT;
-
- start = kzalloc(sizeof(*start), GFP_KERNEL_ACCOUNT);
- if (!start)
- return -ENOMEM;
-
- dh_blob = NULL;
- if (params.dh_uaddr) {
- dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len);
- if (IS_ERR(dh_blob)) {
- ret = PTR_ERR(dh_blob);
- goto e_free;
- }
-
- start->dh_cert_address = __sme_set(__pa(dh_blob));
- start->dh_cert_len = params.dh_len;
- }
-
- session_blob = NULL;
- if (params.session_uaddr) {
- session_blob = psp_copy_user_blob(params.session_uaddr, params.session_len);
- if (IS_ERR(session_blob)) {
- ret = PTR_ERR(session_blob);
- goto e_free_dh;
- }
-
- start->session_address = __sme_set(__pa(session_blob));
- start->session_len = params.session_len;
- }
-
- start->handle = params.handle;
- start->policy = params.policy;
-
- /* create memory encryption context */
- ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, start, error);
- if (ret)
- goto e_free_session;
-
- /* Bind ASID to this guest */
- ret = sev_bind_asid(kvm, start->handle, error);
- if (ret)
- goto e_free_session;
-
- /* return handle to userspace */
- params.handle = start->handle;
- if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params))) {
- sev_unbind_asid(kvm, start->handle);
- ret = -EFAULT;
- goto e_free_session;
- }
-
- sev->handle = start->handle;
- sev->fd = argp->sev_fd;
-
-e_free_session:
- kfree(session_blob);
-e_free_dh:
- kfree(dh_blob);
-e_free:
- kfree(start);
- return ret;
-}
-
-static unsigned long get_num_contig_pages(unsigned long idx,
- struct page **inpages, unsigned long npages)
-{
- unsigned long paddr, next_paddr;
- unsigned long i = idx + 1, pages = 1;
-
- /* find the number of contiguous pages starting from idx */
- paddr = __sme_page_pa(inpages[idx]);
- while (i < npages) {
- next_paddr = __sme_page_pa(inpages[i++]);
- if ((paddr + PAGE_SIZE) == next_paddr) {
- pages++;
- paddr = next_paddr;
- continue;
- }
- break;
- }
-
- return pages;
-}
-
-static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
-{
- unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i;
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct kvm_sev_launch_update_data params;
- struct sev_data_launch_update_data *data;
- struct page **inpages;
- int ret;
-
- if (!sev_guest(kvm))
- return -ENOTTY;
-
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
- return -EFAULT;
-
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- return -ENOMEM;
-
- vaddr = params.uaddr;
- size = params.len;
- vaddr_end = vaddr + size;
-
- /* Lock the user memory. */
- inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1);
- if (!inpages) {
- ret = -ENOMEM;
- goto e_free;
- }
-
- /*
- * The LAUNCH_UPDATE command will perform in-place encryption of the
- * memory content (i.e it will write the same memory region with C=1).
- * It's possible that the cache may contain the data with C=0, i.e.,
- * unencrypted so invalidate it first.
- */
- sev_clflush_pages(inpages, npages);
-
- for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) {
- int offset, len;
-
- /*
- * If the user buffer is not page-aligned, calculate the offset
- * within the page.
- */
- offset = vaddr & (PAGE_SIZE - 1);
-
- /* Calculate the number of pages that can be encrypted in one go. */
- pages = get_num_contig_pages(i, inpages, npages);
-
- len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size);
-
- data->handle = sev->handle;
- data->len = len;
- data->address = __sme_page_pa(inpages[i]) + offset;
- ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, data, &argp->error);
- if (ret)
- goto e_unpin;
-
- size -= len;
- next_vaddr = vaddr + len;
- }
-
-e_unpin:
- /* content of memory is updated, mark pages dirty */
- for (i = 0; i < npages; i++) {
- set_page_dirty_lock(inpages[i]);
- mark_page_accessed(inpages[i]);
- }
- /* unlock the user pages */
- sev_unpin_memory(kvm, inpages, npages);
-e_free:
- kfree(data);
- return ret;
-}
-
-static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
-{
- void __user *measure = (void __user *)(uintptr_t)argp->data;
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct sev_data_launch_measure *data;
- struct kvm_sev_launch_measure params;
- void __user *p = NULL;
- void *blob = NULL;
- int ret;
-
- if (!sev_guest(kvm))
- return -ENOTTY;
-
- if (copy_from_user(&params, measure, sizeof(params)))
- return -EFAULT;
-
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- return -ENOMEM;
-
- /* User wants to query the blob length */
- if (!params.len)
- goto cmd;
-
- p = (void __user *)(uintptr_t)params.uaddr;
- if (p) {
- if (params.len > SEV_FW_BLOB_MAX_SIZE) {
- ret = -EINVAL;
- goto e_free;
- }
-
- ret = -ENOMEM;
- blob = kmalloc(params.len, GFP_KERNEL);
- if (!blob)
- goto e_free;
-
- data->address = __psp_pa(blob);
- data->len = params.len;
- }
-
-cmd:
- data->handle = sev->handle;
- ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, data, &argp->error);
-
- /*
- * If we query the session length, FW responded with expected data.
- */
- if (!params.len)
- goto done;
-
- if (ret)
- goto e_free_blob;
-
- if (blob) {
- if (copy_to_user(p, blob, params.len))
- ret = -EFAULT;
- }
-
-done:
- params.len = data->len;
- if (copy_to_user(measure, &params, sizeof(params)))
- ret = -EFAULT;
-e_free_blob:
- kfree(blob);
-e_free:
- kfree(data);
- return ret;
-}
-
-static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
-{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct sev_data_launch_finish *data;
- int ret;
-
- if (!sev_guest(kvm))
- return -ENOTTY;
-
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- return -ENOMEM;
-
- data->handle = sev->handle;
- ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, data, &argp->error);
-
- kfree(data);
- return ret;
-}
-
-static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
-{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct kvm_sev_guest_status params;
- struct sev_data_guest_status *data;
- int ret;
-
- if (!sev_guest(kvm))
- return -ENOTTY;
-
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- return -ENOMEM;
-
- data->handle = sev->handle;
- ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, data, &argp->error);
- if (ret)
- goto e_free;
-
- params.policy = data->policy;
- params.state = data->state;
- params.handle = data->handle;
-
- if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params)))
- ret = -EFAULT;
-e_free:
- kfree(data);
- return ret;
-}
-
-static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
- unsigned long dst, int size,
- int *error, bool enc)
-{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct sev_data_dbg *data;
- int ret;
-
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- return -ENOMEM;
-
- data->handle = sev->handle;
- data->dst_addr = dst;
- data->src_addr = src;
- data->len = size;
-
- ret = sev_issue_cmd(kvm,
- enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
- data, error);
- kfree(data);
- return ret;
-}
-
-static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
- unsigned long dst_paddr, int sz, int *err)
-{
- int offset;
-
- /*
- * Its safe to read more than we are asked, caller should ensure that
- * destination has enough space.
- */
- src_paddr = round_down(src_paddr, 16);
- offset = src_paddr & 15;
- sz = round_up(sz + offset, 16);
-
- return __sev_issue_dbg_cmd(kvm, src_paddr, dst_paddr, sz, err, false);
-}
-
-static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
- unsigned long __user dst_uaddr,
- unsigned long dst_paddr,
- int size, int *err)
-{
- struct page *tpage = NULL;
- int ret, offset;
-
- /* if inputs are not 16-byte then use intermediate buffer */
- if (!IS_ALIGNED(dst_paddr, 16) ||
- !IS_ALIGNED(paddr, 16) ||
- !IS_ALIGNED(size, 16)) {
- tpage = (void *)alloc_page(GFP_KERNEL);
- if (!tpage)
- return -ENOMEM;
-
- dst_paddr = __sme_page_pa(tpage);
- }
-
- ret = __sev_dbg_decrypt(kvm, paddr, dst_paddr, size, err);
- if (ret)
- goto e_free;
-
- if (tpage) {
- offset = paddr & 15;
- if (copy_to_user((void __user *)(uintptr_t)dst_uaddr,
- page_address(tpage) + offset, size))
- ret = -EFAULT;
- }
-
-e_free:
- if (tpage)
- __free_page(tpage);
-
- return ret;
-}
-
-static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
- unsigned long __user vaddr,
- unsigned long dst_paddr,
- unsigned long __user dst_vaddr,
- int size, int *error)
-{
- struct page *src_tpage = NULL;
- struct page *dst_tpage = NULL;
- int ret, len = size;
-
- /* If source buffer is not aligned then use an intermediate buffer */
- if (!IS_ALIGNED(vaddr, 16)) {
- src_tpage = alloc_page(GFP_KERNEL);
- if (!src_tpage)
- return -ENOMEM;
-
- if (copy_from_user(page_address(src_tpage),
- (void __user *)(uintptr_t)vaddr, size)) {
- __free_page(src_tpage);
- return -EFAULT;
- }
-
- paddr = __sme_page_pa(src_tpage);
- }
-
- /*
- * If destination buffer or length is not aligned then do read-modify-write:
- * - decrypt destination in an intermediate buffer
- * - copy the source buffer in an intermediate buffer
- * - use the intermediate buffer as source buffer
- */
- if (!IS_ALIGNED(dst_vaddr, 16) || !IS_ALIGNED(size, 16)) {
- int dst_offset;
-
- dst_tpage = alloc_page(GFP_KERNEL);
- if (!dst_tpage) {
- ret = -ENOMEM;
- goto e_free;
- }
-
- ret = __sev_dbg_decrypt(kvm, dst_paddr,
- __sme_page_pa(dst_tpage), size, error);
- if (ret)
- goto e_free;
-
- /*
- * If source is kernel buffer then use memcpy() otherwise
- * copy_from_user().
- */
- dst_offset = dst_paddr & 15;
-
- if (src_tpage)
- memcpy(page_address(dst_tpage) + dst_offset,
- page_address(src_tpage), size);
- else {
- if (copy_from_user(page_address(dst_tpage) + dst_offset,
- (void __user *)(uintptr_t)vaddr, size)) {
- ret = -EFAULT;
- goto e_free;
- }
- }
-
- paddr = __sme_page_pa(dst_tpage);
- dst_paddr = round_down(dst_paddr, 16);
- len = round_up(size, 16);
- }
-
- ret = __sev_issue_dbg_cmd(kvm, paddr, dst_paddr, len, error, true);
-
-e_free:
- if (src_tpage)
- __free_page(src_tpage);
- if (dst_tpage)
- __free_page(dst_tpage);
- return ret;
-}
-
-static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
-{
- unsigned long vaddr, vaddr_end, next_vaddr;
- unsigned long dst_vaddr;
- struct page **src_p, **dst_p;
- struct kvm_sev_dbg debug;
- unsigned long n;
- unsigned int size;
- int ret;
-
- if (!sev_guest(kvm))
- return -ENOTTY;
-
- if (copy_from_user(&debug, (void __user *)(uintptr_t)argp->data, sizeof(debug)))
- return -EFAULT;
-
- if (!debug.len || debug.src_uaddr + debug.len < debug.src_uaddr)
- return -EINVAL;
- if (!debug.dst_uaddr)
- return -EINVAL;
-
- vaddr = debug.src_uaddr;
- size = debug.len;
- vaddr_end = vaddr + size;
- dst_vaddr = debug.dst_uaddr;
-
- for (; vaddr < vaddr_end; vaddr = next_vaddr) {
- int len, s_off, d_off;
-
- /* lock userspace source and destination page */
- src_p = sev_pin_memory(kvm, vaddr & PAGE_MASK, PAGE_SIZE, &n, 0);
- if (!src_p)
- return -EFAULT;
-
- dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, 1);
- if (!dst_p) {
- sev_unpin_memory(kvm, src_p, n);
- return -EFAULT;
- }
-
- /*
- * The DBG_{DE,EN}CRYPT commands will perform {dec,en}cryption of the
- * memory content (i.e it will write the same memory region with C=1).
- * It's possible that the cache may contain the data with C=0, i.e.,
- * unencrypted so invalidate it first.
- */
- sev_clflush_pages(src_p, 1);
- sev_clflush_pages(dst_p, 1);
-
- /*
- * Since user buffer may not be page aligned, calculate the
- * offset within the page.
- */
- s_off = vaddr & ~PAGE_MASK;
- d_off = dst_vaddr & ~PAGE_MASK;
- len = min_t(size_t, (PAGE_SIZE - s_off), size);
-
- if (dec)
- ret = __sev_dbg_decrypt_user(kvm,
- __sme_page_pa(src_p[0]) + s_off,
- dst_vaddr,
- __sme_page_pa(dst_p[0]) + d_off,
- len, &argp->error);
- else
- ret = __sev_dbg_encrypt_user(kvm,
- __sme_page_pa(src_p[0]) + s_off,
- vaddr,
- __sme_page_pa(dst_p[0]) + d_off,
- dst_vaddr,
- len, &argp->error);
-
- sev_unpin_memory(kvm, src_p, n);
- sev_unpin_memory(kvm, dst_p, n);
-
- if (ret)
- goto err;
-
- next_vaddr = vaddr + len;
- dst_vaddr = dst_vaddr + len;
- size -= len;
- }
-err:
- return ret;
-}
-
-static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
-{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct sev_data_launch_secret *data;
- struct kvm_sev_launch_secret params;
- struct page **pages;
- void *blob, *hdr;
- unsigned long n;
- int ret, offset;
-
- if (!sev_guest(kvm))
- return -ENOTTY;
-
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
- return -EFAULT;
-
- pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1);
- if (!pages)
- return -ENOMEM;
-
- /*
- * The secret must be copied into contiguous memory region, lets verify
- * that userspace memory pages are contiguous before we issue command.
- */
- if (get_num_contig_pages(0, pages, n) != n) {
- ret = -EINVAL;
- goto e_unpin_memory;
- }
-
- ret = -ENOMEM;
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- goto e_unpin_memory;
-
- offset = params.guest_uaddr & (PAGE_SIZE - 1);
- data->guest_address = __sme_page_pa(pages[0]) + offset;
- data->guest_len = params.guest_len;
-
- blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
- if (IS_ERR(blob)) {
- ret = PTR_ERR(blob);
- goto e_free;
- }
-
- data->trans_address = __psp_pa(blob);
- data->trans_len = params.trans_len;
-
- hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
- if (IS_ERR(hdr)) {
- ret = PTR_ERR(hdr);
- goto e_free_blob;
- }
- data->hdr_address = __psp_pa(hdr);
- data->hdr_len = params.hdr_len;
-
- data->handle = sev->handle;
- ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, data, &argp->error);
-
- kfree(hdr);
-
-e_free_blob:
- kfree(blob);
-e_free:
- kfree(data);
-e_unpin_memory:
- sev_unpin_memory(kvm, pages, n);
- return ret;
-}
-
-static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
-{
- struct kvm_sev_cmd sev_cmd;
- int r;
-
- if (!svm_sev_enabled())
- return -ENOTTY;
-
- if (!argp)
- return 0;
-
- if (copy_from_user(&sev_cmd, argp, sizeof(struct kvm_sev_cmd)))
- return -EFAULT;
-
- mutex_lock(&kvm->lock);
-
- switch (sev_cmd.id) {
- case KVM_SEV_INIT:
- r = sev_guest_init(kvm, &sev_cmd);
- break;
- case KVM_SEV_LAUNCH_START:
- r = sev_launch_start(kvm, &sev_cmd);
- break;
- case KVM_SEV_LAUNCH_UPDATE_DATA:
- r = sev_launch_update_data(kvm, &sev_cmd);
- break;
- case KVM_SEV_LAUNCH_MEASURE:
- r = sev_launch_measure(kvm, &sev_cmd);
- break;
- case KVM_SEV_LAUNCH_FINISH:
- r = sev_launch_finish(kvm, &sev_cmd);
- break;
- case KVM_SEV_GUEST_STATUS:
- r = sev_guest_status(kvm, &sev_cmd);
- break;
- case KVM_SEV_DBG_DECRYPT:
- r = sev_dbg_crypt(kvm, &sev_cmd, true);
- break;
- case KVM_SEV_DBG_ENCRYPT:
- r = sev_dbg_crypt(kvm, &sev_cmd, false);
- break;
- case KVM_SEV_LAUNCH_SECRET:
- r = sev_launch_secret(kvm, &sev_cmd);
- break;
- default:
- r = -EINVAL;
- goto out;
- }
-
- if (copy_to_user(argp, &sev_cmd, sizeof(struct kvm_sev_cmd)))
- r = -EFAULT;
-
-out:
- mutex_unlock(&kvm->lock);
- return r;
-}
-
-static int svm_register_enc_region(struct kvm *kvm,
- struct kvm_enc_region *range)
-{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct enc_region *region;
- int ret = 0;
-
- if (!sev_guest(kvm))
- return -ENOTTY;
-
- if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
- return -EINVAL;
-
- region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
- if (!region)
- return -ENOMEM;
-
- region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
- if (!region->pages) {
- ret = -ENOMEM;
- goto e_free;
- }
-
- /*
- * The guest may change the memory encryption attribute from C=0 -> C=1
- * or vice versa for this memory range. Lets make sure caches are
- * flushed to ensure that guest data gets written into memory with
- * correct C-bit.
- */
- sev_clflush_pages(region->pages, region->npages);
-
- region->uaddr = range->addr;
- region->size = range->size;
-
- mutex_lock(&kvm->lock);
- list_add_tail(&region->list, &sev->regions_list);
- mutex_unlock(&kvm->lock);
-
- return ret;
-
-e_free:
- kfree(region);
- return ret;
-}
-
-static struct enc_region *
-find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
-{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct list_head *head = &sev->regions_list;
- struct enc_region *i;
-
- list_for_each_entry(i, head, list) {
- if (i->uaddr == range->addr &&
- i->size == range->size)
- return i;
- }
-
- return NULL;
-}
-
-
-static int svm_unregister_enc_region(struct kvm *kvm,
- struct kvm_enc_region *range)
-{
- struct enc_region *region;
- int ret;
-
- mutex_lock(&kvm->lock);
-
- if (!sev_guest(kvm)) {
- ret = -ENOTTY;
- goto failed;
- }
-
- region = find_enc_region(kvm, range);
- if (!region) {
- ret = -EINVAL;
- goto failed;
- }
-
- /*
- * Ensure that all guest tagged cache entries are flushed before
- * releasing the pages back to the system for use. CLFLUSH will
- * not do this, so issue a WBINVD.
- */
- wbinvd_on_all_cpus();
-
- __unregister_enc_region_locked(kvm, region);
-
- mutex_unlock(&kvm->lock);
- return 0;
-
-failed:
- mutex_unlock(&kvm->lock);
- return ret;
-}
-
static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
{
unsigned long cr4 = kvm_read_cr4(vcpu);
@@ -7347,21 +3880,22 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
(svm->vmcb->control.intercept & (1ULL << INTERCEPT_INIT));
}
-static bool svm_check_apicv_inhibit_reasons(ulong bit)
+static void svm_vm_destroy(struct kvm *kvm)
{
- ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
- BIT(APICV_INHIBIT_REASON_HYPERV) |
- BIT(APICV_INHIBIT_REASON_NESTED) |
- BIT(APICV_INHIBIT_REASON_IRQWIN) |
- BIT(APICV_INHIBIT_REASON_PIT_REINJ) |
- BIT(APICV_INHIBIT_REASON_X2APIC);
-
- return supported & BIT(bit);
+ avic_vm_destroy(kvm);
+ sev_vm_destroy(kvm);
}
-static void svm_pre_update_apicv_exec_ctrl(struct kvm *kvm, bool activate)
+static int svm_vm_init(struct kvm *kvm)
{
- avic_update_access_page(kvm, activate);
+ if (avic) {
+ int ret = avic_vm_init(kvm);
+ if (ret)
+ return ret;
+ }
+
+ kvm_apicv_init(kvm, avic);
+ return 0;
}
static struct kvm_x86_ops svm_x86_ops __initdata = {
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
new file mode 100644
index 000000000000..df3474f4fb02
--- /dev/null
+++ b/arch/x86/kvm/svm/svm.h
@@ -0,0 +1,491 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * AMD SVM support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ */
+
+#ifndef __SVM_SVM_H
+#define __SVM_SVM_H
+
+#include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
+
+#include <asm/svm.h>
+
+static const u32 host_save_user_msrs[] = {
+#ifdef CONFIG_X86_64
+ MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
+ MSR_FS_BASE,
+#endif
+ MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+ MSR_TSC_AUX,
+};
+
+#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
+
+#define MSRPM_OFFSETS 16
+extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
+extern bool npt_enabled;
+
+enum {
+ VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
+ pause filter count */
+ VMCB_PERM_MAP, /* IOPM Base and MSRPM Base */
+ VMCB_ASID, /* ASID */
+ VMCB_INTR, /* int_ctl, int_vector */
+ VMCB_NPT, /* npt_en, nCR3, gPAT */
+ VMCB_CR, /* CR0, CR3, CR4, EFER */
+ VMCB_DR, /* DR6, DR7 */
+ VMCB_DT, /* GDT, IDT */
+ VMCB_SEG, /* CS, DS, SS, ES, CPL */
+ VMCB_CR2, /* CR2 only */
+ VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
+ VMCB_AVIC, /* AVIC APIC_BAR, AVIC APIC_BACKING_PAGE,
+ * AVIC PHYSICAL_TABLE pointer,
+ * AVIC LOGICAL_TABLE pointer
+ */
+ VMCB_DIRTY_MAX,
+};
+
+/* TPR and CR2 are always written before VMRUN */
+#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2))
+
+struct kvm_sev_info {
+ bool active; /* SEV enabled guest */
+ unsigned int asid; /* ASID used for this guest */
+ unsigned int handle; /* SEV firmware handle */
+ int fd; /* SEV device fd */
+ unsigned long pages_locked; /* Number of pages locked */
+ struct list_head regions_list; /* List of registered regions */
+};
+
+struct kvm_svm {
+ struct kvm kvm;
+
+ /* Struct members for AVIC */
+ u32 avic_vm_id;
+ struct page *avic_logical_id_table_page;
+ struct page *avic_physical_id_table_page;
+ struct hlist_node hnode;
+
+ struct kvm_sev_info sev_info;
+};
+
+struct kvm_vcpu;
+
+struct nested_state {
+ struct vmcb *hsave;
+ u64 hsave_msr;
+ u64 vm_cr_msr;
+ u64 vmcb;
+
+ /* These are the merged vectors */
+ u32 *msrpm;
+
+ /* gpa pointers to the real vectors */
+ u64 vmcb_msrpm;
+ u64 vmcb_iopm;
+
+ /* A VMEXIT is required but not yet emulated */
+ bool exit_required;
+
+ /* cache for intercepts of the guest */
+ u32 intercept_cr;
+ u32 intercept_dr;
+ u32 intercept_exceptions;
+ u64 intercept;
+
+ /* Nested Paging related state */
+ u64 nested_cr3;
+};
+
+struct vcpu_svm {
+ struct kvm_vcpu vcpu;
+ struct vmcb *vmcb;
+ unsigned long vmcb_pa;
+ struct svm_cpu_data *svm_data;
+ uint64_t asid_generation;
+ uint64_t sysenter_esp;
+ uint64_t sysenter_eip;
+ uint64_t tsc_aux;
+
+ u64 msr_decfg;
+
+ u64 next_rip;
+
+ u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
+ struct {
+ u16 fs;
+ u16 gs;
+ u16 ldt;
+ u64 gs_base;
+ } host;
+
+ u64 spec_ctrl;
+ /*
+ * Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be
+ * translated into the appropriate L2_CFG bits on the host to
+ * perform speculative control.
+ */
+ u64 virt_spec_ctrl;
+
+ u32 *msrpm;
+
+ ulong nmi_iret_rip;
+
+ struct nested_state nested;
+
+ bool nmi_singlestep;
+ u64 nmi_singlestep_guest_rflags;
+
+ unsigned int3_injected;
+ unsigned long int3_rip;
+
+ /* cached guest cpuid flags for faster access */
+ bool nrips_enabled : 1;
+
+ u32 ldr_reg;
+ u32 dfr_reg;
+ struct page *avic_backing_page;
+ u64 *avic_physical_id_cache;
+ bool avic_is_running;
+
+ /*
+ * Per-vcpu list of struct amd_svm_iommu_ir:
+ * This is used mainly to store interrupt remapping information used
+ * when update the vcpu affinity. This avoids the need to scan for
+ * IRTE and try to match ga_tag in the IOMMU driver.
+ */
+ struct list_head ir_list;
+ spinlock_t ir_list_lock;
+
+ /* which host CPU was used for running this vcpu */
+ unsigned int last_cpu;
+};
+
+struct svm_cpu_data {
+ int cpu;
+
+ u64 asid_generation;
+ u32 max_asid;
+ u32 next_asid;
+ u32 min_asid;
+ struct kvm_ldttss_desc *tss_desc;
+
+ struct page *save_area;
+ struct vmcb *current_vmcb;
+
+ /* index = sev_asid, value = vmcb pointer */
+ struct vmcb **sev_vmcbs;
+};
+
+DECLARE_PER_CPU(struct svm_cpu_data *, svm_data);
+
+void recalc_intercepts(struct vcpu_svm *svm);
+
+static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm)
+{
+ return container_of(kvm, struct kvm_svm, kvm);
+}
+
+static inline void mark_all_dirty(struct vmcb *vmcb)
+{
+ vmcb->control.clean = 0;
+}
+
+static inline void mark_all_clean(struct vmcb *vmcb)
+{
+ vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
+ & ~VMCB_ALWAYS_DIRTY_MASK;
+}
+
+static inline void mark_dirty(struct vmcb *vmcb, int bit)
+{
+ vmcb->control.clean &= ~(1 << bit);
+}
+
+static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
+{
+ return container_of(vcpu, struct vcpu_svm, vcpu);
+}
+
+static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
+{
+ if (is_guest_mode(&svm->vcpu))
+ return svm->nested.hsave;
+ else
+ return svm->vmcb;
+}
+
+static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_cr |= (1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_cr &= ~(1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ return vmcb->control.intercept_cr & (1U << bit);
+}
+
+static inline void set_dr_intercepts(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ)
+ | (1 << INTERCEPT_DR1_READ)
+ | (1 << INTERCEPT_DR2_READ)
+ | (1 << INTERCEPT_DR3_READ)
+ | (1 << INTERCEPT_DR4_READ)
+ | (1 << INTERCEPT_DR5_READ)
+ | (1 << INTERCEPT_DR6_READ)
+ | (1 << INTERCEPT_DR7_READ)
+ | (1 << INTERCEPT_DR0_WRITE)
+ | (1 << INTERCEPT_DR1_WRITE)
+ | (1 << INTERCEPT_DR2_WRITE)
+ | (1 << INTERCEPT_DR3_WRITE)
+ | (1 << INTERCEPT_DR4_WRITE)
+ | (1 << INTERCEPT_DR5_WRITE)
+ | (1 << INTERCEPT_DR6_WRITE)
+ | (1 << INTERCEPT_DR7_WRITE);
+
+ recalc_intercepts(svm);
+}
+
+static inline void clr_dr_intercepts(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_dr = 0;
+
+ recalc_intercepts(svm);
+}
+
+static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_exceptions |= (1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_exceptions &= ~(1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void set_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept |= (1ULL << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void clr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept &= ~(1ULL << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline bool is_intercept(struct vcpu_svm *svm, int bit)
+{
+ return (svm->vmcb->control.intercept & (1ULL << bit)) != 0;
+}
+
+static inline bool vgif_enabled(struct vcpu_svm *svm)
+{
+ return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK);
+}
+
+static inline void enable_gif(struct vcpu_svm *svm)
+{
+ if (vgif_enabled(svm))
+ svm->vmcb->control.int_ctl |= V_GIF_MASK;
+ else
+ svm->vcpu.arch.hflags |= HF_GIF_MASK;
+}
+
+static inline void disable_gif(struct vcpu_svm *svm)
+{
+ if (vgif_enabled(svm))
+ svm->vmcb->control.int_ctl &= ~V_GIF_MASK;
+ else
+ svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
+}
+
+static inline bool gif_set(struct vcpu_svm *svm)
+{
+ if (vgif_enabled(svm))
+ return !!(svm->vmcb->control.int_ctl & V_GIF_MASK);
+ else
+ return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
+}
+
+/* svm.c */
+#define MSR_INVALID 0xffffffffU
+
+u32 svm_msrpm_offset(u32 msr);
+void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer);
+void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa);
+void disable_nmi_singlestep(struct vcpu_svm *svm);
+
+/* nested.c */
+
+#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
+#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
+#define NESTED_EXIT_CONTINUE 2 /* Further checks needed */
+
+/* This function returns true if it is save to enable the nmi window */
+static inline bool nested_svm_nmi(struct vcpu_svm *svm)
+{
+ if (!is_guest_mode(&svm->vcpu))
+ return true;
+
+ if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
+ return true;
+
+ svm->vmcb->control.exit_code = SVM_EXIT_NMI;
+ svm->nested.exit_required = true;
+
+ return false;
+}
+
+static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu)
+{
+ return is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK);
+}
+
+void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
+ struct vmcb *nested_vmcb, struct kvm_host_map *map);
+int nested_svm_vmrun(struct vcpu_svm *svm);
+void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb);
+int nested_svm_vmexit(struct vcpu_svm *svm);
+int nested_svm_exit_handled(struct vcpu_svm *svm);
+int nested_svm_check_permissions(struct vcpu_svm *svm);
+int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
+ bool has_error_code, u32 error_code);
+int svm_check_nested_events(struct kvm_vcpu *vcpu);
+int nested_svm_exit_special(struct vcpu_svm *svm);
+
+/* avic.c */
+
+#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF)
+#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31
+#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
+
+#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL)
+#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12)
+#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
+#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)
+
+#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
+
+extern int avic;
+
+static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data)
+{
+ svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK;
+ mark_dirty(svm->vmcb, VMCB_AVIC);
+}
+
+static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 *entry = svm->avic_physical_id_cache;
+
+ if (!entry)
+ return false;
+
+ return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
+}
+
+int avic_ga_log_notifier(u32 ga_tag);
+void avic_vm_destroy(struct kvm *kvm);
+int avic_vm_init(struct kvm *kvm);
+void avic_init_vmcb(struct vcpu_svm *svm);
+void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate);
+int avic_incomplete_ipi_interception(struct vcpu_svm *svm);
+int avic_unaccelerated_access_interception(struct vcpu_svm *svm);
+int avic_init_vcpu(struct vcpu_svm *svm);
+void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void avic_vcpu_put(struct kvm_vcpu *vcpu);
+void avic_post_state_restore(struct kvm_vcpu *vcpu);
+void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
+void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
+bool svm_check_apicv_inhibit_reasons(ulong bit);
+void svm_pre_update_apicv_exec_ctrl(struct kvm *kvm, bool activate);
+void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
+void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr);
+void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr);
+int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec);
+bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu);
+int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set);
+void svm_vcpu_blocking(struct kvm_vcpu *vcpu);
+void svm_vcpu_unblocking(struct kvm_vcpu *vcpu);
+
+/* sev.c */
+
+extern unsigned int max_sev_asid;
+
+static inline bool sev_guest(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_AMD_SEV
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+ return sev->active;
+#else
+ return false;
+#endif
+}
+
+static inline bool svm_sev_enabled(void)
+{
+ return IS_ENABLED(CONFIG_KVM_AMD_SEV) ? max_sev_asid : 0;
+}
+
+void sev_vm_destroy(struct kvm *kvm);
+int svm_mem_enc_op(struct kvm *kvm, void __user *argp);
+int svm_register_enc_region(struct kvm *kvm,
+ struct kvm_enc_region *range);
+int svm_unregister_enc_region(struct kvm *kvm,
+ struct kvm_enc_region *range);
+void pre_sev_run(struct vcpu_svm *svm, int cpu);
+int __init sev_hardware_setup(void);
+void sev_hardware_teardown(void);
+
+#endif
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
new file mode 100644
index 000000000000..fa1af90067e9
--- /dev/null
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -0,0 +1,162 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/asm.h>
+#include <asm/bitsperlong.h>
+#include <asm/kvm_vcpu_regs.h>
+
+#define WORD_SIZE (BITS_PER_LONG / 8)
+
+/* Intentionally omit RAX as it's context switched by hardware */
+#define VCPU_RCX __VCPU_REGS_RCX * WORD_SIZE
+#define VCPU_RDX __VCPU_REGS_RDX * WORD_SIZE
+#define VCPU_RBX __VCPU_REGS_RBX * WORD_SIZE
+/* Intentionally omit RSP as it's context switched by hardware */
+#define VCPU_RBP __VCPU_REGS_RBP * WORD_SIZE
+#define VCPU_RSI __VCPU_REGS_RSI * WORD_SIZE
+#define VCPU_RDI __VCPU_REGS_RDI * WORD_SIZE
+
+#ifdef CONFIG_X86_64
+#define VCPU_R8 __VCPU_REGS_R8 * WORD_SIZE
+#define VCPU_R9 __VCPU_REGS_R9 * WORD_SIZE
+#define VCPU_R10 __VCPU_REGS_R10 * WORD_SIZE
+#define VCPU_R11 __VCPU_REGS_R11 * WORD_SIZE
+#define VCPU_R12 __VCPU_REGS_R12 * WORD_SIZE
+#define VCPU_R13 __VCPU_REGS_R13 * WORD_SIZE
+#define VCPU_R14 __VCPU_REGS_R14 * WORD_SIZE
+#define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE
+#endif
+
+ .text
+
+/**
+ * __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode
+ * @vmcb_pa: unsigned long
+ * @regs: unsigned long * (to guest registers)
+ */
+SYM_FUNC_START(__svm_vcpu_run)
+ push %_ASM_BP
+ mov %_ASM_SP, %_ASM_BP
+#ifdef CONFIG_X86_64
+ push %r15
+ push %r14
+ push %r13
+ push %r12
+#else
+ push %edi
+ push %esi
+#endif
+ push %_ASM_BX
+
+ /* Save @regs. */
+ push %_ASM_ARG2
+
+ /* Save @vmcb. */
+ push %_ASM_ARG1
+
+ /* Move @regs to RAX. */
+ mov %_ASM_ARG2, %_ASM_AX
+
+ /* Load guest registers. */
+ mov VCPU_RCX(%_ASM_AX), %_ASM_CX
+ mov VCPU_RDX(%_ASM_AX), %_ASM_DX
+ mov VCPU_RBX(%_ASM_AX), %_ASM_BX
+ mov VCPU_RBP(%_ASM_AX), %_ASM_BP
+ mov VCPU_RSI(%_ASM_AX), %_ASM_SI
+ mov VCPU_RDI(%_ASM_AX), %_ASM_DI
+#ifdef CONFIG_X86_64
+ mov VCPU_R8 (%_ASM_AX), %r8
+ mov VCPU_R9 (%_ASM_AX), %r9
+ mov VCPU_R10(%_ASM_AX), %r10
+ mov VCPU_R11(%_ASM_AX), %r11
+ mov VCPU_R12(%_ASM_AX), %r12
+ mov VCPU_R13(%_ASM_AX), %r13
+ mov VCPU_R14(%_ASM_AX), %r14
+ mov VCPU_R15(%_ASM_AX), %r15
+#endif
+
+ /* "POP" @vmcb to RAX. */
+ pop %_ASM_AX
+
+ /* Enter guest mode */
+1: vmload %_ASM_AX
+ jmp 3f
+2: cmpb $0, kvm_rebooting
+ jne 3f
+ ud2
+ _ASM_EXTABLE(1b, 2b)
+
+3: vmrun %_ASM_AX
+ jmp 5f
+4: cmpb $0, kvm_rebooting
+ jne 5f
+ ud2
+ _ASM_EXTABLE(3b, 4b)
+
+5: vmsave %_ASM_AX
+ jmp 7f
+6: cmpb $0, kvm_rebooting
+ jne 7f
+ ud2
+ _ASM_EXTABLE(5b, 6b)
+7:
+ /* "POP" @regs to RAX. */
+ pop %_ASM_AX
+
+ /* Save all guest registers. */
+ mov %_ASM_CX, VCPU_RCX(%_ASM_AX)
+ mov %_ASM_DX, VCPU_RDX(%_ASM_AX)
+ mov %_ASM_BX, VCPU_RBX(%_ASM_AX)
+ mov %_ASM_BP, VCPU_RBP(%_ASM_AX)
+ mov %_ASM_SI, VCPU_RSI(%_ASM_AX)
+ mov %_ASM_DI, VCPU_RDI(%_ASM_AX)
+#ifdef CONFIG_X86_64
+ mov %r8, VCPU_R8 (%_ASM_AX)
+ mov %r9, VCPU_R9 (%_ASM_AX)
+ mov %r10, VCPU_R10(%_ASM_AX)
+ mov %r11, VCPU_R11(%_ASM_AX)
+ mov %r12, VCPU_R12(%_ASM_AX)
+ mov %r13, VCPU_R13(%_ASM_AX)
+ mov %r14, VCPU_R14(%_ASM_AX)
+ mov %r15, VCPU_R15(%_ASM_AX)
+#endif
+
+ /*
+ * Clear all general purpose registers except RSP and RAX to prevent
+ * speculative use of the guest's values, even those that are reloaded
+ * via the stack. In theory, an L1 cache miss when restoring registers
+ * could lead to speculative execution with the guest's values.
+ * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially
+ * free. RSP and RAX are exempt as they are restored by hardware
+ * during VM-Exit.
+ */
+ xor %ecx, %ecx
+ xor %edx, %edx
+ xor %ebx, %ebx
+ xor %ebp, %ebp
+ xor %esi, %esi
+ xor %edi, %edi
+#ifdef CONFIG_X86_64
+ xor %r8d, %r8d
+ xor %r9d, %r9d
+ xor %r10d, %r10d
+ xor %r11d, %r11d
+ xor %r12d, %r12d
+ xor %r13d, %r13d
+ xor %r14d, %r14d
+ xor %r15d, %r15d
+#endif
+
+ pop %_ASM_BX
+
+#ifdef CONFIG_X86_64
+ pop %r12
+ pop %r13
+ pop %r14
+ pop %r15
+#else
+ pop %esi
+ pop %edi
+#endif
+ pop %_ASM_BP
+ ret
+SYM_FUNC_END(__svm_vcpu_run)
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index de232306561a..cbc9ea2de28f 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3645,7 +3645,8 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
* Clear the MTF state. If a higher priority VM-exit is delivered first,
* this state is discarded.
*/
- vmx->nested.mtf_pending = false;
+ if (!block_nested_events)
+ vmx->nested.mtf_pending = false;
if (lapic_in_kernel(vcpu) &&
test_bit(KVM_APIC_INIT, &apic->pending_events)) {
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 9651ba388ba9..87f3f24fef37 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -58,12 +58,8 @@ SYM_FUNC_START(vmx_vmenter)
ret
4: ud2
- .pushsection .fixup, "ax"
-5: jmp 3b
- .popsection
-
- _ASM_EXTABLE(1b, 5b)
- _ASM_EXTABLE(2b, 5b)
+ _ASM_EXTABLE(1b, 3b)
+ _ASM_EXTABLE(2b, 3b)
SYM_FUNC_END(vmx_vmenter)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 91749f1254e8..8959514eaf0f 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2261,10 +2261,6 @@ static int hardware_enable(void)
!hv_get_vp_assist_page(cpu))
return -EFAULT;
- INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
- INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
- spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
-
r = kvm_cpu_vmxon(phys_addr);
if (r)
return r;
@@ -8044,7 +8040,7 @@ module_exit(vmx_exit);
static int __init vmx_init(void)
{
- int r;
+ int r, cpu;
#if IS_ENABLED(CONFIG_HYPERV)
/*
@@ -8098,6 +8094,12 @@ static int __init vmx_init(void)
return r;
}
+ for_each_possible_cpu(cpu) {
+ INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
+ INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
+ spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+ }
+
#ifdef CONFIG_KEXEC_CORE
rcu_assign_pointer(crash_vmclear_loaded_vmcss,
crash_vmclear_local_loaded_vmcss);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b8124b562dea..027dfd278a97 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1586,7 +1586,8 @@ static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data
if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) &&
((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
- ((data & APIC_MODE_MASK) == APIC_DM_FIXED)) {
+ ((data & APIC_MODE_MASK) == APIC_DM_FIXED) &&
+ ((u32)(data >> 32) != X2APIC_BROADCAST)) {
data &= ~(1 << 12);
kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32));
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 859519f5b342..a51df516b87b 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1222,7 +1222,7 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
return 1;
/* read, not present: */
- if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
+ if (unlikely(!vma_is_accessible(vma)))
return 1;
return 0;
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index e7bb483557c9..1bba16c5742b 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -467,7 +467,7 @@ bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn)
* the physical memory. To access them they are temporarily mapped.
*/
unsigned long __ref init_memory_mapping(unsigned long start,
- unsigned long end)
+ unsigned long end, pgprot_t prot)
{
struct map_range mr[NR_RANGE_MR];
unsigned long ret = 0;
@@ -481,7 +481,8 @@ unsigned long __ref init_memory_mapping(unsigned long start,
for (i = 0; i < nr_range; i++)
ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
- mr[i].page_size_mask);
+ mr[i].page_size_mask,
+ prot);
add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);
@@ -521,7 +522,7 @@ static unsigned long __init init_range_memory_mapping(
*/
can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >=
min(end, (u64)pgt_buf_top<<PAGE_SHIFT);
- init_memory_mapping(start, end);
+ init_memory_mapping(start, end, PAGE_KERNEL);
mapped_ram_size += end - start;
can_use_brk_pgt = true;
}
@@ -661,7 +662,7 @@ void __init init_mem_mapping(void)
#endif
/* the ISA range is always mapped regardless of memory holes */
- init_memory_mapping(0, ISA_END_ADDRESS);
+ init_memory_mapping(0, ISA_END_ADDRESS, PAGE_KERNEL);
/* Init the trampoline, possibly with KASLR memory offset */
init_trampoline();
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index de73992b8432..4222a010057a 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -257,7 +257,8 @@ static inline int __is_kernel_text(unsigned long addr)
unsigned long __init
kernel_physical_mapping_init(unsigned long start,
unsigned long end,
- unsigned long page_size_mask)
+ unsigned long page_size_mask,
+ pgprot_t prot)
{
int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
unsigned long last_map_addr = end;
@@ -819,12 +820,24 @@ void __init mem_init(void)
#ifdef CONFIG_MEMORY_HOTPLUG
int arch_add_memory(int nid, u64 start, u64 size,
- struct mhp_restrictions *restrictions)
+ struct mhp_params *params)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
+ int ret;
- return __add_pages(nid, start_pfn, nr_pages, restrictions);
+ /*
+ * The page tables were already mapped at boot so if the caller
+ * requests a different mapping type then we must change all the
+ * pages with __set_memory_prot().
+ */
+ if (params->pgprot.pgprot != PAGE_KERNEL.pgprot) {
+ ret = __set_memory_prot(start, nr_pages, params->pgprot);
+ if (ret)
+ return ret;
+ }
+
+ return __add_pages(nid, start_pfn, nr_pages, params);
}
void arch_remove_memory(int nid, u64 start, u64 size,
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 0a14711d3a93..3b289c2f75cd 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -585,7 +585,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
*/
static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
- unsigned long page_size_mask, bool init)
+ unsigned long page_size_mask, pgprot_t _prot, bool init)
{
unsigned long pages = 0, paddr_next;
unsigned long paddr_last = paddr_end;
@@ -595,7 +595,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
for (; i < PTRS_PER_PUD; i++, paddr = paddr_next) {
pud_t *pud;
pmd_t *pmd;
- pgprot_t prot = PAGE_KERNEL;
+ pgprot_t prot = _prot;
vaddr = (unsigned long)__va(paddr);
pud = pud_page + pud_index(vaddr);
@@ -644,9 +644,12 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
if (page_size_mask & (1<<PG_LEVEL_1G)) {
pages++;
spin_lock(&init_mm.page_table_lock);
+
+ prot = __pgprot(pgprot_val(prot) | __PAGE_KERNEL_LARGE);
+
set_pte_init((pte_t *)pud,
pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT,
- PAGE_KERNEL_LARGE),
+ prot),
init);
spin_unlock(&init_mm.page_table_lock);
paddr_last = paddr_next;
@@ -669,7 +672,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
static unsigned long __meminit
phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
- unsigned long page_size_mask, bool init)
+ unsigned long page_size_mask, pgprot_t prot, bool init)
{
unsigned long vaddr, vaddr_end, vaddr_next, paddr_next, paddr_last;
@@ -679,7 +682,7 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
if (!pgtable_l5_enabled())
return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end,
- page_size_mask, init);
+ page_size_mask, prot, init);
for (; vaddr < vaddr_end; vaddr = vaddr_next) {
p4d_t *p4d = p4d_page + p4d_index(vaddr);
@@ -702,13 +705,13 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
if (!p4d_none(*p4d)) {
pud = pud_offset(p4d, 0);
paddr_last = phys_pud_init(pud, paddr, __pa(vaddr_end),
- page_size_mask, init);
+ page_size_mask, prot, init);
continue;
}
pud = alloc_low_page();
paddr_last = phys_pud_init(pud, paddr, __pa(vaddr_end),
- page_size_mask, init);
+ page_size_mask, prot, init);
spin_lock(&init_mm.page_table_lock);
p4d_populate_init(&init_mm, p4d, pud, init);
@@ -722,7 +725,7 @@ static unsigned long __meminit
__kernel_physical_mapping_init(unsigned long paddr_start,
unsigned long paddr_end,
unsigned long page_size_mask,
- bool init)
+ pgprot_t prot, bool init)
{
bool pgd_changed = false;
unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last;
@@ -743,13 +746,13 @@ __kernel_physical_mapping_init(unsigned long paddr_start,
paddr_last = phys_p4d_init(p4d, __pa(vaddr),
__pa(vaddr_end),
page_size_mask,
- init);
+ prot, init);
continue;
}
p4d = alloc_low_page();
paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end),
- page_size_mask, init);
+ page_size_mask, prot, init);
spin_lock(&init_mm.page_table_lock);
if (pgtable_l5_enabled())
@@ -778,10 +781,10 @@ __kernel_physical_mapping_init(unsigned long paddr_start,
unsigned long __meminit
kernel_physical_mapping_init(unsigned long paddr_start,
unsigned long paddr_end,
- unsigned long page_size_mask)
+ unsigned long page_size_mask, pgprot_t prot)
{
return __kernel_physical_mapping_init(paddr_start, paddr_end,
- page_size_mask, true);
+ page_size_mask, prot, true);
}
/*
@@ -796,7 +799,8 @@ kernel_physical_mapping_change(unsigned long paddr_start,
unsigned long page_size_mask)
{
return __kernel_physical_mapping_init(paddr_start, paddr_end,
- page_size_mask, false);
+ page_size_mask, PAGE_KERNEL,
+ false);
}
#ifndef CONFIG_NUMA
@@ -843,11 +847,11 @@ static void update_end_of_memory_vars(u64 start, u64 size)
}
int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
- struct mhp_restrictions *restrictions)
+ struct mhp_params *params)
{
int ret;
- ret = __add_pages(nid, start_pfn, nr_pages, restrictions);
+ ret = __add_pages(nid, start_pfn, nr_pages, params);
WARN_ON_ONCE(ret);
/* update max_pfn, max_low_pfn and high_memory */
@@ -858,14 +862,14 @@ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
}
int arch_add_memory(int nid, u64 start, u64 size,
- struct mhp_restrictions *restrictions)
+ struct mhp_params *params)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
- init_memory_mapping(start, start + size);
+ init_memory_mapping(start, start + size, params->pgprot);
- return add_pages(nid, start_pfn, nr_pages, restrictions);
+ return add_pages(nid, start_pfn, nr_pages, params);
}
#define PAGE_INUSE 0xFD
diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h
index eeae142062ed..3f37b5c80bb3 100644
--- a/arch/x86/mm/mm_internal.h
+++ b/arch/x86/mm/mm_internal.h
@@ -12,7 +12,8 @@ void early_ioremap_page_table_range_init(void);
unsigned long kernel_physical_mapping_init(unsigned long start,
unsigned long end,
- unsigned long page_size_mask);
+ unsigned long page_size_mask,
+ pgprot_t prot);
unsigned long kernel_physical_mapping_change(unsigned long start,
unsigned long end,
unsigned long page_size_mask);
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 99f7a68738f0..59ba008504dc 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -25,11 +25,8 @@ nodemask_t numa_nodes_parsed __initdata;
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);
-static struct numa_meminfo numa_meminfo
-#ifndef CONFIG_MEMORY_HOTPLUG
-__initdata
-#endif
-;
+static struct numa_meminfo numa_meminfo __initdata_or_meminfo;
+static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;
static int numa_distance_cnt;
static u8 *numa_distance;
@@ -169,6 +166,19 @@ void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
}
/**
+ * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another
+ * @dst: numa_meminfo to append block to
+ * @idx: Index of memblk to remove
+ * @src: numa_meminfo to remove memblk from
+ */
+static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx,
+ struct numa_meminfo *src)
+{
+ dst->blk[dst->nr_blks++] = src->blk[idx];
+ numa_remove_memblk_from(idx, src);
+}
+
+/**
* numa_add_memblk - Add one numa_memblk to numa_meminfo
* @nid: NUMA node ID of the new memblk
* @start: Start address of the new memblk
@@ -237,14 +247,19 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
for (i = 0; i < mi->nr_blks; i++) {
struct numa_memblk *bi = &mi->blk[i];
- /* make sure all blocks are inside the limits */
+ /* move / save reserved memory ranges */
+ if (!memblock_overlaps_region(&memblock.memory,
+ bi->start, bi->end - bi->start)) {
+ numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi);
+ continue;
+ }
+
+ /* make sure all non-reserved blocks are inside the limits */
bi->start = max(bi->start, low);
bi->end = min(bi->end, high);
- /* and there's no empty or non-exist block */
- if (bi->start >= bi->end ||
- !memblock_overlaps_region(&memblock.memory,
- bi->start, bi->end - bi->start))
+ /* and there's no empty block */
+ if (bi->start >= bi->end)
numa_remove_memblk_from(i--, mi);
}
@@ -881,16 +896,38 @@ EXPORT_SYMBOL(cpumask_of_node);
#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
-#ifdef CONFIG_MEMORY_HOTPLUG
-int memory_add_physaddr_to_nid(u64 start)
+#ifdef CONFIG_NUMA_KEEP_MEMINFO
+static int meminfo_to_nid(struct numa_meminfo *mi, u64 start)
{
- struct numa_meminfo *mi = &numa_meminfo;
- int nid = mi->blk[0].nid;
int i;
for (i = 0; i < mi->nr_blks; i++)
if (mi->blk[i].start <= start && mi->blk[i].end > start)
- nid = mi->blk[i].nid;
+ return mi->blk[i].nid;
+ return NUMA_NO_NODE;
+}
+
+int phys_to_target_node(phys_addr_t start)
+{
+ int nid = meminfo_to_nid(&numa_meminfo, start);
+
+ /*
+ * Prefer online nodes, but if reserved memory might be
+ * hot-added continue the search with reserved ranges.
+ */
+ if (nid != NUMA_NO_NODE)
+ return nid;
+
+ return meminfo_to_nid(&numa_reserved_meminfo, start);
+}
+EXPORT_SYMBOL_GPL(phys_to_target_node);
+
+int memory_add_physaddr_to_nid(u64 start)
+{
+ int nid = meminfo_to_nid(&numa_meminfo, start);
+
+ if (nid == NUMA_NO_NODE)
+ nid = numa_meminfo.blk[0].nid;
return nid;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 6d5424069e2b..59eca6a94ce7 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -1795,6 +1795,19 @@ static inline int cpa_clear_pages_array(struct page **pages, int numpages,
CPA_PAGES_ARRAY, pages);
}
+/*
+ * _set_memory_prot is an internal helper for callers that have been passed
+ * a pgprot_t value from upper layers and a reservation has already been taken.
+ * If you want to set the pgprot to a specific page protocol, use the
+ * set_memory_xx() functions.
+ */
+int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot)
+{
+ return change_page_attr_set_clr(&addr, numpages, prot,
+ __pgprot(~pgprot_val(prot)), 0, 0,
+ NULL);
+}
+
int _set_memory_uc(unsigned long addr, int numpages)
{
/*
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index c6f84c0b5d7a..8873ed1438a9 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -63,7 +63,7 @@ int __execute_only_pkey(struct mm_struct *mm)
static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma)
{
/* Do this check first since the vm_flags should be hot */
- if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC)
+ if ((vma->vm_flags & VM_ACCESS_FLAGS) != VM_EXEC)
return false;
if (vma_pkey(vma) != vma->vm_mm->context.execute_only_pkey)
return false;
diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c
index 607f58147311..c60255da5a6c 100644
--- a/arch/x86/platform/uv/bios_uv.c
+++ b/arch/x86/platform/uv/bios_uv.c
@@ -352,7 +352,8 @@ void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
if (type == EFI_MEMORY_MAPPED_IO)
return ioremap(phys_addr, size);
- last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size);
+ last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size,
+ PAGE_KERNEL);
if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) {
unsigned long top = last_map_pfn << PAGE_SHIFT;
efi_ioremap(top, size - (top - phys_addr), type, attribute);
diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h
index 593d5f3902bd..478710384b34 100644
--- a/arch/x86/um/asm/processor.h
+++ b/arch/x86/um/asm/processor.h
@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __UM_PROCESSOR_H
#define __UM_PROCESSOR_H
+#include <linux/time-internal.h>
/* include faultinfo structure */
#include <sysdep/faultinfo.h>
@@ -21,12 +22,19 @@
#include <asm/user.h>
/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
-static inline void rep_nop(void)
+static __always_inline void rep_nop(void)
{
__asm__ __volatile__("rep;nop": : :"memory");
}
-#define cpu_relax() rep_nop()
+static __always_inline void cpu_relax(void)
+{
+ if (time_travel_mode == TT_MODE_INFCPU ||
+ time_travel_mode == TT_MODE_EXTERNAL)
+ time_travel_ndelay(1);
+ else
+ rep_nop();
+}
#define task_pt_regs(t) (&(t)->thread.regs)
diff --git a/arch/x86/um/asm/vm-flags.h b/arch/x86/um/asm/vm-flags.h
index 7c297e9e2413..df7a3896f5dd 100644
--- a/arch/x86/um/asm/vm-flags.h
+++ b/arch/x86/um/asm/vm-flags.h
@@ -9,17 +9,11 @@
#ifdef CONFIG_X86_32
-#define VM_DATA_DEFAULT_FLAGS \
- (VM_READ | VM_WRITE | \
- ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC
#else
-#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \
- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
-#define VM_STACK_DEFAULT_FLAGS (VM_GROWSDOWN | VM_READ | VM_WRITE | \
- VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+#define VM_STACK_DEFAULT_FLAGS (VM_GROWSDOWN | VM_DATA_FLAGS_EXEC)
#endif
#endif
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 33b0e20df7fc..1a2d8a50dac4 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -985,7 +985,7 @@ void xen_enable_syscall(void)
#endif /* CONFIG_X86_64 */
}
-void __init xen_pvmmu_arch_setup(void)
+static void __init xen_pvmmu_arch_setup(void)
{
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 7d1c4fcbe8f7..1ba601df3a37 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -38,7 +38,7 @@ SYM_CODE_START(startup_xen)
#ifdef CONFIG_X86_64
mov initial_stack(%rip), %rsp
#else
- mov pa(initial_stack), %esp
+ mov initial_stack, %esp
#endif
#ifdef CONFIG_X86_64
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index de229424b659..3a9f1e80394a 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -122,7 +122,7 @@ config XTENSA_VARIANT_CUSTOM_NAME
help
Provide the name of a custom Xtensa processor variant.
This CORENAME selects arch/xtensa/variant/CORENAME.
- Dont forget you have to select MMU if you have one.
+ Don't forget you have to select MMU if you have one.
config XTENSA_VARIANT_NAME
string
diff --git a/arch/xtensa/boot/Makefile b/arch/xtensa/boot/Makefile
index efb91bfda2b4..1a14d38d9b33 100644
--- a/arch/xtensa/boot/Makefile
+++ b/arch/xtensa/boot/Makefile
@@ -14,7 +14,6 @@ HOSTFLAGS += -Iarch/$(ARCH)/boot/include
BIG_ENDIAN := $(shell echo __XTENSA_EB__ | $(CC) -E - | grep -v "\#")
-export ccflags-y
export BIG_ENDIAN
subdir-y := lib
diff --git a/arch/xtensa/include/asm/page.h b/arch/xtensa/include/asm/page.h
index f4771c29c7e9..37ce25ef92d6 100644
--- a/arch/xtensa/include/asm/page.h
+++ b/arch/xtensa/include/asm/page.h
@@ -203,8 +203,5 @@ static inline unsigned long ___pa(unsigned long va)
#endif /* __ASSEMBLY__ */
-#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \
- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
-
#include <asm-generic/memory_model.h>
#endif /* _XTENSA_PAGE_H */
diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h
index 27ac17c9da09..8be0c0568c50 100644
--- a/arch/xtensa/include/asm/pgtable.h
+++ b/arch/xtensa/include/asm/pgtable.h
@@ -266,7 +266,6 @@ static inline void paging_init(void) { }
static inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_WRITABLE; }
static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; }
static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; }
-static inline int pte_special(pte_t pte) { return 0; }
static inline pte_t pte_wrprotect(pte_t pte)
{ pte_val(pte) &= ~(_PAGE_WRITABLE | _PAGE_HW_WRITE); return pte; }
@@ -280,8 +279,6 @@ static inline pte_t pte_mkyoung(pte_t pte)
{ pte_val(pte) |= _PAGE_ACCESSED; return pte; }
static inline pte_t pte_mkwrite(pte_t pte)
{ pte_val(pte) |= _PAGE_WRITABLE; return pte; }
-static inline pte_t pte_mkspecial(pte_t pte)
- { return pte; }
#define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) & ~_PAGE_CA_MASK))
diff --git a/arch/xtensa/kernel/smp.c b/arch/xtensa/kernel/smp.c
index 83b244ce61ee..cd85a7a2722b 100644
--- a/arch/xtensa/kernel/smp.c
+++ b/arch/xtensa/kernel/smp.c
@@ -53,16 +53,12 @@ static void system_flush_invalidate_dcache_range(unsigned long start,
#define IPI_IRQ 0
static irqreturn_t ipi_interrupt(int irq, void *dev_id);
-static struct irqaction ipi_irqaction = {
- .handler = ipi_interrupt,
- .flags = IRQF_PERCPU,
- .name = "ipi",
-};
void ipi_init(void)
{
unsigned irq = irq_create_mapping(NULL, IPI_IRQ);
- setup_irq(irq, &ipi_irqaction);
+ if (request_irq(irq, ipi_interrupt, IRQF_PERCPU, "ipi", NULL))
+ pr_err("Failed to request irq %u (ipi)\n", irq);
}
static inline unsigned int get_core_count(void)
diff --git a/arch/xtensa/kernel/syscalls/syscallhdr.sh b/arch/xtensa/kernel/syscalls/syscallhdr.sh
index d37db641ca31..eebfb8a8ace6 100644
--- a/arch/xtensa/kernel/syscalls/syscallhdr.sh
+++ b/arch/xtensa/kernel/syscalls/syscallhdr.sh
@@ -32,5 +32,5 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
printf "#define __NR_syscalls\t%s\n" "${nxt}"
printf "#endif\n"
printf "\n"
- printf "#endif /* %s */" "${fileguard}"
+ printf "#endif /* %s */\n" "${fileguard}"
) > "$out"
diff --git a/arch/xtensa/kernel/time.c b/arch/xtensa/kernel/time.c
index 69db8c93c1f9..77971fe4cc95 100644
--- a/arch/xtensa/kernel/time.c
+++ b/arch/xtensa/kernel/time.c
@@ -128,12 +128,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id)
return IRQ_HANDLED;
}
-static struct irqaction timer_irqaction = {
- .handler = timer_interrupt,
- .flags = IRQF_TIMER,
- .name = "timer",
-};
-
void local_timer_setup(unsigned cpu)
{
struct ccount_timer *timer = &per_cpu(ccount_timer, cpu);
@@ -184,6 +178,8 @@ static inline void calibrate_ccount(void)
void __init time_init(void)
{
+ int irq;
+
of_clk_init(NULL);
#ifdef CONFIG_XTENSA_CALIBRATE_CCOUNT
pr_info("Calibrating CPU frequency ");
@@ -199,7 +195,9 @@ void __init time_init(void)
__func__);
clocksource_register_hz(&ccount_clocksource, ccount_freq);
local_timer_setup(0);
- setup_irq(this_cpu_ptr(&ccount_timer)->evt.irq, &timer_irqaction);
+ irq = this_cpu_ptr(&ccount_timer)->evt.irq;
+ if (request_irq(irq, timer_interrupt, IRQF_TIMER, "timer", NULL))
+ pr_err("Failed to request irq %d (timer)\n", irq);
sched_clock_register(ccount_sched_clock_read, 32, ccount_freq);
timer_probe();
}
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index c15a26096038..c5dc833212e1 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -883,8 +883,8 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css)
/* this prevents anyone from attaching or migrating to this blkcg */
wb_blkcg_offline(blkcg);
- /* put the base cgwb reference allowing step 2 to be triggered */
- blkcg_cgwb_put(blkcg);
+ /* put the base online pin allowing step 2 to be triggered */
+ blkcg_unpin_online(blkcg);
}
/**
@@ -983,11 +983,11 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
}
spin_lock_init(&blkcg->lock);
+ refcount_set(&blkcg->online_pin, 1);
INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN);
INIT_HLIST_HEAD(&blkcg->blkg_list);
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&blkcg->cgwb_list);
- refcount_set(&blkcg->cgwb_refcnt, 1);
#endif
list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);
@@ -1006,6 +1006,21 @@ unlock:
return ret;
}
+static int blkcg_css_online(struct cgroup_subsys_state *css)
+{
+ struct blkcg *blkcg = css_to_blkcg(css);
+ struct blkcg *parent = blkcg_parent(blkcg);
+
+ /*
+ * blkcg_pin_online() is used to delay blkcg offline so that blkgs
+ * don't go offline while cgwbs are still active on them. Pin the
+ * parent so that offline always happens towards the root.
+ */
+ if (parent)
+ blkcg_pin_online(parent);
+ return 0;
+}
+
/**
* blkcg_init_queue - initialize blkcg part of request queue
* @q: request_queue to initialize
@@ -1199,6 +1214,7 @@ static void blkcg_exit(struct task_struct *tsk)
struct cgroup_subsys io_cgrp_subsys = {
.css_alloc = blkcg_css_alloc,
+ .css_online = blkcg_css_online,
.css_offline = blkcg_css_offline,
.css_free = blkcg_css_free,
.can_attach = blkcg_can_attach,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f6291ceedee4..8e56884fd2e9 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1289,7 +1289,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
* the driver there was more coming, but that turned out to
* be a lie.
*/
- if (q->mq_ops->commit_rqs)
+ if (q->mq_ops->commit_rqs && queued)
q->mq_ops->commit_rqs(hctx);
spin_lock(&hctx->lock);
@@ -1911,6 +1911,8 @@ blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
struct list_head *list)
{
+ int queued = 0;
+
while (!list_empty(list)) {
blk_status_t ret;
struct request *rq = list_first_entry(list, struct request,
@@ -1926,7 +1928,8 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
break;
}
blk_mq_end_request(rq, ret);
- }
+ } else
+ queued++;
}
/*
@@ -1934,7 +1937,7 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
* the driver there was more coming, but that turned out to
* be a lie.
*/
- if (!list_empty(list) && hctx->queue->mq_ops->commit_rqs)
+ if (!list_empty(list) && hctx->queue->mq_ops->commit_rqs && queued)
hctx->queue->mq_ops->commit_rqs(hctx);
}
diff --git a/block/partitions/core.c b/block/partitions/core.c
index b79c4513629b..bc1ded1331b1 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -496,7 +496,7 @@ int blk_drop_partitions(struct gendisk *disk, struct block_device *bdev)
if (!disk_part_scan_enabled(disk))
return 0;
- if (bdev->bd_part_count || bdev->bd_super)
+ if (bdev->bd_part_count || bdev->bd_openers > 1)
return -EBUSY;
res = invalidate_partition(disk, 0);
if (res)
diff --git a/drivers/Kconfig b/drivers/Kconfig
index 74920bda3fda..dcecc9f6e33f 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -138,6 +138,10 @@ source "drivers/virt/Kconfig"
source "drivers/virtio/Kconfig"
+source "drivers/vdpa/Kconfig"
+
+source "drivers/vhost/Kconfig"
+
source "drivers/hv/Kconfig"
source "drivers/xen/Kconfig"
diff --git a/drivers/Makefile b/drivers/Makefile
index 7646549a1e93..c0cd1b9075e3 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -42,6 +42,7 @@ obj-$(CONFIG_DMADEVICES) += dma/
obj-y += soc/
obj-$(CONFIG_VIRTIO) += virtio/
+obj-$(CONFIG_VDPA) += vdpa/
obj-$(CONFIG_XEN) += xen/
# regulators early, since some subsystems rely on them to initialize
diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index ed3d2d1a7ae9..7d04424189df 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -1015,6 +1015,7 @@ const struct iommu_ops *iort_iommu_configure(struct device *dev)
return ops;
if (dev_is_pci(dev)) {
+ struct iommu_fwspec *fwspec;
struct pci_bus *bus = to_pci_dev(dev)->bus;
struct iort_pci_alias_info info = { .dev = dev };
@@ -1027,8 +1028,9 @@ const struct iommu_ops *iort_iommu_configure(struct device *dev)
err = pci_for_each_dma_alias(to_pci_dev(dev),
iort_pci_iommu_init, &info);
- if (!err && iort_pci_rc_supports_ats(node))
- dev->iommu_fwspec->flags |= IOMMU_FWSPEC_PCI_RC_ATS;
+ fwspec = dev_iommu_fwspec_get(dev);
+ if (fwspec && iort_pci_rc_supports_ats(node))
+ fwspec->flags |= IOMMU_FWSPEC_PCI_RC_ATS;
} else {
int i = 0;
diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c
index 4816df520f72..b4c0152e92aa 100644
--- a/drivers/acpi/ec.c
+++ b/drivers/acpi/ec.c
@@ -1589,8 +1589,8 @@ static int acpi_ec_add(struct acpi_device *device)
strcpy(acpi_device_name(device), ACPI_EC_DEVICE_NAME);
strcpy(acpi_device_class(device), ACPI_EC_CLASS);
- if ((boot_ec && boot_ec->handle == device->handle) ||
- !strcmp(acpi_device_hid(device), ACPI_ECDT_HID)) {
+ if (boot_ec && (boot_ec->handle == device->handle ||
+ !strcmp(acpi_device_hid(device), ACPI_ECDT_HID))) {
/* Fast path: this device corresponds to the boot EC. */
ec = boot_ec;
} else {
diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index a3320f93616d..fa4500f9cfd1 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -360,7 +360,7 @@ static union acpi_object *acpi_label_info(acpi_handle handle)
static u8 nfit_dsm_revid(unsigned family, unsigned func)
{
- static const u8 revid_table[NVDIMM_FAMILY_MAX+1][32] = {
+ static const u8 revid_table[NVDIMM_FAMILY_MAX+1][NVDIMM_CMD_MAX+1] = {
[NVDIMM_FAMILY_INTEL] = {
[NVDIMM_INTEL_GET_MODES] = 2,
[NVDIMM_INTEL_GET_FWINFO] = 2,
@@ -386,7 +386,7 @@ static u8 nfit_dsm_revid(unsigned family, unsigned func)
if (family > NVDIMM_FAMILY_MAX)
return 0;
- if (func > 31)
+ if (func > NVDIMM_CMD_MAX)
return 0;
id = revid_table[family][func];
if (id == 0)
@@ -492,7 +492,8 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
* Check for a valid command. For ND_CMD_CALL, we also have to
* make sure that the DSM function is supported.
*/
- if (cmd == ND_CMD_CALL && !test_bit(func, &dsm_mask))
+ if (cmd == ND_CMD_CALL &&
+ (func > NVDIMM_CMD_MAX || !test_bit(func, &dsm_mask)))
return -ENOTTY;
else if (!test_bit(cmd, &cmd_mask))
return -ENOTTY;
@@ -2026,8 +2027,10 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
continue;
}
- if (nfit_mem->bdw && nfit_mem->memdev_pmem)
+ if (nfit_mem->bdw && nfit_mem->memdev_pmem) {
set_bit(NDD_ALIASING, &flags);
+ set_bit(NDD_LABELING, &flags);
+ }
/* collate flags across all memdevs for this dimm */
list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
@@ -3492,7 +3495,8 @@ static int acpi_nfit_clear_to_send(struct nvdimm_bus_descriptor *nd_desc,
if (nvdimm && cmd == ND_CMD_CALL &&
call_pkg->nd_family == NVDIMM_FAMILY_INTEL) {
func = call_pkg->nd_command;
- if ((1 << func) & NVDIMM_INTEL_SECURITY_CMDMASK)
+ if (func > NVDIMM_CMD_MAX ||
+ (1 << func) & NVDIMM_INTEL_SECURITY_CMDMASK)
return -EOPNOTSUPP;
}
diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h
index 24241941181c..f5525f8bb770 100644
--- a/drivers/acpi/nfit/nfit.h
+++ b/drivers/acpi/nfit/nfit.h
@@ -34,6 +34,7 @@
| ACPI_NFIT_MEM_NOT_ARMED | ACPI_NFIT_MEM_MAP_FAILED)
#define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_HYPERV
+#define NVDIMM_CMD_MAX 31
#define NVDIMM_STANDARD_CMDMASK \
(1 << ND_CMD_SMART | 1 << ND_CMD_SMART_THRESHOLD | 1 << ND_CMD_DIMM_FLAGS \
@@ -144,32 +145,32 @@ struct nfit_spa {
unsigned long ars_state;
u32 clear_err_unit;
u32 max_ars;
- struct acpi_nfit_system_address spa[0];
+ struct acpi_nfit_system_address spa[];
};
struct nfit_dcr {
struct list_head list;
- struct acpi_nfit_control_region dcr[0];
+ struct acpi_nfit_control_region dcr[];
};
struct nfit_bdw {
struct list_head list;
- struct acpi_nfit_data_region bdw[0];
+ struct acpi_nfit_data_region bdw[];
};
struct nfit_idt {
struct list_head list;
- struct acpi_nfit_interleave idt[0];
+ struct acpi_nfit_interleave idt[];
};
struct nfit_flush {
struct list_head list;
- struct acpi_nfit_flush_address flush[0];
+ struct acpi_nfit_flush_address flush[];
};
struct nfit_memdev {
struct list_head list;
- struct acpi_nfit_memory_map memdev[0];
+ struct acpi_nfit_memory_map memdev[];
};
enum nfit_mem_flags {
diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c
index eadbf90e65d1..47b4969d9b93 100644
--- a/drivers/acpi/numa/srat.c
+++ b/drivers/acpi/numa/srat.c
@@ -72,47 +72,6 @@ int acpi_map_pxm_to_node(int pxm)
}
EXPORT_SYMBOL(acpi_map_pxm_to_node);
-/**
- * acpi_map_pxm_to_online_node - Map proximity ID to online node
- * @pxm: ACPI proximity ID
- *
- * This is similar to acpi_map_pxm_to_node(), but always returns an online
- * node. When the mapped node from a given proximity ID is offline, it
- * looks up the node distance table and returns the nearest online node.
- *
- * ACPI device drivers, which are called after the NUMA initialization has
- * completed in the kernel, can call this interface to obtain their device
- * NUMA topology from ACPI tables. Such drivers do not have to deal with
- * offline nodes. A node may be offline when a device proximity ID is
- * unique, SRAT memory entry does not exist, or NUMA is disabled, ex.
- * "numa=off" on x86.
- */
-int acpi_map_pxm_to_online_node(int pxm)
-{
- int node, min_node;
-
- node = acpi_map_pxm_to_node(pxm);
-
- if (node == NUMA_NO_NODE)
- node = 0;
-
- min_node = node;
- if (!node_online(node)) {
- int min_dist = INT_MAX, dist, n;
-
- for_each_online_node(n) {
- dist = node_distance(node, n);
- if (dist < min_dist) {
- min_dist = dist;
- min_node = n;
- }
- }
- }
-
- return min_node;
-}
-EXPORT_SYMBOL(acpi_map_pxm_to_online_node);
-
static void __init
acpi_table_print_srat_entry(struct acpi_subtable_header *header)
{
diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index ad0185c8dcee..0101b65250cb 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -410,6 +410,7 @@ static const struct pci_device_id ahci_pci_tbl[] = {
{ PCI_VDEVICE(INTEL, 0x22a3), board_ahci_mobile }, /* Cherry Tr. AHCI */
{ PCI_VDEVICE(INTEL, 0x5ae3), board_ahci_mobile }, /* ApolloLake AHCI */
{ PCI_VDEVICE(INTEL, 0x34d3), board_ahci_mobile }, /* Ice Lake LP AHCI */
+ { PCI_VDEVICE(INTEL, 0x02d7), board_ahci_mobile }, /* Comet Lake PCH RAID */
/* JMicron 360/1/3/5/6, match class to avoid IDE function */
{ PCI_VENDOR_ID_JMICRON, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID,
@@ -1495,7 +1496,7 @@ static irqreturn_t ahci_thunderx_irq_handler(int irq, void *dev_instance)
static void ahci_remap_check(struct pci_dev *pdev, int bar,
struct ahci_host_priv *hpriv)
{
- int i, count = 0;
+ int i;
u32 cap;
/*
@@ -1516,13 +1517,14 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar,
continue;
/* We've found a remapped device */
- count++;
+ hpriv->remapped_nvme++;
}
- if (!count)
+ if (!hpriv->remapped_nvme)
return;
- dev_warn(&pdev->dev, "Found %d remapped NVMe devices.\n", count);
+ dev_warn(&pdev->dev, "Found %u remapped NVMe devices.\n",
+ hpriv->remapped_nvme);
dev_warn(&pdev->dev,
"Switch your BIOS from RAID to AHCI mode to use them.\n");
@@ -1642,6 +1644,18 @@ static void ahci_intel_pcs_quirk(struct pci_dev *pdev, struct ahci_host_priv *hp
}
}
+static ssize_t remapped_nvme_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ata_host *host = dev_get_drvdata(dev);
+ struct ahci_host_priv *hpriv = host->private_data;
+
+ return sprintf(buf, "%u\n", hpriv->remapped_nvme);
+}
+
+static DEVICE_ATTR_RO(remapped_nvme);
+
static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
{
unsigned int board_id = ent->driver_data;
@@ -1745,6 +1759,10 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
/* detect remapped nvme devices */
ahci_remap_check(pdev, ahci_pci_bar, hpriv);
+ sysfs_add_file_to_group(&pdev->dev.kobj,
+ &dev_attr_remapped_nvme.attr,
+ NULL);
+
/* must set flag prior to save config in order to take effect */
if (ahci_broken_devslp(pdev))
hpriv->flags |= AHCI_HFLAG_NO_DEVSLP;
@@ -1896,6 +1914,9 @@ static void ahci_shutdown_one(struct pci_dev *pdev)
static void ahci_remove_one(struct pci_dev *pdev)
{
+ sysfs_remove_file_from_group(&pdev->dev.kobj,
+ &dev_attr_remapped_nvme.attr,
+ NULL);
pm_runtime_get_noresume(&pdev->dev);
ata_pci_remove_one(pdev);
}
diff --git a/drivers/ata/ahci.h b/drivers/ata/ahci.h
index 3dbf398c92ea..d991dd46e89c 100644
--- a/drivers/ata/ahci.h
+++ b/drivers/ata/ahci.h
@@ -336,6 +336,7 @@ struct ahci_host_priv {
u32 em_loc; /* enclosure management location */
u32 em_buf_sz; /* EM buffer size in byte */
u32 em_msg_type; /* EM message type */
+ u32 remapped_nvme; /* NVMe remapped device count */
bool got_runtime_pm; /* Did we do pm_runtime_get? */
struct clk *clks[AHCI_MAX_CLKS]; /* Optional */
struct reset_control *rsts; /* Optional */
diff --git a/drivers/ata/ahci_imx.c b/drivers/ata/ahci_imx.c
index 948d2c6557f3..388baf528fa8 100644
--- a/drivers/ata/ahci_imx.c
+++ b/drivers/ata/ahci_imx.c
@@ -782,7 +782,7 @@ static int ahci_imx_softreset(struct ata_link *link, unsigned int *class,
struct ata_host *host = dev_get_drvdata(ap->dev);
struct ahci_host_priv *hpriv = host->private_data;
struct imx_ahci_priv *imxpriv = hpriv->plat_data;
- int ret = -EIO;
+ int ret;
if (imxpriv->type == AHCI_IMX53)
ret = ahci_pmp_retry_srst_ops.softreset(link, class, deadline);
diff --git a/drivers/ata/libata-pmp.c b/drivers/ata/libata-pmp.c
index 3ff14071617c..79f2aeeb482a 100644
--- a/drivers/ata/libata-pmp.c
+++ b/drivers/ata/libata-pmp.c
@@ -763,6 +763,7 @@ static int sata_pmp_eh_recover_pmp(struct ata_port *ap,
if (dev->flags & ATA_DFLAG_DETACH) {
detach = 1;
+ rc = -ENODEV;
goto fail;
}
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 4086718f6876..dbec3a05590a 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -27,6 +27,24 @@
#define MEMORY_CLASS_NAME "memory"
+static const char *const online_type_to_str[] = {
+ [MMOP_OFFLINE] = "offline",
+ [MMOP_ONLINE] = "online",
+ [MMOP_ONLINE_KERNEL] = "online_kernel",
+ [MMOP_ONLINE_MOVABLE] = "online_movable",
+};
+
+int memhp_online_type_from_str(const char *str)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) {
+ if (sysfs_streq(str, online_type_to_str[i]))
+ return i;
+ }
+ return -EINVAL;
+}
+
#define to_memory_block(dev) container_of(dev, struct memory_block, dev)
static int sections_per_block;
@@ -145,45 +163,6 @@ int memory_notify(unsigned long val, void *v)
}
/*
- * The probe routines leave the pages uninitialized, just as the bootmem code
- * does. Make sure we do not access them, but instead use only information from
- * within sections.
- */
-static bool pages_correctly_probed(unsigned long start_pfn)
-{
- unsigned long section_nr = pfn_to_section_nr(start_pfn);
- unsigned long section_nr_end = section_nr + sections_per_block;
- unsigned long pfn = start_pfn;
-
- /*
- * memmap between sections is not contiguous except with
- * SPARSEMEM_VMEMMAP. We lookup the page once per section
- * and assume memmap is contiguous within each section
- */
- for (; section_nr < section_nr_end; section_nr++) {
- if (WARN_ON_ONCE(!pfn_valid(pfn)))
- return false;
-
- if (!present_section_nr(section_nr)) {
- pr_warn("section %ld pfn[%lx, %lx) not present\n",
- section_nr, pfn, pfn + PAGES_PER_SECTION);
- return false;
- } else if (!valid_section_nr(section_nr)) {
- pr_warn("section %ld pfn[%lx, %lx) no valid memmap\n",
- section_nr, pfn, pfn + PAGES_PER_SECTION);
- return false;
- } else if (online_section_nr(section_nr)) {
- pr_warn("section %ld pfn[%lx, %lx) is already online\n",
- section_nr, pfn, pfn + PAGES_PER_SECTION);
- return false;
- }
- pfn += PAGES_PER_SECTION;
- }
-
- return true;
-}
-
-/*
* MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
* OK to have direct references to sparsemem variables in here.
*/
@@ -199,9 +178,6 @@ memory_block_action(unsigned long start_section_nr, unsigned long action,
switch (action) {
case MEM_ONLINE:
- if (!pages_correctly_probed(start_pfn))
- return -EBUSY;
-
ret = online_pages(start_pfn, nr_pages, online_type, nid);
break;
case MEM_OFFLINE:
@@ -245,17 +221,14 @@ static int memory_subsys_online(struct device *dev)
return 0;
/*
- * If we are called from state_store(), online_type will be
- * set >= 0 Otherwise we were called from the device online
- * attribute and need to set the online_type.
+ * When called via device_online() without configuring the online_type,
+ * we want to default to MMOP_ONLINE.
*/
- if (mem->online_type < 0)
- mem->online_type = MMOP_ONLINE_KEEP;
+ if (mem->online_type == MMOP_OFFLINE)
+ mem->online_type = MMOP_ONLINE;
ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
-
- /* clear online_type */
- mem->online_type = -1;
+ mem->online_type = MMOP_OFFLINE;
return ret;
}
@@ -267,40 +240,27 @@ static int memory_subsys_offline(struct device *dev)
if (mem->state == MEM_OFFLINE)
return 0;
- /* Can't offline block with non-present sections */
- if (mem->section_count != sections_per_block)
- return -EINVAL;
-
return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
}
static ssize_t state_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
+ const int online_type = memhp_online_type_from_str(buf);
struct memory_block *mem = to_memory_block(dev);
- int ret, online_type;
+ int ret;
+
+ if (online_type < 0)
+ return -EINVAL;
ret = lock_device_hotplug_sysfs();
if (ret)
return ret;
- if (sysfs_streq(buf, "online_kernel"))
- online_type = MMOP_ONLINE_KERNEL;
- else if (sysfs_streq(buf, "online_movable"))
- online_type = MMOP_ONLINE_MOVABLE;
- else if (sysfs_streq(buf, "online"))
- online_type = MMOP_ONLINE_KEEP;
- else if (sysfs_streq(buf, "offline"))
- online_type = MMOP_OFFLINE;
- else {
- ret = -EINVAL;
- goto err;
- }
-
switch (online_type) {
case MMOP_ONLINE_KERNEL:
case MMOP_ONLINE_MOVABLE:
- case MMOP_ONLINE_KEEP:
+ case MMOP_ONLINE:
/* mem->online_type is protected by device_hotplug_lock */
mem->online_type = online_type;
ret = device_online(&mem->dev);
@@ -312,7 +272,6 @@ static ssize_t state_store(struct device *dev, struct device_attribute *attr,
ret = -EINVAL; /* should never happen */
}
-err:
unlock_device_hotplug();
if (ret < 0)
@@ -380,7 +339,8 @@ static ssize_t valid_zones_show(struct device *dev,
}
nid = mem->nid;
- default_zone = zone_for_pfn_range(MMOP_ONLINE_KEEP, nid, start_pfn, nr_pages);
+ default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, start_pfn,
+ nr_pages);
strcat(buf, default_zone->name);
print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL,
@@ -418,23 +378,20 @@ static DEVICE_ATTR_RO(block_size_bytes);
static ssize_t auto_online_blocks_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- if (memhp_auto_online)
- return sprintf(buf, "online\n");
- else
- return sprintf(buf, "offline\n");
+ return sprintf(buf, "%s\n",
+ online_type_to_str[memhp_default_online_type]);
}
static ssize_t auto_online_blocks_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
- if (sysfs_streq(buf, "online"))
- memhp_auto_online = true;
- else if (sysfs_streq(buf, "offline"))
- memhp_auto_online = false;
- else
+ const int online_type = memhp_online_type_from_str(buf);
+
+ if (online_type < 0)
return -EINVAL;
+ memhp_default_online_type = online_type;
return count;
}
@@ -627,7 +584,7 @@ static int init_memory_block(struct memory_block **memory,
static int add_memory_block(unsigned long base_section_nr)
{
- int ret, section_count = 0;
+ int section_count = 0;
struct memory_block *mem;
unsigned long nr;
@@ -638,12 +595,8 @@ static int add_memory_block(unsigned long base_section_nr)
if (section_count == 0)
return 0;
- ret = init_memory_block(&mem, base_memory_block_id(base_section_nr),
- MEM_ONLINE);
- if (ret)
- return ret;
- mem->section_count = section_count;
- return 0;
+ return init_memory_block(&mem, base_memory_block_id(base_section_nr),
+ MEM_ONLINE);
}
static void unregister_memory(struct memory_block *memory)
@@ -679,7 +632,6 @@ int create_memory_block_devices(unsigned long start, unsigned long size)
ret = init_memory_block(&mem, block_id, MEM_OFFLINE);
if (ret)
break;
- mem->section_count = sections_per_block;
}
if (ret) {
end_block_id = block_id;
@@ -688,7 +640,6 @@ int create_memory_block_devices(unsigned long start, unsigned long size)
mem = find_memory_block_by_id(block_id);
if (WARN_ON_ONCE(!mem))
continue;
- mem->section_count = 0;
unregister_memory(mem);
}
}
@@ -717,7 +668,6 @@ void remove_memory_block_devices(unsigned long start, unsigned long size)
mem = find_memory_block_by_id(block_id);
if (WARN_ON_ONCE(!mem))
continue;
- mem->section_count = 0;
unregister_memory_block_under_nodes(mem);
unregister_memory(mem);
}
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index a42c49e04954..da693e6a834e 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -429,11 +429,12 @@ static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos,
* information.
*/
struct file *file = lo->lo_backing_file;
+ struct request_queue *q = lo->lo_queue;
int ret;
mode |= FALLOC_FL_KEEP_SIZE;
- if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) {
+ if (!blk_queue_discard(q)) {
ret = -EOPNOTSUPP;
goto out;
}
@@ -463,7 +464,7 @@ static void lo_complete_rq(struct request *rq)
if (!cmd->use_aio || cmd->ret < 0 || cmd->ret == blk_rq_bytes(rq) ||
req_op(rq) != REQ_OP_READ) {
if (cmd->ret < 0)
- ret = BLK_STS_IOERR;
+ ret = errno_to_blk_status(cmd->ret);
goto end_io;
}
@@ -868,27 +869,46 @@ static void loop_config_discard(struct loop_device *lo)
struct request_queue *q = lo->lo_queue;
/*
+ * If the backing device is a block device, mirror its zeroing
+ * capability. Set the discard sectors to the block device's zeroing
+ * capabilities because loop discards result in blkdev_issue_zeroout(),
+ * not blkdev_issue_discard(). This maintains consistent behavior with
+ * file-backed loop devices: discarded regions read back as zero.
+ */
+ if (S_ISBLK(inode->i_mode) && !lo->lo_encrypt_key_size) {
+ struct request_queue *backingq;
+
+ backingq = bdev_get_queue(inode->i_bdev);
+ blk_queue_max_discard_sectors(q,
+ backingq->limits.max_write_zeroes_sectors);
+
+ blk_queue_max_write_zeroes_sectors(q,
+ backingq->limits.max_write_zeroes_sectors);
+
+ /*
* We use punch hole to reclaim the free space used by the
* image a.k.a. discard. However we do not support discard if
* encryption is enabled, because it may give an attacker
* useful information.
*/
- if ((!file->f_op->fallocate) ||
- lo->lo_encrypt_key_size) {
+ } else if (!file->f_op->fallocate || lo->lo_encrypt_key_size) {
q->limits.discard_granularity = 0;
q->limits.discard_alignment = 0;
blk_queue_max_discard_sectors(q, 0);
blk_queue_max_write_zeroes_sectors(q, 0);
- blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);
- return;
- }
- q->limits.discard_granularity = inode->i_sb->s_blocksize;
- q->limits.discard_alignment = 0;
+ } else {
+ q->limits.discard_granularity = inode->i_sb->s_blocksize;
+ q->limits.discard_alignment = 0;
+
+ blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
+ blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
+ }
- blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
- blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
- blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
+ if (q->limits.max_write_zeroes_sectors)
+ blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
+ else
+ blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);
}
static void loop_unprepare_queue(struct loop_device *lo)
@@ -1955,7 +1975,10 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
failed:
/* complete non-aio request */
if (!cmd->use_aio || ret) {
- cmd->ret = ret ? -EIO : 0;
+ if (ret == -EOPNOTSUPP)
+ cmd->ret = ret;
+ else
+ cmd->ret = ret ? -EIO : 0;
blk_mq_complete_request(rq);
}
}
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 6343402c09e6..1e0a6b19ae0d 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -337,10 +337,7 @@ struct rbd_img_request {
u64 snap_id; /* for reads */
struct ceph_snap_context *snapc; /* for writes */
};
- union {
- struct request *rq; /* block request */
- struct rbd_obj_request *obj_request; /* obj req initiator */
- };
+ struct rbd_obj_request *obj_request; /* obj req initiator */
struct list_head lock_item;
struct list_head object_extents; /* obj_req.ex structs */
@@ -349,7 +346,6 @@ struct rbd_img_request {
struct pending_result pending;
struct work_struct work;
int work_result;
- struct kref kref;
};
#define for_each_obj_request(ireq, oreq) \
@@ -1320,15 +1316,6 @@ static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
kref_put(&obj_request->kref, rbd_obj_request_destroy);
}
-static void rbd_img_request_destroy(struct kref *kref);
-static void rbd_img_request_put(struct rbd_img_request *img_request)
-{
- rbd_assert(img_request != NULL);
- dout("%s: img %p (was %d)\n", __func__, img_request,
- kref_read(&img_request->kref));
- kref_put(&img_request->kref, rbd_img_request_destroy);
-}
-
static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
struct rbd_obj_request *obj_request)
{
@@ -1366,18 +1353,10 @@ static void rbd_osd_submit(struct ceph_osd_request *osd_req)
static void img_request_layered_set(struct rbd_img_request *img_request)
{
set_bit(IMG_REQ_LAYERED, &img_request->flags);
- smp_mb();
-}
-
-static void img_request_layered_clear(struct rbd_img_request *img_request)
-{
- clear_bit(IMG_REQ_LAYERED, &img_request->flags);
- smp_mb();
}
static bool img_request_layered_test(struct rbd_img_request *img_request)
{
- smp_mb();
return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
}
@@ -1619,10 +1598,8 @@ static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
if (!rbd_dev->parent_spec)
return false;
- down_read(&rbd_dev->header_rwsem);
if (rbd_dev->parent_overlap)
counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
- up_read(&rbd_dev->header_rwsem);
if (counter < 0)
rbd_warn(rbd_dev, "parent reference overflow");
@@ -1630,63 +1607,54 @@ static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
return counter > 0;
}
-/*
- * Caller is responsible for filling in the list of object requests
- * that comprises the image request, and the Linux request pointer
- * (if there is one).
- */
-static struct rbd_img_request *rbd_img_request_create(
- struct rbd_device *rbd_dev,
- enum obj_operation_type op_type,
- struct ceph_snap_context *snapc)
+static void rbd_img_request_init(struct rbd_img_request *img_request,
+ struct rbd_device *rbd_dev,
+ enum obj_operation_type op_type)
{
- struct rbd_img_request *img_request;
-
- img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO);
- if (!img_request)
- return NULL;
+ memset(img_request, 0, sizeof(*img_request));
img_request->rbd_dev = rbd_dev;
img_request->op_type = op_type;
- if (!rbd_img_is_write(img_request))
- img_request->snap_id = rbd_dev->spec->snap_id;
- else
- img_request->snapc = snapc;
-
- if (rbd_dev_parent_get(rbd_dev))
- img_request_layered_set(img_request);
INIT_LIST_HEAD(&img_request->lock_item);
INIT_LIST_HEAD(&img_request->object_extents);
mutex_init(&img_request->state_mutex);
- kref_init(&img_request->kref);
+}
+
+static void rbd_img_capture_header(struct rbd_img_request *img_req)
+{
+ struct rbd_device *rbd_dev = img_req->rbd_dev;
+
+ lockdep_assert_held(&rbd_dev->header_rwsem);
- return img_request;
+ if (rbd_img_is_write(img_req))
+ img_req->snapc = ceph_get_snap_context(rbd_dev->header.snapc);
+ else
+ img_req->snap_id = rbd_dev->spec->snap_id;
+
+ if (rbd_dev_parent_get(rbd_dev))
+ img_request_layered_set(img_req);
}
-static void rbd_img_request_destroy(struct kref *kref)
+static void rbd_img_request_destroy(struct rbd_img_request *img_request)
{
- struct rbd_img_request *img_request;
struct rbd_obj_request *obj_request;
struct rbd_obj_request *next_obj_request;
- img_request = container_of(kref, struct rbd_img_request, kref);
-
dout("%s: img %p\n", __func__, img_request);
WARN_ON(!list_empty(&img_request->lock_item));
for_each_obj_request_safe(img_request, obj_request, next_obj_request)
rbd_img_obj_request_del(img_request, obj_request);
- if (img_request_layered_test(img_request)) {
- img_request_layered_clear(img_request);
+ if (img_request_layered_test(img_request))
rbd_dev_parent_put(img_request->rbd_dev);
- }
if (rbd_img_is_write(img_request))
ceph_put_snap_context(img_request->snapc);
- kmem_cache_free(rbd_img_request_cache, img_request);
+ if (test_bit(IMG_REQ_CHILD, &img_request->flags))
+ kmem_cache_free(rbd_img_request_cache, img_request);
}
#define BITS_PER_OBJ 2
@@ -2849,17 +2817,22 @@ static int rbd_obj_read_object(struct rbd_obj_request *obj_req)
static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
{
struct rbd_img_request *img_req = obj_req->img_request;
+ struct rbd_device *parent = img_req->rbd_dev->parent;
struct rbd_img_request *child_img_req;
int ret;
- child_img_req = rbd_img_request_create(img_req->rbd_dev->parent,
- OBJ_OP_READ, NULL);
+ child_img_req = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
if (!child_img_req)
return -ENOMEM;
+ rbd_img_request_init(child_img_req, parent, OBJ_OP_READ);
__set_bit(IMG_REQ_CHILD, &child_img_req->flags);
child_img_req->obj_request = obj_req;
+ down_read(&parent->header_rwsem);
+ rbd_img_capture_header(child_img_req);
+ up_read(&parent->header_rwsem);
+
dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req,
obj_req);
@@ -2888,7 +2861,7 @@ static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
obj_req->copyup_bvecs);
}
if (ret) {
- rbd_img_request_put(child_img_req);
+ rbd_img_request_destroy(child_img_req);
return ret;
}
@@ -3647,15 +3620,15 @@ again:
if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
struct rbd_obj_request *obj_req = img_req->obj_request;
- rbd_img_request_put(img_req);
+ rbd_img_request_destroy(img_req);
if (__rbd_obj_handle_request(obj_req, &result)) {
img_req = obj_req->img_request;
goto again;
}
} else {
- struct request *rq = img_req->rq;
+ struct request *rq = blk_mq_rq_from_pdu(img_req);
- rbd_img_request_put(img_req);
+ rbd_img_request_destroy(img_req);
blk_mq_end_request(rq, errno_to_blk_status(result));
}
}
@@ -4707,84 +4680,36 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
static void rbd_queue_workfn(struct work_struct *work)
{
- struct request *rq = blk_mq_rq_from_pdu(work);
- struct rbd_device *rbd_dev = rq->q->queuedata;
- struct rbd_img_request *img_request;
- struct ceph_snap_context *snapc = NULL;
+ struct rbd_img_request *img_request =
+ container_of(work, struct rbd_img_request, work);
+ struct rbd_device *rbd_dev = img_request->rbd_dev;
+ enum obj_operation_type op_type = img_request->op_type;
+ struct request *rq = blk_mq_rq_from_pdu(img_request);
u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
u64 length = blk_rq_bytes(rq);
- enum obj_operation_type op_type;
u64 mapping_size;
int result;
- switch (req_op(rq)) {
- case REQ_OP_DISCARD:
- op_type = OBJ_OP_DISCARD;
- break;
- case REQ_OP_WRITE_ZEROES:
- op_type = OBJ_OP_ZEROOUT;
- break;
- case REQ_OP_WRITE:
- op_type = OBJ_OP_WRITE;
- break;
- case REQ_OP_READ:
- op_type = OBJ_OP_READ;
- break;
- default:
- dout("%s: non-fs request type %d\n", __func__, req_op(rq));
- result = -EIO;
- goto err;
- }
-
/* Ignore/skip any zero-length requests */
-
if (!length) {
dout("%s: zero-length request\n", __func__);
result = 0;
- goto err_rq;
- }
-
- if (op_type != OBJ_OP_READ) {
- if (rbd_is_ro(rbd_dev)) {
- rbd_warn(rbd_dev, "%s on read-only mapping",
- obj_op_name(op_type));
- result = -EIO;
- goto err;
- }
- rbd_assert(!rbd_is_snap(rbd_dev));
- }
-
- if (offset && length > U64_MAX - offset + 1) {
- rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
- length);
- result = -EINVAL;
- goto err_rq; /* Shouldn't happen */
+ goto err_img_request;
}
blk_mq_start_request(rq);
down_read(&rbd_dev->header_rwsem);
mapping_size = rbd_dev->mapping.size;
- if (op_type != OBJ_OP_READ) {
- snapc = rbd_dev->header.snapc;
- ceph_get_snap_context(snapc);
- }
+ rbd_img_capture_header(img_request);
up_read(&rbd_dev->header_rwsem);
if (offset + length > mapping_size) {
rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
length, mapping_size);
result = -EIO;
- goto err_rq;
- }
-
- img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
- if (!img_request) {
- result = -ENOMEM;
- goto err_rq;
+ goto err_img_request;
}
- img_request->rq = rq;
- snapc = NULL; /* img_request consumes a ref */
dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev,
img_request, obj_op_name(op_type), offset, length);
@@ -4801,23 +4726,51 @@ static void rbd_queue_workfn(struct work_struct *work)
return;
err_img_request:
- rbd_img_request_put(img_request);
-err_rq:
+ rbd_img_request_destroy(img_request);
if (result)
rbd_warn(rbd_dev, "%s %llx at %llx result %d",
obj_op_name(op_type), length, offset, result);
- ceph_put_snap_context(snapc);
-err:
blk_mq_end_request(rq, errno_to_blk_status(result));
}
static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
- struct request *rq = bd->rq;
- struct work_struct *work = blk_mq_rq_to_pdu(rq);
+ struct rbd_device *rbd_dev = hctx->queue->queuedata;
+ struct rbd_img_request *img_req = blk_mq_rq_to_pdu(bd->rq);
+ enum obj_operation_type op_type;
- queue_work(rbd_wq, work);
+ switch (req_op(bd->rq)) {
+ case REQ_OP_DISCARD:
+ op_type = OBJ_OP_DISCARD;
+ break;
+ case REQ_OP_WRITE_ZEROES:
+ op_type = OBJ_OP_ZEROOUT;
+ break;
+ case REQ_OP_WRITE:
+ op_type = OBJ_OP_WRITE;
+ break;
+ case REQ_OP_READ:
+ op_type = OBJ_OP_READ;
+ break;
+ default:
+ rbd_warn(rbd_dev, "unknown req_op %d", req_op(bd->rq));
+ return BLK_STS_IOERR;
+ }
+
+ rbd_img_request_init(img_req, rbd_dev, op_type);
+
+ if (rbd_img_is_write(img_req)) {
+ if (rbd_is_ro(rbd_dev)) {
+ rbd_warn(rbd_dev, "%s on read-only mapping",
+ obj_op_name(img_req->op_type));
+ return BLK_STS_IOERR;
+ }
+ rbd_assert(!rbd_is_snap(rbd_dev));
+ }
+
+ INIT_WORK(&img_req->work, rbd_queue_workfn);
+ queue_work(rbd_wq, &img_req->work);
return BLK_STS_OK;
}
@@ -4984,18 +4937,8 @@ out:
return ret;
}
-static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
- unsigned int hctx_idx, unsigned int numa_node)
-{
- struct work_struct *work = blk_mq_rq_to_pdu(rq);
-
- INIT_WORK(work, rbd_queue_workfn);
- return 0;
-}
-
static const struct blk_mq_ops rbd_mq_ops = {
.queue_rq = rbd_queue_rq,
- .init_request = rbd_init_request,
};
static int rbd_init_disk(struct rbd_device *rbd_dev)
@@ -5027,8 +4970,8 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
- rbd_dev->tag_set.nr_hw_queues = 1;
- rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
+ rbd_dev->tag_set.nr_hw_queues = num_present_cpus();
+ rbd_dev->tag_set.cmd_size = sizeof(struct rbd_img_request);
err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
if (err)
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 915cf5b6388c..3b889ea950c2 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -47,6 +47,7 @@
#include <linux/bitmap.h>
#include <linux/list.h>
#include <linux/workqueue.h>
+#include <linux/sched/mm.h>
#include <xen/xen.h>
#include <xen/xenbus.h>
@@ -2189,10 +2190,12 @@ static void blkfront_setup_discard(struct blkfront_info *info)
static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo)
{
- unsigned int psegs, grants;
+ unsigned int psegs, grants, memflags;
int err, i;
struct blkfront_info *info = rinfo->dev_info;
+ memflags = memalloc_noio_save();
+
if (info->max_indirect_segments == 0) {
if (!HAS_EXTRA_REQ)
grants = BLKIF_MAX_SEGMENTS_PER_REQUEST;
@@ -2224,7 +2227,7 @@ static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo)
BUG_ON(!list_empty(&rinfo->indirect_pages));
for (i = 0; i < num; i++) {
- struct page *indirect_page = alloc_page(GFP_NOIO);
+ struct page *indirect_page = alloc_page(GFP_KERNEL);
if (!indirect_page)
goto out_of_memory;
list_add(&indirect_page->lru, &rinfo->indirect_pages);
@@ -2235,15 +2238,15 @@ static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo)
rinfo->shadow[i].grants_used =
kvcalloc(grants,
sizeof(rinfo->shadow[i].grants_used[0]),
- GFP_NOIO);
+ GFP_KERNEL);
rinfo->shadow[i].sg = kvcalloc(psegs,
sizeof(rinfo->shadow[i].sg[0]),
- GFP_NOIO);
+ GFP_KERNEL);
if (info->max_indirect_segments)
rinfo->shadow[i].indirect_grants =
kvcalloc(INDIRECT_GREFS(grants),
sizeof(rinfo->shadow[i].indirect_grants[0]),
- GFP_NOIO);
+ GFP_KERNEL);
if ((rinfo->shadow[i].grants_used == NULL) ||
(rinfo->shadow[i].sg == NULL) ||
(info->max_indirect_segments &&
@@ -2252,6 +2255,7 @@ static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo)
sg_init_table(rinfo->shadow[i].sg, psegs);
}
+ memalloc_noio_restore(memflags);
return 0;
@@ -2271,6 +2275,9 @@ out_of_memory:
__free_page(indirect_page);
}
}
+
+ memalloc_noio_restore(memflags);
+
return -ENOMEM;
}
diff --git a/drivers/char/hw_random/omap3-rom-rng.c b/drivers/char/hw_random/omap3-rom-rng.c
index a431c5cbe2be..e0d77fa048fb 100644
--- a/drivers/char/hw_random/omap3-rom-rng.c
+++ b/drivers/char/hw_random/omap3-rom-rng.c
@@ -4,7 +4,7 @@
* Copyright (C) 2009 Nokia Corporation
* Author: Juha Yrjola <juha.yrjola@solidboot.com>
*
- * Copyright (C) 2013 Pali Rohár <pali.rohar@gmail.com>
+ * Copyright (C) 2013 Pali Rohár <pali@kernel.org>
*
* This file is licensed under the terms of the GNU General Public
* License version 2. This program is licensed "as is" without any
@@ -178,5 +178,5 @@ module_platform_driver(omap3_rom_rng_driver);
MODULE_ALIAS("platform:omap3-rom-rng");
MODULE_AUTHOR("Juha Yrjola");
-MODULE_AUTHOR("Pali Rohár <pali.rohar@gmail.com>");
+MODULE_AUTHOR("Pali Rohár <pali@kernel.org>");
MODULE_LICENSE("GPL");
diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index cad9563f8f48..c48d8f086382 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -618,6 +618,8 @@ static DEFINE_MUTEX(ipmidriver_mutex);
static LIST_HEAD(ipmi_interfaces);
static DEFINE_MUTEX(ipmi_interfaces_mutex);
+#define ipmi_interfaces_mutex_held() \
+ lockdep_is_held(&ipmi_interfaces_mutex)
static struct srcu_struct ipmi_interfaces_srcu;
/*
@@ -1321,7 +1323,8 @@ static void _ipmi_destroy_user(struct ipmi_user *user)
* synchronize_srcu()) then free everything in that list.
*/
mutex_lock(&intf->cmd_rcvrs_mutex);
- list_for_each_entry_rcu(rcvr, &intf->cmd_rcvrs, link) {
+ list_for_each_entry_rcu(rcvr, &intf->cmd_rcvrs, link,
+ lockdep_is_held(&intf->cmd_rcvrs_mutex)) {
if (rcvr->user == user) {
list_del_rcu(&rcvr->link);
rcvr->next = rcvrs;
@@ -1599,7 +1602,8 @@ static struct cmd_rcvr *find_cmd_rcvr(struct ipmi_smi *intf,
{
struct cmd_rcvr *rcvr;
- list_for_each_entry_rcu(rcvr, &intf->cmd_rcvrs, link) {
+ list_for_each_entry_rcu(rcvr, &intf->cmd_rcvrs, link,
+ lockdep_is_held(&intf->cmd_rcvrs_mutex)) {
if ((rcvr->netfn == netfn) && (rcvr->cmd == cmd)
&& (rcvr->chans & (1 << chan)))
return rcvr;
@@ -1614,7 +1618,8 @@ static int is_cmd_rcvr_exclusive(struct ipmi_smi *intf,
{
struct cmd_rcvr *rcvr;
- list_for_each_entry_rcu(rcvr, &intf->cmd_rcvrs, link) {
+ list_for_each_entry_rcu(rcvr, &intf->cmd_rcvrs, link,
+ lockdep_is_held(&intf->cmd_rcvrs_mutex)) {
if ((rcvr->netfn == netfn) && (rcvr->cmd == cmd)
&& (rcvr->chans & chans))
return 0;
@@ -3188,8 +3193,8 @@ static void __get_guid(struct ipmi_smi *intf)
if (rv)
/* Send failed, no GUID available. */
bmc->dyn_guid_set = 0;
-
- wait_event(intf->waitq, bmc->dyn_guid_set != 2);
+ else
+ wait_event(intf->waitq, bmc->dyn_guid_set != 2);
/* dyn_guid_set makes the guid data available. */
smp_rmb();
@@ -3450,7 +3455,8 @@ int ipmi_add_smi(struct module *owner,
/* Look for a hole in the numbers. */
i = 0;
link = &ipmi_interfaces;
- list_for_each_entry_rcu(tintf, &ipmi_interfaces, link) {
+ list_for_each_entry_rcu(tintf, &ipmi_interfaces, link,
+ ipmi_interfaces_mutex_held()) {
if (tintf->intf_num != i) {
link = &tintf->link;
break;
diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c
index 8ac390c2b514..b7145f370d3b 100644
--- a/drivers/char/ipmi/ipmi_ssif.c
+++ b/drivers/char/ipmi/ipmi_ssif.c
@@ -313,6 +313,7 @@ static int start_send(struct ssif_info *ssif_info,
static unsigned long *ipmi_ssif_lock_cond(struct ssif_info *ssif_info,
unsigned long *flags)
+ __acquires(&ssif_info->lock)
{
spin_lock_irqsave(&ssif_info->lock, *flags);
return flags;
@@ -320,6 +321,7 @@ static unsigned long *ipmi_ssif_lock_cond(struct ssif_info *ssif_info,
static void ipmi_ssif_unlock_cond(struct ssif_info *ssif_info,
unsigned long *flags)
+ __releases(&ssif_info->lock)
{
spin_unlock_irqrestore(&ssif_info->lock, *flags);
}
diff --git a/drivers/char/ipmi/kcs_bmc_aspeed.c b/drivers/char/ipmi/kcs_bmc_aspeed.c
index 3c955946e647..a140203c079b 100644
--- a/drivers/char/ipmi/kcs_bmc_aspeed.c
+++ b/drivers/char/ipmi/kcs_bmc_aspeed.c
@@ -12,6 +12,7 @@
#include <linux/mfd/syscon.h>
#include <linux/module.h>
#include <linux/of.h>
+#include <linux/of_address.h>
#include <linux/platform_device.h>
#include <linux/poll.h>
#include <linux/regmap.h>
@@ -233,58 +234,154 @@ static const struct kcs_ioreg ast_kcs_bmc_ioregs[KCS_CHANNEL_MAX] = {
{ .idr = LPC_IDR4, .odr = LPC_ODR4, .str = LPC_STR4 },
};
-static int aspeed_kcs_probe(struct platform_device *pdev)
+static struct kcs_bmc *aspeed_kcs_probe_of_v1(struct platform_device *pdev)
{
- struct device *dev = &pdev->dev;
struct aspeed_kcs_bmc *priv;
- struct kcs_bmc *kcs_bmc;
- u32 chan, addr;
+ struct device_node *np;
+ struct kcs_bmc *kcs;
+ u32 channel;
+ u32 slave;
int rc;
- rc = of_property_read_u32(dev->of_node, "kcs_chan", &chan);
- if ((rc != 0) || (chan == 0 || chan > KCS_CHANNEL_MAX)) {
- dev_err(dev, "no valid 'kcs_chan' configured\n");
- return -ENODEV;
+ np = pdev->dev.of_node;
+
+ rc = of_property_read_u32(np, "kcs_chan", &channel);
+ if ((rc != 0) || (channel == 0 || channel > KCS_CHANNEL_MAX)) {
+ dev_err(&pdev->dev, "no valid 'kcs_chan' configured\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ kcs = kcs_bmc_alloc(&pdev->dev, sizeof(struct aspeed_kcs_bmc), channel);
+ if (!kcs)
+ return ERR_PTR(-ENOMEM);
+
+ priv = kcs_bmc_priv(kcs);
+ priv->map = syscon_node_to_regmap(pdev->dev.parent->of_node);
+ if (IS_ERR(priv->map)) {
+ dev_err(&pdev->dev, "Couldn't get regmap\n");
+ return ERR_PTR(-ENODEV);
}
- rc = of_property_read_u32(dev->of_node, "kcs_addr", &addr);
+ rc = of_property_read_u32(np, "kcs_addr", &slave);
if (rc) {
- dev_err(dev, "no valid 'kcs_addr' configured\n");
- return -ENODEV;
+ dev_err(&pdev->dev, "no valid 'kcs_addr' configured\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ kcs->ioreg = ast_kcs_bmc_ioregs[channel - 1];
+ aspeed_kcs_set_address(kcs, slave);
+
+ return kcs;
+}
+
+static int aspeed_kcs_calculate_channel(const struct kcs_ioreg *regs)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(ast_kcs_bmc_ioregs); i++) {
+ if (!memcmp(&ast_kcs_bmc_ioregs[i], regs, sizeof(*regs)))
+ return i + 1;
}
- kcs_bmc = kcs_bmc_alloc(dev, sizeof(*priv), chan);
- if (!kcs_bmc)
- return -ENOMEM;
+ return -EINVAL;
+}
+
+static struct kcs_bmc *aspeed_kcs_probe_of_v2(struct platform_device *pdev)
+{
+ struct aspeed_kcs_bmc *priv;
+ struct device_node *np;
+ struct kcs_ioreg ioreg;
+ struct kcs_bmc *kcs;
+ const __be32 *reg;
+ int channel;
+ u32 slave;
+ int rc;
+
+ np = pdev->dev.of_node;
+
+ /* Don't translate addresses, we want offsets for the regmaps */
+ reg = of_get_address(np, 0, NULL, NULL);
+ if (!reg)
+ return ERR_PTR(-EINVAL);
+ ioreg.idr = be32_to_cpup(reg);
+
+ reg = of_get_address(np, 1, NULL, NULL);
+ if (!reg)
+ return ERR_PTR(-EINVAL);
+ ioreg.odr = be32_to_cpup(reg);
+
+ reg = of_get_address(np, 2, NULL, NULL);
+ if (!reg)
+ return ERR_PTR(-EINVAL);
+ ioreg.str = be32_to_cpup(reg);
- priv = kcs_bmc_priv(kcs_bmc);
- priv->map = syscon_node_to_regmap(dev->parent->of_node);
+ channel = aspeed_kcs_calculate_channel(&ioreg);
+ if (channel < 0)
+ return ERR_PTR(channel);
+
+ kcs = kcs_bmc_alloc(&pdev->dev, sizeof(struct aspeed_kcs_bmc), channel);
+ if (!kcs)
+ return ERR_PTR(-ENOMEM);
+
+ kcs->ioreg = ioreg;
+
+ priv = kcs_bmc_priv(kcs);
+ priv->map = syscon_node_to_regmap(pdev->dev.parent->of_node);
if (IS_ERR(priv->map)) {
- dev_err(dev, "Couldn't get regmap\n");
- return -ENODEV;
+ dev_err(&pdev->dev, "Couldn't get regmap\n");
+ return ERR_PTR(-ENODEV);
}
- kcs_bmc->ioreg = ast_kcs_bmc_ioregs[chan - 1];
+ rc = of_property_read_u32(np, "aspeed,lpc-io-reg", &slave);
+ if (rc)
+ return ERR_PTR(rc);
+
+ aspeed_kcs_set_address(kcs, slave);
+
+ return kcs;
+}
+
+static int aspeed_kcs_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct kcs_bmc *kcs_bmc;
+ struct device_node *np;
+ int rc;
+
+ np = pdev->dev.of_node;
+ if (of_device_is_compatible(np, "aspeed,ast2400-kcs-bmc") ||
+ of_device_is_compatible(np, "aspeed,ast2500-kcs-bmc"))
+ kcs_bmc = aspeed_kcs_probe_of_v1(pdev);
+ else if (of_device_is_compatible(np, "aspeed,ast2400-kcs-bmc-v2") ||
+ of_device_is_compatible(np, "aspeed,ast2500-kcs-bmc-v2"))
+ kcs_bmc = aspeed_kcs_probe_of_v2(pdev);
+ else
+ return -EINVAL;
+
+ if (IS_ERR(kcs_bmc))
+ return PTR_ERR(kcs_bmc);
+
kcs_bmc->io_inputb = aspeed_kcs_inb;
kcs_bmc->io_outputb = aspeed_kcs_outb;
- dev_set_drvdata(dev, kcs_bmc);
-
- aspeed_kcs_set_address(kcs_bmc, addr);
- aspeed_kcs_enable_channel(kcs_bmc, true);
rc = aspeed_kcs_config_irq(kcs_bmc, pdev);
if (rc)
return rc;
+ dev_set_drvdata(dev, kcs_bmc);
+
+ aspeed_kcs_enable_channel(kcs_bmc, true);
+
rc = misc_register(&kcs_bmc->miscdev);
if (rc) {
dev_err(dev, "Unable to register device\n");
return rc;
}
- pr_info("channel=%u addr=0x%x idr=0x%x odr=0x%x str=0x%x\n",
- chan, addr,
- kcs_bmc->ioreg.idr, kcs_bmc->ioreg.odr, kcs_bmc->ioreg.str);
+ dev_dbg(&pdev->dev,
+ "Probed KCS device %d (IDR=0x%x, ODR=0x%x, STR=0x%x)\n",
+ kcs_bmc->channel, kcs_bmc->ioreg.idr, kcs_bmc->ioreg.odr,
+ kcs_bmc->ioreg.str);
return 0;
}
@@ -301,6 +398,8 @@ static int aspeed_kcs_remove(struct platform_device *pdev)
static const struct of_device_id ast_kcs_bmc_match[] = {
{ .compatible = "aspeed,ast2400-kcs-bmc" },
{ .compatible = "aspeed,ast2500-kcs-bmc" },
+ { .compatible = "aspeed,ast2400-kcs-bmc-v2" },
+ { .compatible = "aspeed,ast2500-kcs-bmc-v2" },
{ }
};
MODULE_DEVICE_TABLE(of, ast_kcs_bmc_match);
diff --git a/drivers/cpuidle/cpuidle-haltpoll.c b/drivers/cpuidle/cpuidle-haltpoll.c
index db124bc1ca2c..fcc53215bac8 100644
--- a/drivers/cpuidle/cpuidle-haltpoll.c
+++ b/drivers/cpuidle/cpuidle-haltpoll.c
@@ -94,7 +94,7 @@ static void haltpoll_uninit(void)
haltpoll_cpuidle_devices = NULL;
}
-static bool haltpool_want(void)
+static bool haltpoll_want(void)
{
return kvm_para_has_hint(KVM_HINTS_REALTIME) || force;
}
@@ -110,7 +110,7 @@ static int __init haltpoll_init(void)
cpuidle_poll_state_init(drv);
- if (!kvm_para_available() || !haltpool_want())
+ if (!kvm_para_available() || !haltpoll_want())
return -ENODEV;
ret = cpuidle_register_driver(drv);
diff --git a/drivers/crypto/chelsio/chcr_ktls.c b/drivers/crypto/chelsio/chcr_ktls.c
index 73658b71d4a3..cd1769ecdc1c 100644
--- a/drivers/crypto/chelsio/chcr_ktls.c
+++ b/drivers/crypto/chelsio/chcr_ktls.c
@@ -2,6 +2,7 @@
/* Copyright (C) 2020 Chelsio Communications. All rights reserved. */
#ifdef CONFIG_CHELSIO_TLS_DEVICE
+#include <linux/highmem.h>
#include "chcr_ktls.h"
#include "clip_tbl.h"
diff --git a/drivers/crypto/hisilicon/Kconfig b/drivers/crypto/hisilicon/Kconfig
index 095850d01dcc..f09c6cf7823e 100644
--- a/drivers/crypto/hisilicon/Kconfig
+++ b/drivers/crypto/hisilicon/Kconfig
@@ -27,6 +27,7 @@ config CRYPTO_DEV_HISI_SEC2
select CRYPTO_SHA256
select CRYPTO_SHA512
depends on PCI && PCI_MSI
+ depends on UACCE || UACCE=n
depends on ARM64 || (COMPILE_TEST && 64BIT)
help
Support for HiSilicon SEC Engine of version 2 in crypto subsystem.
@@ -58,6 +59,7 @@ config CRYPTO_DEV_HISI_ZIP
config CRYPTO_DEV_HISI_HPRE
tristate "Support for HISI HPRE accelerator"
depends on PCI && PCI_MSI
+ depends on UACCE || UACCE=n
depends on ARM64 || (COMPILE_TEST && 64BIT)
select CRYPTO_DEV_HISI_QM
select CRYPTO_DH
diff --git a/drivers/crypto/marvell/octeontx/otx_cptvf_algs.c b/drivers/crypto/marvell/octeontx/otx_cptvf_algs.c
index 946fb62949b2..06202bcffb33 100644
--- a/drivers/crypto/marvell/octeontx/otx_cptvf_algs.c
+++ b/drivers/crypto/marvell/octeontx/otx_cptvf_algs.c
@@ -1161,13 +1161,13 @@ static inline u32 create_aead_null_output_list(struct aead_request *req,
inputlen);
if (status != inputlen) {
status = -EINVAL;
- goto error;
+ goto error_free;
}
status = sg_copy_from_buffer(req->dst, sg_nents(req->dst), ptr,
inputlen);
if (status != inputlen) {
status = -EINVAL;
- goto error;
+ goto error_free;
}
kfree(ptr);
}
@@ -1209,8 +1209,10 @@ static inline u32 create_aead_null_output_list(struct aead_request *req,
req_info->outcnt = argcnt;
return 0;
-error:
+
+error_free:
kfree(ptr);
+error:
return status;
}
diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index 46e46047a1f7..df238c8b6ef2 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -421,8 +421,10 @@ struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id,
* device outside of mmap of the resulting character device.
*/
dax_dev = alloc_dax(dev_dax, NULL, NULL, DAXDEV_F_SYNC);
- if (!dax_dev)
+ if (IS_ERR(dax_dev)) {
+ rc = PTR_ERR(dax_dev);
goto err;
+ }
/* a device_dax instance is dead while the driver is not attached */
kill_dax(dax_dev);
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 0aa4b6bc5101..8e32345be0f7 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -344,6 +344,23 @@ size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
}
EXPORT_SYMBOL_GPL(dax_copy_to_iter);
+int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
+ size_t nr_pages)
+{
+ if (!dax_alive(dax_dev))
+ return -ENXIO;
+ /*
+ * There are no callers that want to zero more than one page as of now.
+ * Once users are there, this check can be removed after the
+ * device mapper code has been updated to split ranges across targets.
+ */
+ if (nr_pages != 1)
+ return -EIO;
+
+ return dax_dev->ops->zero_page_range(dax_dev, pgoff, nr_pages);
+}
+EXPORT_SYMBOL_GPL(dax_zero_page_range);
+
#ifdef CONFIG_ARCH_HAS_PMEM_API
void arch_wb_cache_pmem(void *addr, size_t size);
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
@@ -551,9 +568,16 @@ struct dax_device *alloc_dax(void *private, const char *__host,
dev_t devt;
int minor;
+ if (ops && !ops->zero_page_range) {
+ pr_debug("%s: error: device does not provide dax"
+ " operation zero_page_range()\n",
+ __host ? __host : "Unknown");
+ return ERR_PTR(-EINVAL);
+ }
+
host = kstrdup(__host, GFP_KERNEL);
if (__host && !host)
- return NULL;
+ return ERR_PTR(-ENOMEM);
minor = ida_simple_get(&dax_minor_ida, 0, MINORMASK+1, GFP_KERNEL);
if (minor < 0)
@@ -576,7 +600,7 @@ struct dax_device *alloc_dax(void *private, const char *__host,
ida_simple_remove(&dax_minor_ida, minor);
err_minor:
kfree(host);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL_GPL(alloc_dax);
diff --git a/drivers/dma-buf/Kconfig b/drivers/dma-buf/Kconfig
index ef73b678419c..9626673f1d83 100644
--- a/drivers/dma-buf/Kconfig
+++ b/drivers/dma-buf/Kconfig
@@ -43,11 +43,12 @@ config DMABUF_MOVE_NOTIFY
bool "Move notify between drivers (EXPERIMENTAL)"
default n
help
- Don''t pin buffers if the dynamic DMA-buf interface is available on both the
- exporter as well as the importer. This fixes a security problem where
- userspace is able to pin unrestricted amounts of memory through DMA-buf.
- But marked experimental because we don''t jet have a consistent execution
- context and memory management between drivers.
+ Don't pin buffers if the dynamic DMA-buf interface is available on
+ both the exporter as well as the importer. This fixes a security
+ problem where userspace is able to pin unrestricted amounts of memory
+ through DMA-buf.
+ This is marked experimental because we don't yet have a consistent
+ execution context and memory management between drivers.
config DMABUF_SELFTESTS
tristate "Selftests for the dma-buf interfaces"
diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
index 265303a396ca..f6a2f42ffc51 100644
--- a/drivers/dma/tegra20-apb-dma.c
+++ b/drivers/dma/tegra20-apb-dma.c
@@ -1493,7 +1493,6 @@ static int tegra_dma_probe(struct platform_device *pdev)
irq = platform_get_irq(pdev, i);
if (irq < 0) {
ret = irq;
- dev_err(&pdev->dev, "No irq resource for chan %d\n", i);
goto err_pm_disable;
}
diff --git a/drivers/firmware/edd.c b/drivers/firmware/edd.c
index 29906e39ab4b..14d0970a7198 100644
--- a/drivers/firmware/edd.c
+++ b/drivers/firmware/edd.c
@@ -341,7 +341,7 @@ edd_show_legacy_max_cylinder(struct edd_device *edev, char *buf)
if (!info || !buf)
return -EINVAL;
- p += snprintf(p, left, "%u\n", info->legacy_max_cylinder);
+ p += scnprintf(p, left, "%u\n", info->legacy_max_cylinder);
return (p - buf);
}
@@ -356,7 +356,7 @@ edd_show_legacy_max_head(struct edd_device *edev, char *buf)
if (!info || !buf)
return -EINVAL;
- p += snprintf(p, left, "%u\n", info->legacy_max_head);
+ p += scnprintf(p, left, "%u\n", info->legacy_max_head);
return (p - buf);
}
@@ -371,7 +371,7 @@ edd_show_legacy_sectors_per_track(struct edd_device *edev, char *buf)
if (!info || !buf)
return -EINVAL;
- p += snprintf(p, left, "%u\n", info->legacy_sectors_per_track);
+ p += scnprintf(p, left, "%u\n", info->legacy_sectors_per_track);
return (p - buf);
}
diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c
index db0c1a9c1699..fc9f8ab533a7 100644
--- a/drivers/firmware/efi/libstub/arm64-stub.c
+++ b/drivers/firmware/efi/libstub/arm64-stub.c
@@ -75,14 +75,12 @@ efi_status_t handle_kernel_image(unsigned long *image_addr,
if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && phys_seed != 0) {
/*
- * If CONFIG_DEBUG_ALIGN_RODATA is not set, produce a
- * displacement in the interval [0, MIN_KIMG_ALIGN) that
- * doesn't violate this kernel's de-facto alignment
+ * Produce a displacement in the interval [0, MIN_KIMG_ALIGN)
+ * that doesn't violate this kernel's de-facto alignment
* constraints.
*/
u32 mask = (MIN_KIMG_ALIGN - 1) & ~(EFI_KIMG_ALIGN - 1);
- u32 offset = !IS_ENABLED(CONFIG_DEBUG_ALIGN_RODATA) ?
- (phys_seed >> 32) & mask : TEXT_OFFSET;
+ u32 offset = (phys_seed >> 32) & mask;
/*
* With CONFIG_RANDOMIZE_TEXT_OFFSET=y, TEXT_OFFSET may not
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index faa3e7102156..559dc24ef436 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2340,8 +2340,6 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
{
int i, r;
- amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
- amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
if (!adev->ip_blocks[i].status.valid)
@@ -3356,6 +3354,9 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
}
}
+ amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
+ amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
+
amdgpu_amdkfd_suspend(adev, !fbcon);
amdgpu_ras_suspend(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
index f197f1be0969..abe94a55ecad 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
@@ -89,7 +89,8 @@ void amdgpu_pm_acpi_event_handler(struct amdgpu_device *adev)
adev->pm.ac_power = true;
else
adev->pm.ac_power = false;
- if (adev->powerplay.pp_funcs->enable_bapm)
+ if (adev->powerplay.pp_funcs &&
+ adev->powerplay.pp_funcs->enable_bapm)
amdgpu_dpm_enable_bapm(adev, adev->pm.ac_power);
mutex_unlock(&adev->pm.mutex);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index be50867ea644..deaa26808841 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -818,7 +818,7 @@ static int psp_ras_initialize(struct psp_context *psp)
if (!psp->adev->psp.ta_ras_ucode_size ||
!psp->adev->psp.ta_ras_start_addr) {
- dev_warn(psp->adev->dev, "RAS: ras ta ucode is not available\n");
+ dev_info(psp->adev->dev, "RAS: optional ras ta ucode is not available\n");
return 0;
}
@@ -902,7 +902,7 @@ static int psp_hdcp_initialize(struct psp_context *psp)
if (!psp->adev->psp.ta_hdcp_ucode_size ||
!psp->adev->psp.ta_hdcp_start_addr) {
- dev_warn(psp->adev->dev, "HDCP: hdcp ta ucode is not available\n");
+ dev_info(psp->adev->dev, "HDCP: optional hdcp ta ucode is not available\n");
return 0;
}
@@ -1048,7 +1048,7 @@ static int psp_dtm_initialize(struct psp_context *psp)
if (!psp->adev->psp.ta_dtm_ucode_size ||
!psp->adev->psp.ta_dtm_start_addr) {
- dev_warn(psp->adev->dev, "DTM: dtm ta ucode is not available\n");
+ dev_info(psp->adev->dev, "DTM: optional dtm ta ucode is not available\n");
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 3c32a94d2424..ab379b44679c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1424,12 +1424,22 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
{
struct amdgpu_ras *ras =
container_of(work, struct amdgpu_ras, recovery_work);
+ struct amdgpu_device *remote_adev = NULL;
+ struct amdgpu_device *adev = ras->adev;
+ struct list_head device_list, *device_list_handle = NULL;
+ struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, false);
+
+ /* Build list of devices to query RAS related errors */
+ if (hive && adev->gmc.xgmi.num_physical_nodes > 1) {
+ device_list_handle = &hive->device_list;
+ } else {
+ list_add_tail(&adev->gmc.xgmi.head, &device_list);
+ device_list_handle = &device_list;
+ }
- /*
- * Query and print non zero error counter per IP block for
- * awareness before recovering GPU.
- */
- amdgpu_ras_log_on_err_counter(ras->adev);
+ list_for_each_entry(remote_adev, device_list_handle, gmc.xgmi.head) {
+ amdgpu_ras_log_on_err_counter(remote_adev);
+ }
if (amdgpu_device_should_recover_gpu(ras->adev))
amdgpu_device_gpu_recover(ras->adev, 0);
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index c8f2aa1db13b..d78059fd2c72 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -279,7 +279,7 @@ static const struct soc15_reg_golden golden_settings_gc_10_1_2_nv12[] =
#define DEFAULT_SH_MEM_CONFIG \
((SH_MEM_ADDRESS_MODE_64 << SH_MEM_CONFIG__ADDRESS_MODE__SHIFT) | \
- (SH_MEM_ALIGNMENT_MODE_UNALIGNED << SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT) | \
+ (SH_MEM_ALIGNMENT_MODE_DWORD << SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT) | \
(SH_MEM_RETRY_MODE_ALL << SH_MEM_CONFIG__RETRY_MODE__SHIFT) | \
(3 << SH_MEM_CONFIG__INITIAL_INST_PREFETCH__SHIFT))
@@ -1113,7 +1113,7 @@ static int gfx_v10_0_mec_init(struct amdgpu_device *adev)
return r;
}
- memset(hpd, 0, adev->gfx.mec.hpd_eop_obj->tbo.mem.size);
+ memset(hpd, 0, mec_hpd_size);
amdgpu_bo_kunmap(adev->gfx.mec.hpd_eop_obj);
amdgpu_bo_unreserve(adev->gfx.mec.hpd_eop_obj);
@@ -4104,6 +4104,12 @@ static void gfx_v10_0_update_medium_grain_clock_gating(struct amdgpu_device *ade
/* It is disabled by HW by default */
if (enable && (adev->cg_flags & AMD_CG_SUPPORT_GFX_MGCG)) {
+ /* 0 - Disable some blocks' MGCG */
+ WREG32_SOC15(GC, 0, mmGRBM_GFX_INDEX, 0xe0000000);
+ WREG32_SOC15(GC, 0, mmCGTT_WD_CLK_CTRL, 0xff000000);
+ WREG32_SOC15(GC, 0, mmCGTT_VGT_CLK_CTRL, 0xff000000);
+ WREG32_SOC15(GC, 0, mmCGTT_IA_CLK_CTRL, 0xff000000);
+
/* 1 - RLC_CGTT_MGCG_OVERRIDE */
def = data = RREG32_SOC15(GC, 0, mmRLC_CGTT_MGCG_OVERRIDE);
data &= ~(RLC_CGTT_MGCG_OVERRIDE__GRBM_CGTT_SCLK_OVERRIDE_MASK |
@@ -4143,19 +4149,20 @@ static void gfx_v10_0_update_medium_grain_clock_gating(struct amdgpu_device *ade
if (def != data)
WREG32_SOC15(GC, 0, mmRLC_CGTT_MGCG_OVERRIDE, data);
- /* 2 - disable MGLS in RLC */
+ /* 2 - disable MGLS in CP */
+ data = RREG32_SOC15(GC, 0, mmCP_MEM_SLP_CNTL);
+ if (data & CP_MEM_SLP_CNTL__CP_MEM_LS_EN_MASK) {
+ data &= ~CP_MEM_SLP_CNTL__CP_MEM_LS_EN_MASK;
+ WREG32_SOC15(GC, 0, mmCP_MEM_SLP_CNTL, data);
+ }
+
+ /* 3 - disable MGLS in RLC */
data = RREG32_SOC15(GC, 0, mmRLC_MEM_SLP_CNTL);
if (data & RLC_MEM_SLP_CNTL__RLC_MEM_LS_EN_MASK) {
data &= ~RLC_MEM_SLP_CNTL__RLC_MEM_LS_EN_MASK;
WREG32_SOC15(GC, 0, mmRLC_MEM_SLP_CNTL, data);
}
- /* 3 - disable MGLS in CP */
- data = RREG32_SOC15(GC, 0, mmCP_MEM_SLP_CNTL);
- if (data & CP_MEM_SLP_CNTL__CP_MEM_LS_EN_MASK) {
- data &= ~CP_MEM_SLP_CNTL__CP_MEM_LS_EN_MASK;
- WREG32_SOC15(GC, 0, mmCP_MEM_SLP_CNTL, data);
- }
}
}
@@ -4266,7 +4273,7 @@ static int gfx_v10_0_update_gfx_clock_gating(struct amdgpu_device *adev,
/* === CGCG /CGLS for GFX 3D Only === */
gfx_v10_0_update_3d_clock_gating(adev, enable);
/* === MGCG + MGLS === */
- gfx_v10_0_update_medium_grain_clock_gating(adev, enable);
+ /* gfx_v10_0_update_medium_grain_clock_gating(adev, enable); */
}
if (adev->cg_flags &
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 37c8231f1407..e6b113ed2f40 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -1217,6 +1217,8 @@ static void gfx_v9_0_check_fw_write_wait(struct amdgpu_device *adev)
adev->gfx.mec_fw_write_wait = true;
break;
default:
+ adev->gfx.me_fw_write_wait = true;
+ adev->gfx.mec_fw_write_wait = true;
break;
}
}
@@ -1946,7 +1948,7 @@ static int gfx_v9_0_mec_init(struct amdgpu_device *adev)
return r;
}
- memset(hpd, 0, adev->gfx.mec.hpd_eop_obj->tbo.mem.size);
+ memset(hpd, 0, mec_hpd_size);
amdgpu_bo_kunmap(adev->gfx.mec.hpd_eop_obj);
amdgpu_bo_unreserve(adev->gfx.mec.hpd_eop_obj);
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
index cceb46faf212..dce945ef21a5 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
@@ -710,14 +710,16 @@ static int gfx_v9_4_query_utc_edc_status(struct amdgpu_device *adev,
sec_count = REG_GET_FIELD(data, VML2_MEM_ECC_CNTL, SEC_COUNT);
if (sec_count) {
- DRM_INFO("Instance[%d]: SubBlock %s, SEC %d\n", i,
+ dev_info(adev->dev,
+ "Instance[%d]: SubBlock %s, SEC %d\n", i,
vml2_mems[i], sec_count);
err_data->ce_count += sec_count;
}
ded_count = REG_GET_FIELD(data, VML2_MEM_ECC_CNTL, DED_COUNT);
if (ded_count) {
- DRM_INFO("Instance[%d]: SubBlock %s, DED %d\n", i,
+ dev_info(adev->dev,
+ "Instance[%d]: SubBlock %s, DED %d\n", i,
vml2_mems[i], ded_count);
err_data->ue_count += ded_count;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
index 0d413fabd015..c0e3efcb09bf 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
@@ -1539,8 +1539,11 @@ static const struct soc15_reg_entry mmhub_v9_4_edc_cnt_regs[] = {
{ SOC15_REG_ENTRY(MMHUB, 0, mmMMEA7_EDC_CNT3), 0, 0, 0 },
};
-static int mmhub_v9_4_get_ras_error_count(const struct soc15_reg_entry *reg,
- uint32_t value, uint32_t *sec_count, uint32_t *ded_count)
+static int mmhub_v9_4_get_ras_error_count(struct amdgpu_device *adev,
+ const struct soc15_reg_entry *reg,
+ uint32_t value,
+ uint32_t *sec_count,
+ uint32_t *ded_count)
{
uint32_t i;
uint32_t sec_cnt, ded_cnt;
@@ -1553,7 +1556,7 @@ static int mmhub_v9_4_get_ras_error_count(const struct soc15_reg_entry *reg,
mmhub_v9_4_ras_fields[i].sec_count_mask) >>
mmhub_v9_4_ras_fields[i].sec_count_shift;
if (sec_cnt) {
- DRM_INFO("MMHUB SubBlock %s, SEC %d\n",
+ dev_info(adev->dev, "MMHUB SubBlock %s, SEC %d\n",
mmhub_v9_4_ras_fields[i].name,
sec_cnt);
*sec_count += sec_cnt;
@@ -1563,7 +1566,7 @@ static int mmhub_v9_4_get_ras_error_count(const struct soc15_reg_entry *reg,
mmhub_v9_4_ras_fields[i].ded_count_mask) >>
mmhub_v9_4_ras_fields[i].ded_count_shift;
if (ded_cnt) {
- DRM_INFO("MMHUB SubBlock %s, DED %d\n",
+ dev_info(adev->dev, "MMHUB SubBlock %s, DED %d\n",
mmhub_v9_4_ras_fields[i].name,
ded_cnt);
*ded_count += ded_cnt;
@@ -1588,7 +1591,7 @@ static void mmhub_v9_4_query_ras_error_count(struct amdgpu_device *adev,
reg_value =
RREG32(SOC15_REG_ENTRY_OFFSET(mmhub_v9_4_edc_cnt_regs[i]));
if (reg_value)
- mmhub_v9_4_get_ras_error_count(&mmhub_v9_4_edc_cnt_regs[i],
+ mmhub_v9_4_get_ras_error_count(adev, &mmhub_v9_4_edc_cnt_regs[i],
reg_value, &sec_count, &ded_count);
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index d5386f15c4a5..05bc6d96ec52 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -1112,9 +1112,9 @@ kfd_gtt_out:
return 0;
kfd_gtt_no_free_chunk:
- pr_debug("Allocation failed with mem_obj = %p\n", mem_obj);
+ pr_debug("Allocation failed with mem_obj = %p\n", *mem_obj);
mutex_unlock(&kfd->gtt_sa_lock);
- kfree(mem_obj);
+ kfree(*mem_obj);
return -ENOMEM;
}
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index d3674d805a0a..f7c5cdc10a70 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -3639,6 +3639,9 @@ fill_dc_plane_info_and_addr(struct amdgpu_device *adev,
case DRM_FORMAT_NV12:
plane_info->format = SURFACE_PIXEL_FORMAT_VIDEO_420_YCrCb;
break;
+ case DRM_FORMAT_P010:
+ plane_info->format = SURFACE_PIXEL_FORMAT_VIDEO_420_10bpc_YCrCb;
+ break;
default:
DRM_ERROR(
"Unsupported screen format %s\n",
@@ -4720,10 +4723,10 @@ amdgpu_dm_connector_atomic_duplicate_state(struct drm_connector *connector)
static int
amdgpu_dm_connector_late_register(struct drm_connector *connector)
{
+#if defined(CONFIG_DEBUG_FS)
struct amdgpu_dm_connector *amdgpu_dm_connector =
to_amdgpu_dm_connector(connector);
-#if defined(CONFIG_DEBUG_FS)
connector_debugfs_init(amdgpu_dm_connector);
#endif
@@ -5535,6 +5538,8 @@ static int get_plane_formats(const struct drm_plane *plane,
if (plane_cap && plane_cap->pixel_format_support.nv12)
formats[num_formats++] = DRM_FORMAT_NV12;
+ if (plane_cap && plane_cap->pixel_format_support.p010)
+ formats[num_formats++] = DRM_FORMAT_P010;
break;
case DRM_PLANE_TYPE_OVERLAY:
@@ -5587,12 +5592,15 @@ static int amdgpu_dm_plane_init(struct amdgpu_display_manager *dm,
}
if (plane->type == DRM_PLANE_TYPE_PRIMARY &&
- plane_cap && plane_cap->pixel_format_support.nv12) {
+ plane_cap &&
+ (plane_cap->pixel_format_support.nv12 ||
+ plane_cap->pixel_format_support.p010)) {
/* This only affects YUV formats. */
drm_plane_create_color_properties(
plane,
BIT(DRM_COLOR_YCBCR_BT601) |
- BIT(DRM_COLOR_YCBCR_BT709),
+ BIT(DRM_COLOR_YCBCR_BT709) |
+ BIT(DRM_COLOR_YCBCR_BT2020),
BIT(DRM_COLOR_YCBCR_LIMITED_RANGE) |
BIT(DRM_COLOR_YCBCR_FULL_RANGE),
DRM_COLOR_YCBCR_BT709, DRM_COLOR_YCBCR_LIMITED_RANGE);
@@ -5921,7 +5929,8 @@ void amdgpu_dm_connector_init_helper(struct amdgpu_display_manager *dm,
adev->mode_info.underscan_vborder_property,
0);
- drm_connector_attach_max_bpc_property(&aconnector->base, 8, 16);
+ if (!aconnector->mst_port)
+ drm_connector_attach_max_bpc_property(&aconnector->base, 8, 16);
/* This defaults to the max in the range, but we want 8bpc for non-edp. */
aconnector->base.state->max_bpc = (connector_type == DRM_MODE_CONNECTOR_eDP) ? 16 : 8;
@@ -5940,8 +5949,9 @@ void amdgpu_dm_connector_init_helper(struct amdgpu_display_manager *dm,
&aconnector->base.base,
dm->ddev->mode_config.hdr_output_metadata_property, 0);
- drm_connector_attach_vrr_capable_property(
- &aconnector->base);
+ if (!aconnector->mst_port)
+ drm_connector_attach_vrr_capable_property(&aconnector->base);
+
#ifdef CONFIG_DRM_AMD_DC_HDCP
if (adev->dm.hdcp_workqueue)
drm_connector_attach_content_protection_property(&aconnector->base, true);
@@ -6264,12 +6274,6 @@ static int get_cursor_position(struct drm_plane *plane, struct drm_crtc *crtc,
y <= -amdgpu_crtc->max_cursor_height)
return 0;
- if (crtc->primary->state) {
- /* avivo cursor are offset into the total surface */
- x += crtc->primary->state->src_x >> 16;
- y += crtc->primary->state->src_y >> 16;
- }
-
if (x < 0) {
xorigin = min(-x, amdgpu_crtc->max_cursor_width - 1);
x = 0;
@@ -6279,6 +6283,7 @@ static int get_cursor_position(struct drm_plane *plane, struct drm_crtc *crtc,
y = 0;
}
position->enable = true;
+ position->translate_by_source = true;
position->x = x;
position->y = y;
position->x_hotspot = xorigin;
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.c
index 5b70ed3cdb88..78e1c11d4ae5 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.c
@@ -192,10 +192,13 @@ void hdcp_update_display(struct hdcp_workqueue *hdcp_work,
&hdcp_work->srm_version);
display->adjust.disable = 0;
- if (content_type == DRM_MODE_HDCP_CONTENT_TYPE0)
+ if (content_type == DRM_MODE_HDCP_CONTENT_TYPE0) {
+ hdcp_w->link.adjust.hdcp1.disable = 0;
hdcp_w->link.adjust.hdcp2.force_type = MOD_HDCP_FORCE_TYPE_0;
- else if (content_type == DRM_MODE_HDCP_CONTENT_TYPE1)
+ } else if (content_type == DRM_MODE_HDCP_CONTENT_TYPE1) {
+ hdcp_w->link.adjust.hdcp1.disable = 1;
hdcp_w->link.adjust.hdcp2.force_type = MOD_HDCP_FORCE_TYPE_1;
+ }
schedule_delayed_work(&hdcp_w->property_validate_dwork,
msecs_to_jiffies(DRM_HDCP_CHECK_PERIOD_MS));
@@ -263,7 +266,7 @@ static void event_callback(struct work_struct *work)
mutex_lock(&hdcp_work->mutex);
- cancel_delayed_work(&hdcp_work->watchdog_timer_dwork);
+ cancel_delayed_work(&hdcp_work->callback_dwork);
mod_hdcp_process_event(&hdcp_work->hdcp, MOD_HDCP_EVENT_CALLBACK,
&hdcp_work->output);
@@ -344,6 +347,8 @@ static void event_watchdog_timer(struct work_struct *work)
mutex_lock(&hdcp_work->mutex);
+ cancel_delayed_work(&hdcp_work->watchdog_timer_dwork);
+
mod_hdcp_process_event(&hdcp_work->hdcp,
MOD_HDCP_EVENT_WATCHDOG_TIMEOUT,
&hdcp_work->output);
@@ -414,7 +419,8 @@ static void update_config(void *handle, struct cp_psp_stream_config *config)
link->dp.rev = aconnector->dc_link->dpcd_caps.dpcd_rev.raw;
link->dp.mst_supported = config->mst_supported;
display->adjust.disable = 1;
- link->adjust.auth_delay = 2;
+ link->adjust.auth_delay = 3;
+ link->adjust.hdcp1.disable = 0;
hdcp_update_display(hdcp_work, link_index, aconnector, DRM_MODE_HDCP_CONTENT_TYPE0, false);
}
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
index e8208df420d9..fabbe78d5aef 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
@@ -410,6 +410,14 @@ dm_dp_add_mst_connector(struct drm_dp_mst_topology_mgr *mgr,
drm_connector_attach_encoder(&aconnector->base,
&aconnector->mst_encoder->base);
+ connector->max_bpc_property = master->base.max_bpc_property;
+ if (connector->max_bpc_property)
+ drm_connector_attach_max_bpc_property(connector, 8, 16);
+
+ connector->vrr_capable_property = master->base.vrr_capable_property;
+ if (connector->vrr_capable_property)
+ drm_connector_attach_vrr_capable_property(connector);
+
drm_object_attach_property(
&connector->base,
dev->mode_config.path_property,
diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c
index ab267ddd4abe..24c5765890fa 100644
--- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c
+++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c
@@ -643,7 +643,7 @@ static void rn_clk_mgr_helper_populate_bw_params(struct clk_bw_params *bw_params
/* Find lowest DPM, FCLK is filled in reverse order*/
for (i = PP_SMU_NUM_FCLK_DPM_LEVELS - 1; i >= 0; i--) {
- if (clock_table->FClocks[i].Freq != 0) {
+ if (clock_table->FClocks[i].Freq != 0 && clock_table->FClocks[i].Vol != 0) {
j = i;
break;
}
diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c
index 2ffb22177df9..8489f1e56892 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc.c
@@ -283,6 +283,8 @@ bool dc_stream_adjust_vmin_vmax(struct dc *dc,
int i = 0;
bool ret = false;
+ stream->adjust = *adjust;
+
for (i = 0; i < MAX_PIPES; i++) {
struct pipe_ctx *pipe = &dc->current_state->res_ctx.pipe_ctx[i];
@@ -1360,6 +1362,26 @@ bool dc_commit_state(struct dc *dc, struct dc_state *context)
return (result == DC_OK);
}
+static bool is_flip_pending_in_pipes(struct dc *dc, struct dc_state *context)
+{
+ int i;
+ struct pipe_ctx *pipe;
+
+ for (i = 0; i < MAX_PIPES; i++) {
+ pipe = &context->res_ctx.pipe_ctx[i];
+
+ if (!pipe->plane_state)
+ continue;
+
+ /* Must set to false to start with, due to OR in update function */
+ pipe->plane_state->status.is_flip_pending = false;
+ dc->hwss.update_pending_status(pipe);
+ if (pipe->plane_state->status.is_flip_pending)
+ return true;
+ }
+ return false;
+}
+
bool dc_post_update_surfaces_to_stream(struct dc *dc)
{
int i;
@@ -1370,6 +1392,9 @@ bool dc_post_update_surfaces_to_stream(struct dc *dc)
post_surface_trace(dc);
+ if (is_flip_pending_in_pipes(dc, context))
+ return true;
+
for (i = 0; i < dc->res_pool->pipe_count; i++)
if (context->res_ctx.pipe_ctx[i].stream == NULL ||
context->res_ctx.pipe_ctx[i].plane_state == NULL) {
@@ -1703,6 +1728,9 @@ static enum surface_update_type det_surface_update(const struct dc *dc,
if (u->coeff_reduction_factor)
update_flags->bits.coeff_reduction_change = 1;
+ if (u->gamut_remap_matrix)
+ update_flags->bits.gamut_remap_change = 1;
+
if (u->gamma) {
enum surface_pixel_format format = SURFACE_PIXEL_FORMAT_GRPH_BEGIN;
@@ -1728,7 +1756,8 @@ static enum surface_update_type det_surface_update(const struct dc *dc,
if (update_flags->bits.input_csc_change
|| update_flags->bits.coeff_reduction_change
- || update_flags->bits.gamma_change) {
+ || update_flags->bits.gamma_change
+ || update_flags->bits.gamut_remap_change) {
type = UPDATE_TYPE_FULL;
elevate_update_type(&overall_type, type);
}
@@ -1832,8 +1861,9 @@ enum surface_update_type dc_check_update_surfaces_for_stream(
// Else we fallback to mem compare.
} else if (memcmp(&dc->current_state->bw_ctx.bw.dcn.clk, &dc->clk_mgr->clks, offsetof(struct dc_clocks, prev_p_state_change_support)) != 0) {
dc->optimized_required = true;
- } else if (dc->wm_optimized_required)
- dc->optimized_required = true;
+ }
+
+ dc->optimized_required |= dc->wm_optimized_required;
}
return type;
@@ -1973,6 +2003,10 @@ static void copy_surface_update_to_plane(
if (srf_update->coeff_reduction_factor)
surface->coeff_reduction_factor =
*srf_update->coeff_reduction_factor;
+
+ if (srf_update->gamut_remap_matrix)
+ surface->gamut_remap_matrix =
+ *srf_update->gamut_remap_matrix;
}
static void copy_stream_update_to_stream(struct dc *dc,
@@ -2431,7 +2465,7 @@ void dc_commit_updates_for_stream(struct dc *dc,
enum surface_update_type update_type;
struct dc_state *context;
struct dc_context *dc_ctx = dc->ctx;
- int i;
+ int i, j;
stream_status = dc_stream_get_status(stream);
context = dc->current_state;
@@ -2469,6 +2503,17 @@ void dc_commit_updates_for_stream(struct dc *dc,
copy_surface_update_to_plane(surface, &srf_updates[i]);
+ if (update_type >= UPDATE_TYPE_MED) {
+ for (j = 0; j < dc->res_pool->pipe_count; j++) {
+ struct pipe_ctx *pipe_ctx =
+ &context->res_ctx.pipe_ctx[j];
+
+ if (pipe_ctx->plane_state != surface)
+ continue;
+
+ resource_build_scaling_params(pipe_ctx);
+ }
+ }
}
copy_stream_update_to_stream(dc, context, stream, stream_update);
diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c
index 75c7ce4c7581..f4bcc71b2920 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c
@@ -1077,6 +1077,7 @@ bool resource_build_scaling_params(struct pipe_ctx *pipe_ctx)
* on certain displays, such as the Sharp 4k
*/
pipe_ctx->plane_res.scl_data.lb_params.depth = LB_PIXEL_DEPTH_30BPP;
+ pipe_ctx->plane_res.scl_data.lb_params.alpha_en = plane_state->per_pixel_alpha;
pipe_ctx->plane_res.scl_data.recout.x += timing->h_border_left;
pipe_ctx->plane_res.scl_data.recout.y += timing->v_border_top;
diff --git a/drivers/gpu/drm/amd/display/dc/dc.h b/drivers/gpu/drm/amd/display/dc/dc.h
index d3ceb39e428e..1935cf6601eb 100644
--- a/drivers/gpu/drm/amd/display/dc/dc.h
+++ b/drivers/gpu/drm/amd/display/dc/dc.h
@@ -726,6 +726,7 @@ union surface_update_flags {
uint32_t output_tf_change:1;
uint32_t pixel_format_change:1;
uint32_t plane_size_change:1;
+ uint32_t gamut_remap_change:1;
/* Full updates */
uint32_t new_plane:1;
@@ -760,6 +761,7 @@ struct dc_plane_state {
struct dc_csc_transform input_csc_color_matrix;
struct fixed31_32 coeff_reduction_factor;
struct fixed31_32 hdr_mult;
+ struct colorspace_transform gamut_remap_matrix;
// TODO: No longer used, remove
struct dc_hdr_static_metadata hdr_static_ctx;
@@ -839,6 +841,7 @@ struct dc_surface_update {
const struct dc_transfer_func *func_shaper;
const struct dc_3dlut *lut3d_func;
const struct dc_transfer_func *blend_tf;
+ const struct colorspace_transform *gamut_remap_matrix;
};
/*
diff --git a/drivers/gpu/drm/amd/display/dc/dc_hw_types.h b/drivers/gpu/drm/amd/display/dc/dc_hw_types.h
index 25c50bcab9e9..a8dc3082e3e1 100644
--- a/drivers/gpu/drm/amd/display/dc/dc_hw_types.h
+++ b/drivers/gpu/drm/amd/display/dc/dc_hw_types.h
@@ -385,6 +385,8 @@ struct dc_cursor_position {
*/
bool enable;
+ /* Translate cursor x/y by the source rectangle for each plane. */
+ bool translate_by_source;
};
struct dc_cursor_mi_param {
diff --git a/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c b/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c
index 0976e378659f..c279982947e1 100644
--- a/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c
+++ b/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c
@@ -2685,6 +2685,23 @@ void dce110_set_cursor_position(struct pipe_ctx *pipe_ctx)
.mirror = pipe_ctx->plane_state->horizontal_mirror
};
+ /**
+ * If the cursor's source viewport is clipped then we need to
+ * translate the cursor to appear in the correct position on
+ * the screen.
+ *
+ * This translation isn't affected by scaling so it needs to be
+ * done *after* we adjust the position for the scale factor.
+ *
+ * This is only done by opt-in for now since there are still
+ * some usecases like tiled display that might enable the
+ * cursor on both streams while expecting dc to clip it.
+ */
+ if (pos_cpy.translate_by_source) {
+ pos_cpy.x += pipe_ctx->plane_state->src_rect.x;
+ pos_cpy.y += pipe_ctx->plane_state->src_rect.y;
+ }
+
if (pipe_ctx->plane_state->address.type
== PLN_ADDR_TYPE_VIDEO_PROGRESSIVE)
pos_cpy.enable = false;
diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c
index 9cc3314966bd..b0357546471b 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c
@@ -2004,6 +2004,12 @@ void dcn10_program_gamut_remap(struct pipe_ctx *pipe_ctx)
for (i = 0; i < CSC_TEMPERATURE_MATRIX_SIZE; i++)
adjust.temperature_matrix[i] =
pipe_ctx->stream->gamut_remap_matrix.matrix[i];
+ } else if (pipe_ctx->plane_state &&
+ pipe_ctx->plane_state->gamut_remap_matrix.enable_remap == true) {
+ adjust.gamut_adjust_type = GRAPHICS_GAMUT_ADJUST_TYPE_SW;
+ for (i = 0; i < CSC_TEMPERATURE_MATRIX_SIZE; i++)
+ adjust.temperature_matrix[i] =
+ pipe_ctx->plane_state->gamut_remap_matrix.matrix[i];
}
pipe_ctx->plane_res.dpp->funcs->dpp_set_gamut_remap(pipe_ctx->plane_res.dpp, &adjust);
@@ -3015,12 +3021,50 @@ void dcn10_set_cursor_position(struct pipe_ctx *pipe_ctx)
int x_pos = pos_cpy.x;
int y_pos = pos_cpy.y;
- // translate cursor from stream space to plane space
+ /**
+ * DC cursor is stream space, HW cursor is plane space and drawn
+ * as part of the framebuffer.
+ *
+ * Cursor position can't be negative, but hotspot can be used to
+ * shift cursor out of the plane bounds. Hotspot must be smaller
+ * than the cursor size.
+ */
+
+ /**
+ * Translate cursor from stream space to plane space.
+ *
+ * If the cursor is scaled then we need to scale the position
+ * to be in the approximately correct place. We can't do anything
+ * about the actual size being incorrect, that's a limitation of
+ * the hardware.
+ */
x_pos = (x_pos - x_plane) * pipe_ctx->plane_state->src_rect.width /
pipe_ctx->plane_state->dst_rect.width;
y_pos = (y_pos - y_plane) * pipe_ctx->plane_state->src_rect.height /
pipe_ctx->plane_state->dst_rect.height;
+ /**
+ * If the cursor's source viewport is clipped then we need to
+ * translate the cursor to appear in the correct position on
+ * the screen.
+ *
+ * This translation isn't affected by scaling so it needs to be
+ * done *after* we adjust the position for the scale factor.
+ *
+ * This is only done by opt-in for now since there are still
+ * some usecases like tiled display that might enable the
+ * cursor on both streams while expecting dc to clip it.
+ */
+ if (pos_cpy.translate_by_source) {
+ x_pos += pipe_ctx->plane_state->src_rect.x;
+ y_pos += pipe_ctx->plane_state->src_rect.y;
+ }
+
+ /**
+ * If the position is negative then we need to add to the hotspot
+ * to shift the cursor outside the plane.
+ */
+
if (x_pos < 0) {
pos_cpy.x_hotspot -= x_pos;
x_pos = 0;
diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.c
index 63acb8ff7462..17d96ec6acd8 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.c
@@ -343,6 +343,23 @@ void optc1_set_blank_data_double_buffer(struct timing_generator *optc, bool enab
}
/**
+ * optc1_set_timing_double_buffer() - DRR double buffering control
+ *
+ * Sets double buffer point for V_TOTAL, H_TOTAL, VTOTAL_MIN,
+ * VTOTAL_MAX, VTOTAL_MIN_SEL and VTOTAL_MAX_SEL registers.
+ *
+ * Options: any time, start of frame, dp start of frame (range timing)
+ */
+void optc1_set_timing_double_buffer(struct timing_generator *optc, bool enable)
+{
+ struct optc *optc1 = DCN10TG_FROM_TG(optc);
+ uint32_t mode = enable ? 2 : 0;
+
+ REG_UPDATE(OTG_DOUBLE_BUFFER_CONTROL,
+ OTG_RANGE_TIMING_DBUF_UPDATE_MODE, mode);
+}
+
+/**
* unblank_crtc
* Call ASIC Control Object to UnBlank CRTC.
*/
@@ -1353,6 +1370,7 @@ void optc1_clear_optc_underflow(struct timing_generator *optc)
void optc1_tg_init(struct timing_generator *optc)
{
optc1_set_blank_data_double_buffer(optc, true);
+ optc1_set_timing_double_buffer(optc, true);
optc1_clear_optc_underflow(optc);
}
diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.h b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.h
index f277656d5464..9a459a8fe8a0 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.h
+++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.h
@@ -185,6 +185,7 @@ struct dcn_optc_registers {
SF(OTG0_OTG_GLOBAL_CONTROL0, OTG_MASTER_UPDATE_LOCK_SEL, mask_sh),\
SF(OTG0_OTG_DOUBLE_BUFFER_CONTROL, OTG_UPDATE_PENDING, mask_sh),\
SF(OTG0_OTG_DOUBLE_BUFFER_CONTROL, OTG_BLANK_DATA_DOUBLE_BUFFER_EN, mask_sh),\
+ SF(OTG0_OTG_DOUBLE_BUFFER_CONTROL, OTG_RANGE_TIMING_DBUF_UPDATE_MODE, mask_sh),\
SF(OTG0_OTG_H_TOTAL, OTG_H_TOTAL, mask_sh),\
SF(OTG0_OTG_H_BLANK_START_END, OTG_H_BLANK_START, mask_sh),\
SF(OTG0_OTG_H_BLANK_START_END, OTG_H_BLANK_END, mask_sh),\
@@ -643,6 +644,8 @@ bool optc1_is_optc_underflow_occurred(struct timing_generator *optc);
void optc1_set_blank_data_double_buffer(struct timing_generator *optc, bool enable);
+void optc1_set_timing_double_buffer(struct timing_generator *optc, bool enable);
+
bool optc1_get_otg_active_size(struct timing_generator *optc,
uint32_t *otg_active_width,
uint32_t *otg_active_height);
diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_resource.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_resource.c
index 261bdc3a8218..07265ca7d28c 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_resource.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_resource.c
@@ -552,7 +552,8 @@ static const struct dc_plane_cap plane_cap = {
.pixel_format_support = {
.argb8888 = true,
.nv12 = true,
- .fp16 = true
+ .fp16 = true,
+ .p010 = true
},
.max_upscale_factor = {
@@ -584,7 +585,7 @@ static const struct dc_debug_options debug_defaults_drv = {
.disable_pplib_clock_request = false,
.disable_pplib_wm_range = false,
.pplib_wm_report_mode = WM_REPORT_DEFAULT,
- .pipe_split_policy = MPC_SPLIT_AVOID_MULT_DISP,
+ .pipe_split_policy = MPC_SPLIT_DYNAMIC,
.force_single_disp_pipe_split = true,
.disable_dcc = DCC_ENABLE,
.voltage_align_fclk = true,
diff --git a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c
index 233318260da4..22f421e82733 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c
@@ -1373,6 +1373,7 @@ static void dcn20_update_dchubp_dpp(
}
if (pipe_ctx->update_flags.bits.viewport ||
+ (context == dc->current_state && plane_state->update_flags.bits.position_change) ||
(context == dc->current_state && plane_state->update_flags.bits.scaling_change) ||
(context == dc->current_state && pipe_ctx->stream->update_flags.bits.scaling)) {
diff --git a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c
index a67395208991..5cdbba0cd873 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c
@@ -1012,7 +1012,8 @@ static const struct dc_plane_cap plane_cap = {
.pixel_format_support = {
.argb8888 = true,
.nv12 = true,
- .fp16 = true
+ .fp16 = true,
+ .p010 = true
},
.max_upscale_factor = {
@@ -3342,7 +3343,7 @@ void dcn20_cap_soc_clocks(
void dcn20_update_bounding_box(struct dc *dc, struct _vcs_dpi_soc_bounding_box_st *bb,
struct pp_smu_nv_clock_table *max_clocks, unsigned int *uclk_states, unsigned int num_states)
{
- struct _vcs_dpi_voltage_scaling_st calculated_states[MAX_CLOCK_LIMIT_STATES];
+ struct _vcs_dpi_voltage_scaling_st calculated_states[DC__VOLTAGE_STATES];
int i;
int num_calculated_states = 0;
int min_dcfclk = 0;
diff --git a/drivers/gpu/drm/amd/display/dc/dcn21/dcn21_resource.c b/drivers/gpu/drm/amd/display/dc/dcn21/dcn21_resource.c
index 51b5910cd05f..b25484aa8222 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn21/dcn21_resource.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn21/dcn21_resource.c
@@ -300,7 +300,7 @@ struct _vcs_dpi_soc_bounding_box_st dcn2_1_soc = {
.xfc_bus_transport_time_us = 4,
.xfc_xbuf_latency_tolerance_us = 4,
.use_urgent_burst_bw = 1,
- .num_states = 9
+ .num_states = 8
};
#ifndef MAX
@@ -838,7 +838,8 @@ static const struct dc_plane_cap plane_cap = {
.pixel_format_support = {
.argb8888 = true,
.nv12 = true,
- .fp16 = true
+ .fp16 = true,
+ .p010 = true
},
.max_upscale_factor = {
@@ -1376,21 +1377,8 @@ static void update_bw_bounding_box(struct dc *dc, struct clk_bw_params *bw_param
unsigned int i, j, k;
int closest_clk_lvl;
- // diags does not retrieve proper values from SMU
- // cap states to 5 and make state 5 the max state
- if (IS_FPGA_MAXIMUS_DC(dc->ctx->dce_environment) || IS_DIAG_DC(dc->ctx->dce_environment)) {
- dcn2_1_soc.num_states = 5;
-
- dcn2_1_soc.clock_limits[5].state = 5;
- dcn2_1_soc.clock_limits[5].dcfclk_mhz = 810.0;
- dcn2_1_soc.clock_limits[5].fabricclk_mhz = 1600.0;
- dcn2_1_soc.clock_limits[5].dispclk_mhz = 1395.0;
- dcn2_1_soc.clock_limits[5].dppclk_mhz = 1285.0;
- dcn2_1_soc.clock_limits[5].phyclk_mhz = 1325.0;
- dcn2_1_soc.clock_limits[5].socclk_mhz = 953.0;
- dcn2_1_soc.clock_limits[5].dscclk_mhz = 489.0;
- dcn2_1_soc.clock_limits[5].dram_speed_mts = 4266.0;
- } else {
+ // Default clock levels are used for diags, which may lead to overclocking.
+ if (!IS_FPGA_MAXIMUS_DC(dc->ctx->dce_environment) && !IS_DIAG_DC(dc->ctx->dce_environment)) {
dcn2_1_ip.max_num_otg = pool->base.res_cap->num_timing_generator;
dcn2_1_ip.max_num_dpp = pool->base.pipe_count;
dcn2_1_soc.num_chans = bw_params->num_channels;
@@ -1403,16 +1391,16 @@ static void update_bw_bounding_box(struct dc *dc, struct clk_bw_params *bw_param
dcn2_1_soc.clock_limits[0].dram_speed_mts = clk_table->entries[0].memclk_mhz * 2;
/*
- * Other levels: find cloest DCN clocks that fit the given clock limit using dcfclk
- * as indicater
+ * Other levels: find closest DCN clocks that fit the given clock limit using dcfclk
+ * as indicator
*/
closest_clk_lvl = -1;
/* index currently being filled */
k = 1;
for (i = 1; i < clk_table->num_entries; i++) {
- /* loop backwards, skip duplicate state, +1 because SMU has precision issue */
- for (j = dcn2_1_soc.num_states - 2; j >= k; j--) {
+ /* loop backwards, skip duplicate state*/
+ for (j = dcn2_1_soc.num_states - 1; j >= k; j--) {
if ((unsigned int) dcn2_1_soc.clock_limits[j].dcfclk_mhz <= clk_table->entries[i].dcfclk_mhz) {
closest_clk_lvl = j;
break;
@@ -1437,13 +1425,13 @@ static void update_bw_bounding_box(struct dc *dc, struct clk_bw_params *bw_param
k++;
}
}
-
- /* duplicate last level */
- dcn2_1_soc.clock_limits[k] = dcn2_1_soc.clock_limits[k - 1];
- dcn2_1_soc.clock_limits[k].state = k;
- dcn2_1_soc.num_states = k + 1;
+ dcn2_1_soc.num_states = k;
}
+ /* duplicate last level */
+ dcn2_1_soc.clock_limits[dcn2_1_soc.num_states] = dcn2_1_soc.clock_limits[dcn2_1_soc.num_states - 1];
+ dcn2_1_soc.clock_limits[dcn2_1_soc.num_states].state = dcn2_1_soc.num_states;
+
dml_init_instance(&dc->dml, &dcn2_1_soc, &dcn2_1_ip, DML_PROJECT_DCN21);
}
diff --git a/drivers/gpu/drm/amd/display/dc/dml/dc_features.h b/drivers/gpu/drm/amd/display/dc/dml/dc_features.h
index ea4cde952f4f..2a1983324629 100644
--- a/drivers/gpu/drm/amd/display/dc/dml/dc_features.h
+++ b/drivers/gpu/drm/amd/display/dc/dml/dc_features.h
@@ -29,7 +29,7 @@
#define DC__PRESENT 1
#define DC__PRESENT__1 1
#define DC__NUM_DPP 4
-#define DC__VOLTAGE_STATES 7
+#define DC__VOLTAGE_STATES 9
#define DC__NUM_DPP__4 1
#define DC__NUM_DPP__0_PRESENT 1
#define DC__NUM_DPP__1_PRESENT 1
diff --git a/drivers/gpu/drm/amd/display/dc/dml/display_mode_structs.h b/drivers/gpu/drm/amd/display/dc/dml/display_mode_structs.h
index dfd3be452766..687010c17324 100644
--- a/drivers/gpu/drm/amd/display/dc/dml/display_mode_structs.h
+++ b/drivers/gpu/drm/amd/display/dc/dml/display_mode_structs.h
@@ -22,11 +22,12 @@
* Authors: AMD
*
*/
+
+#include "dc_features.h"
+
#ifndef __DISPLAY_MODE_STRUCTS_H__
#define __DISPLAY_MODE_STRUCTS_H__
-#define MAX_CLOCK_LIMIT_STATES 9
-
typedef struct _vcs_dpi_voltage_scaling_st voltage_scaling_st;
typedef struct _vcs_dpi_soc_bounding_box_st soc_bounding_box_st;
typedef struct _vcs_dpi_ip_params_st ip_params_st;
@@ -68,7 +69,7 @@ struct _vcs_dpi_voltage_scaling_st {
};
struct _vcs_dpi_soc_bounding_box_st {
- struct _vcs_dpi_voltage_scaling_st clock_limits[MAX_CLOCK_LIMIT_STATES];
+ struct _vcs_dpi_voltage_scaling_st clock_limits[DC__VOLTAGE_STATES];
unsigned int num_states;
double sr_exit_time_us;
double sr_enter_plus_exit_time_us;
diff --git a/drivers/gpu/drm/amd/display/include/dal_asic_id.h b/drivers/gpu/drm/amd/display/include/dal_asic_id.h
index 8a87d0ed90ae..2359e88d6029 100644
--- a/drivers/gpu/drm/amd/display/include/dal_asic_id.h
+++ b/drivers/gpu/drm/amd/display/include/dal_asic_id.h
@@ -136,6 +136,7 @@
#define RAVEN2_A0 0x81
#define RAVEN1_F0 0xF0
#define RAVEN_UNKNOWN 0xFF
+#define RENOIR_A0 0x91
#ifndef ASICREV_IS_RAVEN
#define ASICREV_IS_RAVEN(eChipRev) ((eChipRev >= RAVEN_A0) && eChipRev < RAVEN_UNKNOWN)
#endif
@@ -171,8 +172,6 @@ enum {
#define ASICREV_IS_NAVI10_P(eChipRev) (eChipRev < NV_NAVI12_P_A0)
#define ASICREV_IS_NAVI12_P(eChipRev) ((eChipRev >= NV_NAVI12_P_A0) && (eChipRev < NV_NAVI14_M_A0))
#define ASICREV_IS_NAVI14_M(eChipRev) ((eChipRev >= NV_NAVI14_M_A0) && (eChipRev < NV_UNKNOWN))
-#define RENOIR_A0 0x91
-#define DEVICE_ID_RENOIR_1636 0x1636 // Renoir
#define ASICREV_IS_RENOIR(eChipRev) ((eChipRev >= RENOIR_A0) && (eChipRev < RAVEN1_F0))
/*
@@ -183,6 +182,9 @@ enum {
#define DEVICE_ID_TEMASH_9839 0x9839
#define DEVICE_ID_TEMASH_983D 0x983D
+/* RENOIR */
+#define DEVICE_ID_RENOIR_1636 0x1636
+
/* Asic Family IDs for different asic family. */
#define FAMILY_CI 120 /* Sea Islands: Hawaii (P), Bonaire (M) */
#define FAMILY_KV 125 /* Fusion => Kaveri: Spectre, Spooky; Kabini: Kalindi */
diff --git a/drivers/gpu/drm/amd/display/modules/freesync/freesync.c b/drivers/gpu/drm/amd/display/modules/freesync/freesync.c
index 4e542826cd26..c33454a9e0b4 100644
--- a/drivers/gpu/drm/amd/display/modules/freesync/freesync.c
+++ b/drivers/gpu/drm/amd/display/modules/freesync/freesync.c
@@ -734,6 +734,7 @@ void mod_freesync_build_vrr_params(struct mod_freesync *mod_freesync,
{
struct core_freesync *core_freesync = NULL;
unsigned long long nominal_field_rate_in_uhz = 0;
+ unsigned long long rounded_nominal_in_uhz = 0;
unsigned int refresh_range = 0;
unsigned long long min_refresh_in_uhz = 0;
unsigned long long max_refresh_in_uhz = 0;
@@ -750,17 +751,20 @@ void mod_freesync_build_vrr_params(struct mod_freesync *mod_freesync,
min_refresh_in_uhz = in_config->min_refresh_in_uhz;
max_refresh_in_uhz = in_config->max_refresh_in_uhz;
- // Don't allow min > max
- if (min_refresh_in_uhz > max_refresh_in_uhz)
- min_refresh_in_uhz = max_refresh_in_uhz;
-
// Full range may be larger than current video timing, so cap at nominal
if (max_refresh_in_uhz > nominal_field_rate_in_uhz)
max_refresh_in_uhz = nominal_field_rate_in_uhz;
// Full range may be larger than current video timing, so cap at nominal
- if (min_refresh_in_uhz > nominal_field_rate_in_uhz)
- min_refresh_in_uhz = nominal_field_rate_in_uhz;
+ if (min_refresh_in_uhz > max_refresh_in_uhz)
+ min_refresh_in_uhz = max_refresh_in_uhz;
+
+ // If a monitor reports exactly max refresh of 2x of min, enforce it on nominal
+ rounded_nominal_in_uhz =
+ div_u64(nominal_field_rate_in_uhz + 50000, 100000) * 100000;
+ if (in_config->max_refresh_in_uhz == (2 * in_config->min_refresh_in_uhz) &&
+ in_config->max_refresh_in_uhz == rounded_nominal_in_uhz)
+ min_refresh_in_uhz = div_u64(nominal_field_rate_in_uhz, 2);
if (!vrr_settings_require_update(core_freesync,
in_config, (unsigned int)min_refresh_in_uhz, (unsigned int)max_refresh_in_uhz,
@@ -792,11 +796,6 @@ void mod_freesync_build_vrr_params(struct mod_freesync *mod_freesync,
refresh_range = in_out_vrr->max_refresh_in_uhz -
in_out_vrr->min_refresh_in_uhz;
- in_out_vrr->btr.margin_in_us = in_out_vrr->max_duration_in_us -
- 2 * in_out_vrr->min_duration_in_us;
- if (in_out_vrr->btr.margin_in_us > BTR_MAX_MARGIN)
- in_out_vrr->btr.margin_in_us = BTR_MAX_MARGIN;
-
in_out_vrr->supported = true;
}
@@ -804,9 +803,14 @@ void mod_freesync_build_vrr_params(struct mod_freesync *mod_freesync,
in_out_vrr->btr.btr_enabled = in_config->btr;
- if (in_out_vrr->max_refresh_in_uhz <
- 2 * in_out_vrr->min_refresh_in_uhz)
+ if (in_out_vrr->max_refresh_in_uhz < (2 * in_out_vrr->min_refresh_in_uhz))
in_out_vrr->btr.btr_enabled = false;
+ else {
+ in_out_vrr->btr.margin_in_us = in_out_vrr->max_duration_in_us -
+ 2 * in_out_vrr->min_duration_in_us;
+ if (in_out_vrr->btr.margin_in_us > BTR_MAX_MARGIN)
+ in_out_vrr->btr.margin_in_us = BTR_MAX_MARGIN;
+ }
in_out_vrr->btr.btr_active = false;
in_out_vrr->btr.inserted_duration_in_us = 0;
@@ -1008,8 +1012,8 @@ unsigned long long mod_freesync_calc_nominal_field_rate(
unsigned int total = stream->timing.h_total * stream->timing.v_total;
/* Calculate nominal field rate for stream, rounded up to nearest integer */
- nominal_field_rate_in_uhz = stream->timing.pix_clk_100hz / 10;
- nominal_field_rate_in_uhz *= 1000ULL * 1000ULL * 1000ULL;
+ nominal_field_rate_in_uhz = stream->timing.pix_clk_100hz;
+ nominal_field_rate_in_uhz *= 100000000ULL;
nominal_field_rate_in_uhz = div_u64(nominal_field_rate_in_uhz, total);
diff --git a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp.c b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp.c
index e9fbd94f8635..cc1d3f470b99 100644
--- a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp.c
+++ b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp.c
@@ -328,8 +328,7 @@ enum mod_hdcp_status mod_hdcp_add_display(struct mod_hdcp *hdcp,
/* add display to connection */
hdcp->connection.link = *link;
*display_container = *display;
- status = mod_hdcp_add_display_to_topology(hdcp, display_container);
-
+ status = mod_hdcp_add_display_to_topology(hdcp, display->index);
if (status != MOD_HDCP_STATUS_SUCCESS)
goto out;
@@ -375,7 +374,7 @@ enum mod_hdcp_status mod_hdcp_remove_display(struct mod_hdcp *hdcp,
status = mod_hdcp_remove_display_from_topology(hdcp, index);
if (status != MOD_HDCP_STATUS_SUCCESS)
goto out;
- memset(display, 0, sizeof(struct mod_hdcp_display));
+ display->state = MOD_HDCP_DISPLAY_INACTIVE;
/* request authentication when connection is not reset */
if (current_state(hdcp) != HDCP_UNINITIALIZED)
diff --git a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp.h b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp.h
index 60ff1a0028ac..5cb4546be0ef 100644
--- a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp.h
+++ b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp.h
@@ -328,7 +328,7 @@ void mod_hdcp_dump_binary_message(uint8_t *msg, uint32_t msg_size,
/* psp functions */
enum mod_hdcp_status mod_hdcp_add_display_to_topology(
- struct mod_hdcp *hdcp, struct mod_hdcp_display *display);
+ struct mod_hdcp *hdcp, uint8_t index);
enum mod_hdcp_status mod_hdcp_remove_display_from_topology(
struct mod_hdcp *hdcp, uint8_t index);
enum mod_hdcp_status mod_hdcp_hdcp1_create_session(struct mod_hdcp *hdcp);
@@ -503,6 +503,11 @@ static inline uint8_t is_display_active(struct mod_hdcp_display *display)
return display->state >= MOD_HDCP_DISPLAY_ACTIVE;
}
+static inline uint8_t is_display_added(struct mod_hdcp_display *display)
+{
+ return display->state >= MOD_HDCP_DISPLAY_ACTIVE_AND_ADDED;
+}
+
static inline uint8_t is_display_encryption_enabled(struct mod_hdcp_display *display)
{
return display->state >= MOD_HDCP_DISPLAY_ENCRYPTION_ENABLED;
@@ -510,23 +515,34 @@ static inline uint8_t is_display_encryption_enabled(struct mod_hdcp_display *dis
static inline uint8_t get_active_display_count(struct mod_hdcp *hdcp)
{
- uint8_t active_count = 0;
+ uint8_t added_count = 0;
uint8_t i;
for (i = 0; i < MAX_NUM_OF_DISPLAYS; i++)
if (is_display_active(&hdcp->displays[i]))
- active_count++;
- return active_count;
+ added_count++;
+ return added_count;
+}
+
+static inline uint8_t get_added_display_count(struct mod_hdcp *hdcp)
+{
+ uint8_t added_count = 0;
+ uint8_t i;
+
+ for (i = 0; i < MAX_NUM_OF_DISPLAYS; i++)
+ if (is_display_added(&hdcp->displays[i]))
+ added_count++;
+ return added_count;
}
-static inline struct mod_hdcp_display *get_first_active_display(
+static inline struct mod_hdcp_display *get_first_added_display(
struct mod_hdcp *hdcp)
{
uint8_t i;
struct mod_hdcp_display *display = NULL;
for (i = 0; i < MAX_NUM_OF_DISPLAYS; i++)
- if (is_display_active(&hdcp->displays[i])) {
+ if (is_display_added(&hdcp->displays[i])) {
display = &hdcp->displays[i];
break;
}
diff --git a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp1_execution.c b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp1_execution.c
index f244b72e74e0..37c8c05497d6 100644
--- a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp1_execution.c
+++ b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp1_execution.c
@@ -129,7 +129,7 @@ static inline uint8_t get_device_count(struct mod_hdcp *hdcp)
static inline enum mod_hdcp_status check_device_count(struct mod_hdcp *hdcp)
{
/* device count must be greater than or equal to tracked hdcp displays */
- return (get_device_count(hdcp) < get_active_display_count(hdcp)) ?
+ return (get_device_count(hdcp) < get_added_display_count(hdcp)) ?
MOD_HDCP_STATUS_HDCP1_DEVICE_COUNT_MISMATCH_FAILURE :
MOD_HDCP_STATUS_SUCCESS;
}
diff --git a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c
index 549c113abcf7..491c00f48026 100644
--- a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c
+++ b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c
@@ -208,7 +208,7 @@ static inline uint8_t get_device_count(struct mod_hdcp *hdcp)
static enum mod_hdcp_status check_device_count(struct mod_hdcp *hdcp)
{
/* device count must be greater than or equal to tracked hdcp displays */
- return (get_device_count(hdcp) < get_active_display_count(hdcp)) ?
+ return (get_device_count(hdcp) < get_added_display_count(hdcp)) ?
MOD_HDCP_STATUS_HDCP2_DEVICE_COUNT_MISMATCH_FAILURE :
MOD_HDCP_STATUS_SUCCESS;
}
diff --git a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_psp.c b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_psp.c
index 836e47954938..c2929815c3ee 100644
--- a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_psp.c
+++ b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_psp.c
@@ -54,7 +54,7 @@ enum mod_hdcp_status mod_hdcp_remove_display_from_topology(
dtm_cmd = (struct ta_dtm_shared_memory *)psp->dtm_context.dtm_shared_buf;
- if (!display || !is_display_active(display))
+ if (!display || !is_display_added(display))
return MOD_HDCP_STATUS_DISPLAY_NOT_FOUND;
memset(dtm_cmd, 0, sizeof(struct ta_dtm_shared_memory));
@@ -73,21 +73,25 @@ enum mod_hdcp_status mod_hdcp_remove_display_from_topology(
HDCP_TOP_REMOVE_DISPLAY_TRACE(hdcp, display->index);
return MOD_HDCP_STATUS_SUCCESS;
- }
-
-enum mod_hdcp_status mod_hdcp_add_display_to_topology(
- struct mod_hdcp *hdcp, struct mod_hdcp_display *display)
+
+}
+enum mod_hdcp_status mod_hdcp_add_display_to_topology(struct mod_hdcp *hdcp,
+ uint8_t index)
{
struct psp_context *psp = hdcp->config.psp.handle;
struct ta_dtm_shared_memory *dtm_cmd;
+ struct mod_hdcp_display *display =
+ get_active_display_at_index(hdcp, index);
struct mod_hdcp_link *link = &hdcp->connection.link;
if (!psp->dtm_context.dtm_initialized) {
DRM_ERROR("Failed to add display topology, DTM TA is not initialized.");
- display->state = MOD_HDCP_DISPLAY_INACTIVE;
return MOD_HDCP_STATUS_FAILURE;
}
+ if (!display || is_display_added(display))
+ return MOD_HDCP_STATUS_UPDATE_TOPOLOGY_FAILURE;
+
dtm_cmd = (struct ta_dtm_shared_memory *)psp->dtm_context.dtm_shared_buf;
memset(dtm_cmd, 0, sizeof(struct ta_dtm_shared_memory));
@@ -109,11 +113,10 @@ enum mod_hdcp_status mod_hdcp_add_display_to_topology(
psp_dtm_invoke(psp, dtm_cmd->cmd_id);
- if (dtm_cmd->dtm_status != TA_DTM_STATUS__SUCCESS) {
- display->state = MOD_HDCP_DISPLAY_INACTIVE;
+ if (dtm_cmd->dtm_status != TA_DTM_STATUS__SUCCESS)
return MOD_HDCP_STATUS_UPDATE_TOPOLOGY_FAILURE;
- }
+ display->state = MOD_HDCP_DISPLAY_ACTIVE_AND_ADDED;
HDCP_TOP_ADD_DISPLAY_TRACE(hdcp, display->index);
return MOD_HDCP_STATUS_SUCCESS;
@@ -123,7 +126,7 @@ enum mod_hdcp_status mod_hdcp_hdcp1_create_session(struct mod_hdcp *hdcp)
{
struct psp_context *psp = hdcp->config.psp.handle;
- struct mod_hdcp_display *display = get_first_active_display(hdcp);
+ struct mod_hdcp_display *display = get_first_added_display(hdcp);
struct ta_hdcp_shared_memory *hdcp_cmd;
if (!psp->hdcp_context.hdcp_initialized) {
@@ -176,7 +179,7 @@ enum mod_hdcp_status mod_hdcp_hdcp1_destroy_session(struct mod_hdcp *hdcp)
if (is_display_encryption_enabled(
&hdcp->displays[i])) {
hdcp->displays[i].state =
- MOD_HDCP_DISPLAY_ACTIVE;
+ MOD_HDCP_DISPLAY_ACTIVE_AND_ADDED;
HDCP_HDCP1_DISABLED_TRACE(hdcp,
hdcp->displays[i].index);
}
@@ -228,7 +231,7 @@ enum mod_hdcp_status mod_hdcp_hdcp1_enable_encryption(struct mod_hdcp *hdcp)
{
struct psp_context *psp = hdcp->config.psp.handle;
struct ta_hdcp_shared_memory *hdcp_cmd;
- struct mod_hdcp_display *display = get_first_active_display(hdcp);
+ struct mod_hdcp_display *display = get_first_added_display(hdcp);
hdcp_cmd = (struct ta_hdcp_shared_memory *)psp->hdcp_context.hdcp_shared_buf;
memset(hdcp_cmd, 0, sizeof(struct ta_hdcp_shared_memory));
@@ -298,7 +301,8 @@ enum mod_hdcp_status mod_hdcp_hdcp1_enable_dp_stream_encryption(struct mod_hdcp
for (i = 0; i < MAX_NUM_OF_DISPLAYS; i++) {
- if (hdcp->displays[i].adjust.disable)
+ if (hdcp->displays[i].state != MOD_HDCP_DISPLAY_ACTIVE_AND_ADDED ||
+ hdcp->displays[i].adjust.disable)
continue;
memset(hdcp_cmd, 0, sizeof(struct ta_hdcp_shared_memory));
@@ -360,7 +364,7 @@ enum mod_hdcp_status mod_hdcp_hdcp2_create_session(struct mod_hdcp *hdcp)
{
struct psp_context *psp = hdcp->config.psp.handle;
struct ta_hdcp_shared_memory *hdcp_cmd;
- struct mod_hdcp_display *display = get_first_active_display(hdcp);
+ struct mod_hdcp_display *display = get_first_added_display(hdcp);
if (!psp->hdcp_context.hdcp_initialized) {
DRM_ERROR("Failed to create hdcp session, HDCP TA is not initialized");
@@ -419,7 +423,7 @@ enum mod_hdcp_status mod_hdcp_hdcp2_destroy_session(struct mod_hdcp *hdcp)
if (is_display_encryption_enabled(
&hdcp->displays[i])) {
hdcp->displays[i].state =
- MOD_HDCP_DISPLAY_ACTIVE;
+ MOD_HDCP_DISPLAY_ACTIVE_AND_ADDED;
HDCP_HDCP2_DISABLED_TRACE(hdcp,
hdcp->displays[i].index);
}
@@ -658,7 +662,7 @@ enum mod_hdcp_status mod_hdcp_hdcp2_enable_encryption(struct mod_hdcp *hdcp)
{
struct psp_context *psp = hdcp->config.psp.handle;
struct ta_hdcp_shared_memory *hdcp_cmd;
- struct mod_hdcp_display *display = get_first_active_display(hdcp);
+ struct mod_hdcp_display *display = get_first_added_display(hdcp);
hdcp_cmd = (struct ta_hdcp_shared_memory *)psp->hdcp_context.hdcp_shared_buf;
memset(hdcp_cmd, 0, sizeof(struct ta_hdcp_shared_memory));
@@ -743,7 +747,8 @@ enum mod_hdcp_status mod_hdcp_hdcp2_enable_dp_stream_encryption(struct mod_hdcp
for (i = 0; i < MAX_NUM_OF_DISPLAYS; i++) {
- if (hdcp->displays[i].adjust.disable)
+ if (hdcp->displays[i].state != MOD_HDCP_DISPLAY_ACTIVE_AND_ADDED ||
+ hdcp->displays[i].adjust.disable)
continue;
hdcp_cmd->in_msg.hdcp2_enable_dp_stream_encryption.display_handle = hdcp->displays[i].index;
hdcp_cmd->in_msg.hdcp2_enable_dp_stream_encryption.session_handle = hdcp->auth.id;
diff --git a/drivers/gpu/drm/amd/display/modules/inc/mod_hdcp.h b/drivers/gpu/drm/amd/display/modules/inc/mod_hdcp.h
index eae9309cfb24..c088602bc1a0 100644
--- a/drivers/gpu/drm/amd/display/modules/inc/mod_hdcp.h
+++ b/drivers/gpu/drm/amd/display/modules/inc/mod_hdcp.h
@@ -117,6 +117,7 @@ enum mod_hdcp_operation_mode {
enum mod_hdcp_display_state {
MOD_HDCP_DISPLAY_INACTIVE = 0,
MOD_HDCP_DISPLAY_ACTIVE,
+ MOD_HDCP_DISPLAY_ACTIVE_AND_ADDED,
MOD_HDCP_DISPLAY_ENCRYPTION_ENABLED
};
diff --git a/drivers/gpu/drm/amd/powerplay/amd_powerplay.c b/drivers/gpu/drm/amd/powerplay/amd_powerplay.c
index c195575366a3..2a12614a12c2 100644
--- a/drivers/gpu/drm/amd/powerplay/amd_powerplay.c
+++ b/drivers/gpu/drm/amd/powerplay/amd_powerplay.c
@@ -1452,7 +1452,8 @@ static int pp_get_asic_baco_state(void *handle, int *state)
if (!hwmgr)
return -EINVAL;
- if (!hwmgr->pm_en || !hwmgr->hwmgr_func->get_asic_baco_state)
+ if (!(hwmgr->not_vf && amdgpu_dpm) ||
+ !hwmgr->hwmgr_func->get_asic_baco_state)
return 0;
mutex_lock(&hwmgr->smu_lock);
diff --git a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
index c6d3bef15320..1ef0923f7190 100644
--- a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
+++ b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
@@ -35,6 +35,7 @@
#include "arcturus_ppt.h"
#include "smu_v11_0_pptable.h"
#include "arcturus_ppsmc.h"
+#include "nbio/nbio_7_4_offset.h"
#include "nbio/nbio_7_4_sh_mask.h"
#include "amdgpu_xgmi.h"
#include <linux/i2c.h>
@@ -793,8 +794,21 @@ static int arcturus_force_clk_levels(struct smu_context *smu,
struct arcturus_dpm_table *dpm_table;
struct arcturus_single_dpm_table *single_dpm_table;
uint32_t soft_min_level, soft_max_level;
+ uint32_t smu_version;
int ret = 0;
+ ret = smu_get_smc_version(smu, NULL, &smu_version);
+ if (ret) {
+ pr_err("Failed to get smu version!\n");
+ return ret;
+ }
+
+ if (smu_version >= 0x361200) {
+ pr_err("Forcing clock level is not supported with "
+ "54.18 and onwards SMU firmwares\n");
+ return -EOPNOTSUPP;
+ }
+
soft_min_level = mask ? (ffs(mask) - 1) : 0;
soft_max_level = mask ? (fls(mask) - 1) : 0;
@@ -1511,6 +1525,38 @@ static int arcturus_set_power_profile_mode(struct smu_context *smu,
return 0;
}
+static int arcturus_set_performance_level(struct smu_context *smu,
+ enum amd_dpm_forced_level level)
+{
+ uint32_t smu_version;
+ int ret;
+
+ ret = smu_get_smc_version(smu, NULL, &smu_version);
+ if (ret) {
+ pr_err("Failed to get smu version!\n");
+ return ret;
+ }
+
+ switch (level) {
+ case AMD_DPM_FORCED_LEVEL_HIGH:
+ case AMD_DPM_FORCED_LEVEL_LOW:
+ case AMD_DPM_FORCED_LEVEL_PROFILE_STANDARD:
+ case AMD_DPM_FORCED_LEVEL_PROFILE_MIN_SCLK:
+ case AMD_DPM_FORCED_LEVEL_PROFILE_MIN_MCLK:
+ case AMD_DPM_FORCED_LEVEL_PROFILE_PEAK:
+ if (smu_version >= 0x361200) {
+ pr_err("Forcing clock level is not supported with "
+ "54.18 and onwards SMU firmwares\n");
+ return -EOPNOTSUPP;
+ }
+ break;
+ default:
+ break;
+ }
+
+ return smu_v11_0_set_performance_level(smu, level);
+}
+
static void arcturus_dump_pptable(struct smu_context *smu)
{
struct smu_table_context *table_context = &smu->smu_table;
@@ -2210,6 +2256,18 @@ static void arcturus_i2c_eeprom_control_fini(struct i2c_adapter *control)
i2c_del_adapter(control);
}
+static bool arcturus_is_baco_supported(struct smu_context *smu)
+{
+ struct amdgpu_device *adev = smu->adev;
+ uint32_t val;
+
+ if (!smu_v11_0_baco_is_support(smu))
+ return false;
+
+ val = RREG32_SOC15(NBIO, 0, mmRCC_BIF_STRAP0);
+ return (val & RCC_BIF_STRAP0__STRAP_PX_CAPABLE_MASK) ? true : false;
+}
+
static uint32_t arcturus_get_pptable_power_limit(struct smu_context *smu)
{
PPTable_t *pptable = smu->smu_table.driver_pptable;
@@ -2272,7 +2330,7 @@ static const struct pptable_funcs arcturus_ppt_funcs = {
.get_profiling_clk_mask = arcturus_get_profiling_clk_mask,
.get_power_profile_mode = arcturus_get_power_profile_mode,
.set_power_profile_mode = arcturus_set_power_profile_mode,
- .set_performance_level = smu_v11_0_set_performance_level,
+ .set_performance_level = arcturus_set_performance_level,
/* debug (internal used) */
.dump_pptable = arcturus_dump_pptable,
.get_power_limit = arcturus_get_power_limit,
@@ -2321,7 +2379,7 @@ static const struct pptable_funcs arcturus_ppt_funcs = {
.register_irq_handler = smu_v11_0_register_irq_handler,
.set_azalia_d3_pme = smu_v11_0_set_azalia_d3_pme,
.get_max_sustainable_clocks_by_dc = smu_v11_0_get_max_sustainable_clocks_by_dc,
- .baco_is_support= smu_v11_0_baco_is_support,
+ .baco_is_support= arcturus_is_baco_supported,
.baco_get_state = smu_v11_0_baco_get_state,
.baco_set_state = smu_v11_0_baco_set_state,
.baco_enter = smu_v11_0_baco_enter,
diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
index 9c60b38ab53a..15030284b444 100644
--- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
+++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
@@ -28,13 +28,15 @@
#include "smu_internal.h"
#include "atomfirmware.h"
#include "amdgpu_atomfirmware.h"
+#include "soc15_common.h"
#include "smu_v11_0.h"
#include "smu11_driver_if_navi10.h"
#include "atom.h"
#include "navi10_ppt.h"
#include "smu_v11_0_pptable.h"
#include "smu_v11_0_ppsmc.h"
-#include "nbio/nbio_7_4_sh_mask.h"
+#include "nbio/nbio_2_3_offset.h"
+#include "nbio/nbio_2_3_sh_mask.h"
#include "asic_reg/mp/mp_11_0_sh_mask.h"
@@ -1985,6 +1987,18 @@ static int navi10_setup_od_limits(struct smu_context *smu) {
return 0;
}
+static bool navi10_is_baco_supported(struct smu_context *smu)
+{
+ struct amdgpu_device *adev = smu->adev;
+ uint32_t val;
+
+ if (!smu_v11_0_baco_is_support(smu))
+ return false;
+
+ val = RREG32_SOC15(NBIO, 0, mmRCC_BIF_STRAP0);
+ return (val & RCC_BIF_STRAP0__STRAP_PX_CAPABLE_MASK) ? true : false;
+}
+
static int navi10_set_default_od_settings(struct smu_context *smu, bool initialize) {
OverDriveTable_t *od_table, *boot_od_table;
int ret = 0;
@@ -2361,7 +2375,7 @@ static const struct pptable_funcs navi10_ppt_funcs = {
.register_irq_handler = smu_v11_0_register_irq_handler,
.set_azalia_d3_pme = smu_v11_0_set_azalia_d3_pme,
.get_max_sustainable_clocks_by_dc = smu_v11_0_get_max_sustainable_clocks_by_dc,
- .baco_is_support= smu_v11_0_baco_is_support,
+ .baco_is_support= navi10_is_baco_supported,
.baco_get_state = smu_v11_0_baco_get_state,
.baco_set_state = smu_v11_0_baco_set_state,
.baco_enter = smu_v11_0_baco_enter,
diff --git a/drivers/gpu/drm/amd/powerplay/renoir_ppt.c b/drivers/gpu/drm/amd/powerplay/renoir_ppt.c
index 7bf52ecba01d..ff73a735b888 100644
--- a/drivers/gpu/drm/amd/powerplay/renoir_ppt.c
+++ b/drivers/gpu/drm/amd/powerplay/renoir_ppt.c
@@ -239,6 +239,7 @@ static int renoir_print_clk_levels(struct smu_context *smu,
uint32_t cur_value = 0, value = 0, count = 0, min = 0, max = 0;
DpmClocks_t *clk_table = smu->smu_table.clocks_table;
SmuMetrics_t metrics;
+ bool cur_value_match_level = false;
if (!clk_table || clk_type >= SMU_CLK_COUNT)
return -EINVAL;
@@ -297,8 +298,13 @@ static int renoir_print_clk_levels(struct smu_context *smu,
GET_DPM_CUR_FREQ(clk_table, clk_type, i, value);
size += sprintf(buf + size, "%d: %uMhz %s\n", i, value,
cur_value == value ? "*" : "");
+ if (cur_value == value)
+ cur_value_match_level = true;
}
+ if (!cur_value_match_level)
+ size += sprintf(buf + size, " %uMhz *\n", cur_value);
+
return size;
}
@@ -887,6 +893,17 @@ static int renoir_read_sensor(struct smu_context *smu,
return ret;
}
+static bool renoir_is_dpm_running(struct smu_context *smu)
+{
+ /*
+ * Util now, the pmfw hasn't exported the interface of SMU
+ * feature mask to APU SKU so just force on all the feature
+ * at early initial stage.
+ */
+ return true;
+
+}
+
static const struct pptable_funcs renoir_ppt_funcs = {
.get_smu_msg_index = renoir_get_smu_msg_index,
.get_smu_clk_index = renoir_get_smu_clk_index,
@@ -927,6 +944,7 @@ static const struct pptable_funcs renoir_ppt_funcs = {
.mode2_reset = smu_v12_0_mode2_reset,
.set_soft_freq_limited_range = smu_v12_0_set_soft_freq_limited_range,
.set_driver_table_location = smu_v12_0_set_driver_table_location,
+ .is_dpm_running = renoir_is_dpm_running,
};
void renoir_set_ppt_funcs(struct smu_context *smu)
diff --git a/drivers/gpu/drm/amd/powerplay/renoir_ppt.h b/drivers/gpu/drm/amd/powerplay/renoir_ppt.h
index 2a390ddd37dd..89cd6da118a3 100644
--- a/drivers/gpu/drm/amd/powerplay/renoir_ppt.h
+++ b/drivers/gpu/drm/amd/powerplay/renoir_ppt.h
@@ -37,7 +37,7 @@ extern void renoir_set_ppt_funcs(struct smu_context *smu);
freq = table->SocClocks[dpm_level].Freq; \
break; \
case SMU_MCLK: \
- freq = table->MemClocks[dpm_level].Freq; \
+ freq = table->FClocks[dpm_level].Freq; \
break; \
case SMU_DCEFCLK: \
freq = table->DcfClocks[dpm_level].Freq; \
diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
index d19e1d0d56c0..541c932a6005 100644
--- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
+++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
@@ -42,8 +42,6 @@
#include "asic_reg/thm/thm_11_0_2_sh_mask.h"
#include "asic_reg/mp/mp_11_0_offset.h"
#include "asic_reg/mp/mp_11_0_sh_mask.h"
-#include "asic_reg/nbio/nbio_7_4_offset.h"
-#include "asic_reg/nbio/nbio_7_4_sh_mask.h"
#include "asic_reg/smuio/smuio_11_0_0_offset.h"
#include "asic_reg/smuio/smuio_11_0_0_sh_mask.h"
@@ -1662,9 +1660,7 @@ static int smu_v11_0_baco_set_armd3_sequence(struct smu_context *smu, enum smu_v
bool smu_v11_0_baco_is_support(struct smu_context *smu)
{
- struct amdgpu_device *adev = smu->adev;
struct smu_baco_context *smu_baco = &smu->smu_baco;
- uint32_t val;
bool baco_support;
mutex_lock(&smu_baco->mutex);
@@ -1679,11 +1675,7 @@ bool smu_v11_0_baco_is_support(struct smu_context *smu)
!smu_feature_is_enabled(smu, SMU_FEATURE_BACO_BIT))
return false;
- val = RREG32_SOC15(NBIO, 0, mmRCC_BIF_STRAP0);
- if (val & RCC_BIF_STRAP0__STRAP_PX_CAPABLE_MASK)
- return true;
-
- return false;
+ return true;
}
enum smu_baco_state smu_v11_0_baco_get_state(struct smu_context *smu)
@@ -1700,11 +1692,9 @@ enum smu_baco_state smu_v11_0_baco_get_state(struct smu_context *smu)
int smu_v11_0_baco_set_state(struct smu_context *smu, enum smu_baco_state state)
{
-
struct smu_baco_context *smu_baco = &smu->smu_baco;
struct amdgpu_device *adev = smu->adev;
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
- uint32_t bif_doorbell_intr_cntl;
uint32_t data;
int ret = 0;
@@ -1713,14 +1703,7 @@ int smu_v11_0_baco_set_state(struct smu_context *smu, enum smu_baco_state state)
mutex_lock(&smu_baco->mutex);
- bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL);
-
if (state == SMU_BACO_STATE_ENTER) {
- bif_doorbell_intr_cntl = REG_SET_FIELD(bif_doorbell_intr_cntl,
- BIF_DOORBELL_INT_CNTL,
- DOORBELL_INTERRUPT_DISABLE, 1);
- WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);
-
if (!ras || !ras->supported) {
data = RREG32_SOC15(THM, 0, mmTHM_BACO_CNTL);
data |= 0x80000000;
@@ -1735,11 +1718,6 @@ int smu_v11_0_baco_set_state(struct smu_context *smu, enum smu_baco_state state)
if (ret)
goto out;
- bif_doorbell_intr_cntl = REG_SET_FIELD(bif_doorbell_intr_cntl,
- BIF_DOORBELL_INT_CNTL,
- DOORBELL_INTERRUPT_DISABLE, 0);
- WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);
-
/* clear vbios scratch 6 and 7 for coming asic reinit */
WREG32(adev->bios_scratch_reg_offset + 6, 0);
WREG32(adev->bios_scratch_reg_offset + 7, 0);
diff --git a/drivers/gpu/drm/amd/powerplay/vega20_ppt.c b/drivers/gpu/drm/amd/powerplay/vega20_ppt.c
index 49ff3756bd9f..3f1044326dcb 100644
--- a/drivers/gpu/drm/amd/powerplay/vega20_ppt.c
+++ b/drivers/gpu/drm/amd/powerplay/vega20_ppt.c
@@ -35,6 +35,7 @@
#include "vega20_ppt.h"
#include "vega20_pptable.h"
#include "vega20_ppsmc.h"
+#include "nbio/nbio_7_4_offset.h"
#include "nbio/nbio_7_4_sh_mask.h"
#include "asic_reg/thm/thm_11_0_2_offset.h"
#include "asic_reg/thm/thm_11_0_2_sh_mask.h"
@@ -3174,6 +3175,17 @@ static int vega20_update_pcie_parameters(struct smu_context *smu,
return ret;
}
+static bool vega20_is_baco_supported(struct smu_context *smu)
+{
+ struct amdgpu_device *adev = smu->adev;
+ uint32_t val;
+
+ if (!smu_v11_0_baco_is_support(smu))
+ return false;
+
+ val = RREG32_SOC15(NBIO, 0, mmRCC_BIF_STRAP0);
+ return (val & RCC_BIF_STRAP0__STRAP_PX_CAPABLE_MASK) ? true : false;
+}
static const struct pptable_funcs vega20_ppt_funcs = {
.tables_init = vega20_tables_init,
@@ -3262,7 +3274,7 @@ static const struct pptable_funcs vega20_ppt_funcs = {
.register_irq_handler = smu_v11_0_register_irq_handler,
.set_azalia_d3_pme = smu_v11_0_set_azalia_d3_pme,
.get_max_sustainable_clocks_by_dc = smu_v11_0_get_max_sustainable_clocks_by_dc,
- .baco_is_support= smu_v11_0_baco_is_support,
+ .baco_is_support= vega20_is_baco_supported,
.baco_get_state = smu_v11_0_baco_get_state,
.baco_set_state = smu_v11_0_baco_set_state,
.baco_enter = smu_v11_0_baco_enter,
diff --git a/drivers/gpu/drm/bridge/analogix/analogix_dp_core.c b/drivers/gpu/drm/bridge/analogix/analogix_dp_core.c
index 9ded2cef57dd..76736fb8ed94 100644
--- a/drivers/gpu/drm/bridge/analogix/analogix_dp_core.c
+++ b/drivers/gpu/drm/bridge/analogix/analogix_dp_core.c
@@ -1652,8 +1652,7 @@ static ssize_t analogix_dpaux_transfer(struct drm_dp_aux *aux,
}
struct analogix_dp_device *
-analogix_dp_bind(struct device *dev, struct drm_device *drm_dev,
- struct analogix_dp_plat_data *plat_data)
+analogix_dp_probe(struct device *dev, struct analogix_dp_plat_data *plat_data)
{
struct platform_device *pdev = to_platform_device(dev);
struct analogix_dp_device *dp;
@@ -1756,22 +1755,30 @@ analogix_dp_bind(struct device *dev, struct drm_device *drm_dev,
irq_flags, "analogix-dp", dp);
if (ret) {
dev_err(&pdev->dev, "failed to request irq\n");
- goto err_disable_pm_runtime;
+ return ERR_PTR(ret);
}
disable_irq(dp->irq);
+ return dp;
+}
+EXPORT_SYMBOL_GPL(analogix_dp_probe);
+
+int analogix_dp_bind(struct analogix_dp_device *dp, struct drm_device *drm_dev)
+{
+ int ret;
+
dp->drm_dev = drm_dev;
dp->encoder = dp->plat_data->encoder;
dp->aux.name = "DP-AUX";
dp->aux.transfer = analogix_dpaux_transfer;
- dp->aux.dev = &pdev->dev;
+ dp->aux.dev = dp->dev;
ret = drm_dp_aux_register(&dp->aux);
if (ret)
- return ERR_PTR(ret);
+ return ret;
- pm_runtime_enable(dev);
+ pm_runtime_enable(dp->dev);
ret = analogix_dp_create_bridge(drm_dev, dp);
if (ret) {
@@ -1779,13 +1786,12 @@ analogix_dp_bind(struct device *dev, struct drm_device *drm_dev,
goto err_disable_pm_runtime;
}
- return dp;
+ return 0;
err_disable_pm_runtime:
+ pm_runtime_disable(dp->dev);
- pm_runtime_disable(dev);
-
- return ERR_PTR(ret);
+ return ret;
}
EXPORT_SYMBOL_GPL(analogix_dp_bind);
@@ -1802,10 +1808,15 @@ void analogix_dp_unbind(struct analogix_dp_device *dp)
drm_dp_aux_unregister(&dp->aux);
pm_runtime_disable(dp->dev);
- clk_disable_unprepare(dp->clock);
}
EXPORT_SYMBOL_GPL(analogix_dp_unbind);
+void analogix_dp_remove(struct analogix_dp_device *dp)
+{
+ clk_disable_unprepare(dp->clock);
+}
+EXPORT_SYMBOL_GPL(analogix_dp_remove);
+
#ifdef CONFIG_PM
int analogix_dp_suspend(struct analogix_dp_device *dp)
{
diff --git a/drivers/gpu/drm/drm_mm.c b/drivers/gpu/drm/drm_mm.c
index bc6e208949e8..8981abe8b7c9 100644
--- a/drivers/gpu/drm/drm_mm.c
+++ b/drivers/gpu/drm/drm_mm.c
@@ -45,7 +45,6 @@
#include <linux/export.h>
#include <linux/interval_tree_generic.h>
#include <linux/seq_file.h>
-#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/stacktrace.h>
@@ -367,11 +366,6 @@ next_hole(struct drm_mm *mm,
struct drm_mm_node *node,
enum drm_mm_insert_mode mode)
{
- /* Searching is slow; check if we ran out of time/patience */
- cond_resched();
- if (fatal_signal_pending(current))
- return NULL;
-
switch (mode) {
default:
case DRM_MM_INSERT_BEST:
@@ -563,7 +557,7 @@ int drm_mm_insert_node_in_range(struct drm_mm * const mm,
return 0;
}
- return signal_pending(current) ? -ERESTARTSYS : -ENOSPC;
+ return -ENOSPC;
}
EXPORT_SYMBOL(drm_mm_insert_node_in_range);
diff --git a/drivers/gpu/drm/drm_prime.c b/drivers/gpu/drm/drm_prime.c
index 1de2cde2277c..282774e469ac 100644
--- a/drivers/gpu/drm/drm_prime.c
+++ b/drivers/gpu/drm/drm_prime.c
@@ -962,27 +962,40 @@ int drm_prime_sg_to_page_addr_arrays(struct sg_table *sgt, struct page **pages,
unsigned count;
struct scatterlist *sg;
struct page *page;
- u32 len, index;
+ u32 page_len, page_index;
dma_addr_t addr;
+ u32 dma_len, dma_index;
- index = 0;
+ /*
+ * Scatterlist elements contains both pages and DMA addresses, but
+ * one shoud not assume 1:1 relation between them. The sg->length is
+ * the size of the physical memory chunk described by the sg->page,
+ * while sg_dma_len(sg) is the size of the DMA (IO virtual) chunk
+ * described by the sg_dma_address(sg).
+ */
+ page_index = 0;
+ dma_index = 0;
for_each_sg(sgt->sgl, sg, sgt->nents, count) {
- len = sg_dma_len(sg);
+ page_len = sg->length;
page = sg_page(sg);
+ dma_len = sg_dma_len(sg);
addr = sg_dma_address(sg);
- while (len > 0) {
- if (WARN_ON(index >= max_entries))
+ while (pages && page_len > 0) {
+ if (WARN_ON(page_index >= max_entries))
return -1;
- if (pages)
- pages[index] = page;
- if (addrs)
- addrs[index] = addr;
-
+ pages[page_index] = page;
page++;
+ page_len -= PAGE_SIZE;
+ page_index++;
+ }
+ while (addrs && dma_len > 0) {
+ if (WARN_ON(dma_index >= max_entries))
+ return -1;
+ addrs[dma_index] = addr;
addr += PAGE_SIZE;
- len -= PAGE_SIZE;
- index++;
+ dma_len -= PAGE_SIZE;
+ dma_index++;
}
}
return 0;
diff --git a/drivers/gpu/drm/exynos/exynos_dp.c b/drivers/gpu/drm/exynos/exynos_dp.c
index d23d3502ca91..5ee090691390 100644
--- a/drivers/gpu/drm/exynos/exynos_dp.c
+++ b/drivers/gpu/drm/exynos/exynos_dp.c
@@ -159,15 +159,8 @@ static int exynos_dp_bind(struct device *dev, struct device *master, void *data)
struct drm_device *drm_dev = data;
int ret;
- dp->dev = dev;
dp->drm_dev = drm_dev;
- dp->plat_data.dev_type = EXYNOS_DP;
- dp->plat_data.power_on_start = exynos_dp_poweron;
- dp->plat_data.power_off = exynos_dp_poweroff;
- dp->plat_data.attach = exynos_dp_bridge_attach;
- dp->plat_data.get_modes = exynos_dp_get_modes;
-
if (!dp->plat_data.panel && !dp->ptn_bridge) {
ret = exynos_dp_dt_parse_panel(dp);
if (ret)
@@ -185,13 +178,11 @@ static int exynos_dp_bind(struct device *dev, struct device *master, void *data)
dp->plat_data.encoder = encoder;
- dp->adp = analogix_dp_bind(dev, dp->drm_dev, &dp->plat_data);
- if (IS_ERR(dp->adp)) {
+ ret = analogix_dp_bind(dp->adp, dp->drm_dev);
+ if (ret)
dp->encoder.funcs->destroy(&dp->encoder);
- return PTR_ERR(dp->adp);
- }
- return 0;
+ return ret;
}
static void exynos_dp_unbind(struct device *dev, struct device *master,
@@ -222,6 +213,7 @@ static int exynos_dp_probe(struct platform_device *pdev)
if (!dp)
return -ENOMEM;
+ dp->dev = dev;
/*
* We just use the drvdata until driver run into component
* add function, and then we would set drvdata to null, so
@@ -247,16 +239,29 @@ static int exynos_dp_probe(struct platform_device *pdev)
/* The remote port can be either a panel or a bridge */
dp->plat_data.panel = panel;
+ dp->plat_data.dev_type = EXYNOS_DP;
+ dp->plat_data.power_on_start = exynos_dp_poweron;
+ dp->plat_data.power_off = exynos_dp_poweroff;
+ dp->plat_data.attach = exynos_dp_bridge_attach;
+ dp->plat_data.get_modes = exynos_dp_get_modes;
dp->plat_data.skip_connector = !!bridge;
+
dp->ptn_bridge = bridge;
out:
+ dp->adp = analogix_dp_probe(dev, &dp->plat_data);
+ if (IS_ERR(dp->adp))
+ return PTR_ERR(dp->adp);
+
return component_add(&pdev->dev, &exynos_dp_ops);
}
static int exynos_dp_remove(struct platform_device *pdev)
{
+ struct exynos_dp_device *dp = platform_get_drvdata(pdev);
+
component_del(&pdev->dev, &exynos_dp_ops);
+ analogix_dp_remove(dp->adp);
return 0;
}
diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 9f887a86e555..6cd1f6253814 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -28,9 +28,6 @@ subdir-ccflags-$(CONFIG_DRM_I915_WERROR) += -Werror
CFLAGS_i915_pci.o = $(call cc-disable-warning, override-init)
CFLAGS_display/intel_fbdev.o = $(call cc-disable-warning, override-init)
-subdir-ccflags-y += \
- $(call as-instr,movntdqa (%eax)$(comma)%xmm0,-DCONFIG_AS_MOVNTDQA)
-
subdir-ccflags-y += -I$(srctree)/$(src)
# Please keep these build lists sorted!
diff --git a/drivers/gpu/drm/i915/display/intel_ddi.c b/drivers/gpu/drm/i915/display/intel_ddi.c
index 73d0f4648c06..2c617c98db3a 100644
--- a/drivers/gpu/drm/i915/display/intel_ddi.c
+++ b/drivers/gpu/drm/i915/display/intel_ddi.c
@@ -947,7 +947,8 @@ static const struct cnl_ddi_buf_trans *
ehl_get_combo_buf_trans(struct drm_i915_private *dev_priv, int type, int rate,
int *n_entries)
{
- if (type == INTEL_OUTPUT_DP && rate > 270000) {
+ if (type != INTEL_OUTPUT_HDMI && type != INTEL_OUTPUT_EDP &&
+ rate > 270000) {
*n_entries = ARRAY_SIZE(ehl_combo_phy_ddi_translations_hbr2_hbr3);
return ehl_combo_phy_ddi_translations_hbr2_hbr3;
}
@@ -959,7 +960,7 @@ static const struct cnl_ddi_buf_trans *
tgl_get_combo_buf_trans(struct drm_i915_private *dev_priv, int type, int rate,
int *n_entries)
{
- if (type != INTEL_OUTPUT_DP) {
+ if (type == INTEL_OUTPUT_HDMI || type == INTEL_OUTPUT_EDP) {
return icl_get_combo_buf_trans(dev_priv, type, rate, n_entries);
} else if (rate > 270000) {
*n_entries = ARRAY_SIZE(tgl_combo_phy_ddi_translations_dp_hbr2);
@@ -1869,7 +1870,11 @@ static void intel_ddi_get_power_domains(struct intel_encoder *encoder,
return;
dig_port = enc_to_dig_port(encoder);
- intel_display_power_get(dev_priv, dig_port->ddi_io_power_domain);
+
+ if (!intel_phy_is_tc(dev_priv, phy) ||
+ dig_port->tc_mode != TC_PORT_TBT_ALT)
+ intel_display_power_get(dev_priv,
+ dig_port->ddi_io_power_domain);
/*
* AUX power is only needed for (e)DP mode, and for HDMI mode on TC
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 36d069504836..b7440f06c5e2 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -896,11 +896,13 @@ static inline struct i915_ggtt *cache_to_ggtt(struct reloc_cache *cache)
static void reloc_gpu_flush(struct reloc_cache *cache)
{
- GEM_BUG_ON(cache->rq_size >= cache->rq->batch->obj->base.size / sizeof(u32));
+ struct drm_i915_gem_object *obj = cache->rq->batch->obj;
+
+ GEM_BUG_ON(cache->rq_size >= obj->base.size / sizeof(u32));
cache->rq_cmd[cache->rq_size] = MI_BATCH_BUFFER_END;
- __i915_gem_object_flush_map(cache->rq->batch->obj, 0, cache->rq_size);
- i915_gem_object_unpin_map(cache->rq->batch->obj);
+ __i915_gem_object_flush_map(obj, 0, sizeof(u32) * (cache->rq_size + 1));
+ i915_gem_object_unpin_map(obj);
intel_gt_chipset_flush(cache->rq->engine->gt);
@@ -1477,10 +1479,8 @@ static int eb_relocate_vma(struct i915_execbuffer *eb, struct eb_vma *ev)
* can read from this userspace address.
*/
offset = gen8_canonical_addr(offset & ~UPDATE);
- if (unlikely(__put_user(offset, &urelocs[r-stack].presumed_offset))) {
- remain = -EFAULT;
- goto out;
- }
+ __put_user(offset,
+ &urelocs[r - stack].presumed_offset);
}
} while (r++, --count);
urelocs += ARRAY_SIZE(stack);
diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c b/drivers/gpu/drm/i915/gt/intel_ggtt.c
index aed498a0d032..4c5a209cb669 100644
--- a/drivers/gpu/drm/i915/gt/intel_ggtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c
@@ -191,10 +191,11 @@ static void gen8_ggtt_insert_entries(struct i915_address_space *vm,
enum i915_cache_level level,
u32 flags)
{
- struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm);
- struct sgt_iter sgt_iter;
- gen8_pte_t __iomem *gtt_entries;
const gen8_pte_t pte_encode = gen8_ggtt_pte_encode(0, level, 0);
+ struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm);
+ gen8_pte_t __iomem *gte;
+ gen8_pte_t __iomem *end;
+ struct sgt_iter iter;
dma_addr_t addr;
/*
@@ -202,10 +203,17 @@ static void gen8_ggtt_insert_entries(struct i915_address_space *vm,
* not to allow the user to override access to a read only page.
*/
- gtt_entries = (gen8_pte_t __iomem *)ggtt->gsm;
- gtt_entries += vma->node.start / I915_GTT_PAGE_SIZE;
- for_each_sgt_daddr(addr, sgt_iter, vma->pages)
- gen8_set_pte(gtt_entries++, pte_encode | addr);
+ gte = (gen8_pte_t __iomem *)ggtt->gsm;
+ gte += vma->node.start / I915_GTT_PAGE_SIZE;
+ end = gte + vma->node.size / I915_GTT_PAGE_SIZE;
+
+ for_each_sgt_daddr(addr, iter, vma->pages)
+ gen8_set_pte(gte++, pte_encode | addr);
+ GEM_BUG_ON(gte > end);
+
+ /* Fill the allocated but "unused" space beyond the end of the buffer */
+ while (gte < end)
+ gen8_set_pte(gte++, vm->scratch[0].encode);
/*
* We want to flush the TLBs only after we're certain all the PTE
@@ -241,13 +249,22 @@ static void gen6_ggtt_insert_entries(struct i915_address_space *vm,
u32 flags)
{
struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm);
- gen6_pte_t __iomem *entries = (gen6_pte_t __iomem *)ggtt->gsm;
- unsigned int i = vma->node.start / I915_GTT_PAGE_SIZE;
+ gen6_pte_t __iomem *gte;
+ gen6_pte_t __iomem *end;
struct sgt_iter iter;
dma_addr_t addr;
+ gte = (gen6_pte_t __iomem *)ggtt->gsm;
+ gte += vma->node.start / I915_GTT_PAGE_SIZE;
+ end = gte + vma->node.size / I915_GTT_PAGE_SIZE;
+
for_each_sgt_daddr(addr, iter, vma->pages)
- iowrite32(vm->pte_encode(addr, level, flags), &entries[i++]);
+ iowrite32(vm->pte_encode(addr, level, flags), gte++);
+ GEM_BUG_ON(gte > end);
+
+ /* Fill the allocated but "unused" space beyond the end of the buffer */
+ while (gte < end)
+ iowrite32(vm->scratch[0].encode, gte++);
/*
* We want to flush the TLBs only after we're certain all the PTE
diff --git a/drivers/gpu/drm/i915/gvt/cmd_parser.c b/drivers/gpu/drm/i915/gvt/cmd_parser.c
index 9e065ad0658f..a3cc080a46c6 100644
--- a/drivers/gpu/drm/i915/gvt/cmd_parser.c
+++ b/drivers/gpu/drm/i915/gvt/cmd_parser.c
@@ -164,6 +164,7 @@ struct decode_info {
#define OP_STATE_BASE_ADDRESS OP_3D_MEDIA(0x0, 0x1, 0x01)
#define OP_STATE_SIP OP_3D_MEDIA(0x0, 0x1, 0x02)
#define OP_3D_MEDIA_0_1_4 OP_3D_MEDIA(0x0, 0x1, 0x04)
+#define OP_SWTESS_BASE_ADDRESS OP_3D_MEDIA(0x0, 0x1, 0x03)
#define OP_3DSTATE_VF_STATISTICS_GM45 OP_3D_MEDIA(0x1, 0x0, 0x0B)
@@ -967,18 +968,6 @@ static int cmd_handler_lri(struct parser_exec_state *s)
{
int i, ret = 0;
int cmd_len = cmd_length(s);
- u32 valid_len = CMD_LEN(1);
-
- /*
- * Official intel docs are somewhat sloppy , check the definition of
- * MI_LOAD_REGISTER_IMM.
- */
- #define MAX_VALID_LEN 127
- if ((cmd_len < valid_len) || (cmd_len > MAX_VALID_LEN)) {
- gvt_err("len is not valid: len=%u valid_len=%u\n",
- cmd_len, valid_len);
- return -EFAULT;
- }
for (i = 1; i < cmd_len; i += 2) {
if (IS_BROADWELL(s->engine->i915) && s->engine->id != RCS0) {
@@ -2485,6 +2474,9 @@ static const struct cmd_info cmd_info[] = {
{"OP_3D_MEDIA_0_1_4", OP_3D_MEDIA_0_1_4, F_LEN_VAR, R_RCS, D_ALL,
ADDR_FIX_1(1), 8, NULL},
+ {"OP_SWTESS_BASE_ADDRESS", OP_SWTESS_BASE_ADDRESS,
+ F_LEN_VAR, R_RCS, D_ALL, ADDR_FIX_2(1, 2), 3, NULL},
+
{"3DSTATE_VS", OP_3DSTATE_VS, F_LEN_VAR, R_RCS, D_ALL, 0, 8, NULL},
{"3DSTATE_SF", OP_3DSTATE_SF, F_LEN_VAR, R_RCS, D_ALL, 0, 8, NULL},
diff --git a/drivers/gpu/drm/i915/gvt/display.c b/drivers/gpu/drm/i915/gvt/display.c
index 6e5c9885d9fe..a83df2f84eb9 100644
--- a/drivers/gpu/drm/i915/gvt/display.c
+++ b/drivers/gpu/drm/i915/gvt/display.c
@@ -221,7 +221,7 @@ static void emulate_monitor_status_change(struct intel_vgpu *vgpu)
~(TRANS_DDI_BPC_MASK | TRANS_DDI_MODE_SELECT_MASK |
TRANS_DDI_PORT_MASK);
vgpu_vreg_t(vgpu, TRANS_DDI_FUNC_CTL(TRANSCODER_A)) |=
- (TRANS_DDI_BPC_8 | TRANS_DDI_MODE_SELECT_DVI |
+ (TRANS_DDI_BPC_8 | TRANS_DDI_MODE_SELECT_DP_SST |
(PORT_B << TRANS_DDI_PORT_SHIFT) |
TRANS_DDI_FUNC_ENABLE);
if (IS_BROADWELL(dev_priv)) {
@@ -241,7 +241,7 @@ static void emulate_monitor_status_change(struct intel_vgpu *vgpu)
~(TRANS_DDI_BPC_MASK | TRANS_DDI_MODE_SELECT_MASK |
TRANS_DDI_PORT_MASK);
vgpu_vreg_t(vgpu, TRANS_DDI_FUNC_CTL(TRANSCODER_A)) |=
- (TRANS_DDI_BPC_8 | TRANS_DDI_MODE_SELECT_DVI |
+ (TRANS_DDI_BPC_8 | TRANS_DDI_MODE_SELECT_DP_SST |
(PORT_C << TRANS_DDI_PORT_SHIFT) |
TRANS_DDI_FUNC_ENABLE);
if (IS_BROADWELL(dev_priv)) {
@@ -261,7 +261,7 @@ static void emulate_monitor_status_change(struct intel_vgpu *vgpu)
~(TRANS_DDI_BPC_MASK | TRANS_DDI_MODE_SELECT_MASK |
TRANS_DDI_PORT_MASK);
vgpu_vreg_t(vgpu, TRANS_DDI_FUNC_CTL(TRANSCODER_A)) |=
- (TRANS_DDI_BPC_8 | TRANS_DDI_MODE_SELECT_DVI |
+ (TRANS_DDI_BPC_8 | TRANS_DDI_MODE_SELECT_DP_SST |
(PORT_D << TRANS_DDI_PORT_SHIFT) |
TRANS_DDI_FUNC_ENABLE);
if (IS_BROADWELL(dev_priv)) {
diff --git a/drivers/gpu/drm/i915/gvt/handlers.c b/drivers/gpu/drm/i915/gvt/handlers.c
index 0182e2a5acff..2faf50e1b051 100644
--- a/drivers/gpu/drm/i915/gvt/handlers.c
+++ b/drivers/gpu/drm/i915/gvt/handlers.c
@@ -462,11 +462,14 @@ static int pipeconf_mmio_write(struct intel_vgpu *vgpu, unsigned int offset,
return 0;
}
-/* ascendingly sorted */
+/* sorted in ascending order */
static i915_reg_t force_nonpriv_white_list[] = {
+ _MMIO(0xd80),
GEN9_CS_DEBUG_MODE1, //_MMIO(0x20ec)
GEN9_CTX_PREEMPT_REG,//_MMIO(0x2248)
- PS_INVOCATION_COUNT,//_MMIO(0x2348)
+ CL_PRIMITIVES_COUNT, //_MMIO(0x2340)
+ PS_INVOCATION_COUNT, //_MMIO(0x2348)
+ PS_DEPTH_COUNT, //_MMIO(0x2350)
GEN8_CS_CHICKEN1,//_MMIO(0x2580)
_MMIO(0x2690),
_MMIO(0x2694),
@@ -491,6 +494,7 @@ static i915_reg_t force_nonpriv_white_list[] = {
_MMIO(0xe18c),
_MMIO(0xe48c),
_MMIO(0xe5f4),
+ _MMIO(0x64844),
};
/* a simple bsearch */
diff --git a/drivers/gpu/drm/i915/gvt/scheduler.c b/drivers/gpu/drm/i915/gvt/scheduler.c
index 1c95bf8cbed0..cb11c3184085 100644
--- a/drivers/gpu/drm/i915/gvt/scheduler.c
+++ b/drivers/gpu/drm/i915/gvt/scheduler.c
@@ -296,8 +296,8 @@ shadow_context_descriptor_update(struct intel_context *ce,
* Update bits 0-11 of the context descriptor which includes flags
* like GEN8_CTX_* cached in desc_template
*/
- desc &= ~(0x3 << GEN8_CTX_ADDRESSING_MODE_SHIFT);
- desc |= workload->ctx_desc.addressing_mode <<
+ desc &= ~(0x3ull << GEN8_CTX_ADDRESSING_MODE_SHIFT);
+ desc |= (u64)workload->ctx_desc.addressing_mode <<
GEN8_CTX_ADDRESSING_MODE_SHIFT;
ce->lrc_desc = desc;
diff --git a/drivers/gpu/drm/i915/i915_memcpy.c b/drivers/gpu/drm/i915/i915_memcpy.c
index fdd550405fd3..7b3b83bd5ab8 100644
--- a/drivers/gpu/drm/i915/i915_memcpy.c
+++ b/drivers/gpu/drm/i915/i915_memcpy.c
@@ -35,7 +35,6 @@
static DEFINE_STATIC_KEY_FALSE(has_movntdqa);
-#ifdef CONFIG_AS_MOVNTDQA
static void __memcpy_ntdqa(void *dst, const void *src, unsigned long len)
{
kernel_fpu_begin();
@@ -93,10 +92,6 @@ static void __memcpy_ntdqu(void *dst, const void *src, unsigned long len)
kernel_fpu_end();
}
-#else
-static void __memcpy_ntdqa(void *dst, const void *src, unsigned long len) {}
-static void __memcpy_ntdqu(void *dst, const void *src, unsigned long len) {}
-#endif
/**
* i915_memcpy_from_wc: perform an accelerated *aligned* read from WC
diff --git a/drivers/gpu/drm/msm/disp/mdp5/mdp5_kms.c b/drivers/gpu/drm/msm/disp/mdp5/mdp5_kms.c
index 6650f478b226..47b989834af1 100644
--- a/drivers/gpu/drm/msm/disp/mdp5/mdp5_kms.c
+++ b/drivers/gpu/drm/msm/disp/mdp5/mdp5_kms.c
@@ -633,7 +633,7 @@ struct msm_kms *mdp5_kms_init(struct drm_device *dev)
if (config->platform.iommu) {
iommu_dev = &pdev->dev;
- if (!iommu_dev->iommu_fwspec)
+ if (!dev_iommu_fwspec_get(iommu_dev))
iommu_dev = iommu_dev->parent;
aspace = msm_gem_address_space_create(iommu_dev,
diff --git a/drivers/gpu/drm/nouveau/dispnv04/dac.c b/drivers/gpu/drm/nouveau/dispnv04/dac.c
index e8eef88a8382..ffdd447d8706 100644
--- a/drivers/gpu/drm/nouveau/dispnv04/dac.c
+++ b/drivers/gpu/drm/nouveau/dispnv04/dac.c
@@ -35,7 +35,8 @@
#include <subdev/bios/gpio.h>
#include <subdev/gpio.h>
-#include <subdev/timer.h>
+
+#include <nvif/timer.h>
int nv04_dac_output_offset(struct drm_encoder *encoder)
{
diff --git a/drivers/gpu/drm/nouveau/dispnv04/hw.c b/drivers/gpu/drm/nouveau/dispnv04/hw.c
index 3fdfafa8b0ad..b674d68ef28a 100644
--- a/drivers/gpu/drm/nouveau/dispnv04/hw.c
+++ b/drivers/gpu/drm/nouveau/dispnv04/hw.c
@@ -26,6 +26,7 @@
#include "hw.h"
#include <subdev/bios/pll.h>
+#include <nvif/timer.h>
#define CHIPSET_NFORCE 0x01a0
#define CHIPSET_NFORCE2 0x01f0
diff --git a/drivers/gpu/drm/nouveau/dispnv50/base507c.c b/drivers/gpu/drm/nouveau/dispnv50/base507c.c
index 00a85f1e1a4a..ee782151d332 100644
--- a/drivers/gpu/drm/nouveau/dispnv50/base507c.c
+++ b/drivers/gpu/drm/nouveau/dispnv50/base507c.c
@@ -23,6 +23,7 @@
#include <nvif/cl507c.h>
#include <nvif/event.h>
+#include <nvif/timer.h>
#include <drm/drm_atomic_helper.h>
#include <drm/drm_fourcc.h>
diff --git a/drivers/gpu/drm/nouveau/dispnv50/core507d.c b/drivers/gpu/drm/nouveau/dispnv50/core507d.c
index e7fcfa6e6467..c5152c39c684 100644
--- a/drivers/gpu/drm/nouveau/dispnv50/core507d.c
+++ b/drivers/gpu/drm/nouveau/dispnv50/core507d.c
@@ -23,6 +23,7 @@
#include "head.h"
#include <nvif/cl507d.h>
+#include <nvif/timer.h>
#include "nouveau_bo.h"
diff --git a/drivers/gpu/drm/nouveau/dispnv50/corec37d.c b/drivers/gpu/drm/nouveau/dispnv50/corec37d.c
index 3b36dc8d36b2..c03cb987856b 100644
--- a/drivers/gpu/drm/nouveau/dispnv50/corec37d.c
+++ b/drivers/gpu/drm/nouveau/dispnv50/corec37d.c
@@ -24,6 +24,8 @@
#include <nouveau_bo.h>
+#include <nvif/timer.h>
+
void
corec37d_wndw_owner(struct nv50_core *core)
{
diff --git a/drivers/gpu/drm/nouveau/dispnv50/curs507a.c b/drivers/gpu/drm/nouveau/dispnv50/curs507a.c
index 397143b639c6..8c5cf096f69b 100644
--- a/drivers/gpu/drm/nouveau/dispnv50/curs507a.c
+++ b/drivers/gpu/drm/nouveau/dispnv50/curs507a.c
@@ -24,21 +24,36 @@
#include "head.h"
#include <nvif/cl507a.h>
+#include <nvif/timer.h>
#include <drm/drm_atomic_helper.h>
#include <drm/drm_plane_helper.h>
+bool
+curs507a_space(struct nv50_wndw *wndw)
+{
+ nvif_msec(&nouveau_drm(wndw->plane.dev)->client.device, 2,
+ if (nvif_rd32(&wndw->wimm.base.user, 0x0008) >= 4)
+ return true;
+ );
+ WARN_ON(1);
+ return false;
+}
+
static void
curs507a_update(struct nv50_wndw *wndw, u32 *interlock)
{
- nvif_wr32(&wndw->wimm.base.user, 0x0080, 0x00000000);
+ if (curs507a_space(wndw))
+ nvif_wr32(&wndw->wimm.base.user, 0x0080, 0x00000000);
}
static void
curs507a_point(struct nv50_wndw *wndw, struct nv50_wndw_atom *asyw)
{
- nvif_wr32(&wndw->wimm.base.user, 0x0084, asyw->point.y << 16 |
- asyw->point.x);
+ if (curs507a_space(wndw)) {
+ nvif_wr32(&wndw->wimm.base.user, 0x0084, asyw->point.y << 16 |
+ asyw->point.x);
+ }
}
const struct nv50_wimm_func
diff --git a/drivers/gpu/drm/nouveau/dispnv50/cursc37a.c b/drivers/gpu/drm/nouveau/dispnv50/cursc37a.c
index 23fb29d41efe..96dff4f09f57 100644
--- a/drivers/gpu/drm/nouveau/dispnv50/cursc37a.c
+++ b/drivers/gpu/drm/nouveau/dispnv50/cursc37a.c
@@ -25,14 +25,17 @@
static void
cursc37a_update(struct nv50_wndw *wndw, u32 *interlock)
{
- nvif_wr32(&wndw->wimm.base.user, 0x0200, 0x00000001);
+ if (curs507a_space(wndw))
+ nvif_wr32(&wndw->wimm.base.user, 0x0200, 0x00000001);
}
static void
cursc37a_point(struct nv50_wndw *wndw, struct nv50_wndw_atom *asyw)
{
- nvif_wr32(&wndw->wimm.base.user, 0x0208, asyw->point.y << 16 |
- asyw->point.x);
+ if (curs507a_space(wndw)) {
+ nvif_wr32(&wndw->wimm.base.user, 0x0208, asyw->point.y << 16 |
+ asyw->point.x);
+ }
}
static const struct nv50_wimm_func
diff --git a/drivers/gpu/drm/nouveau/dispnv50/disp.c b/drivers/gpu/drm/nouveau/dispnv50/disp.c
index 4d1c58468dbc..6be9df1820c5 100644
--- a/drivers/gpu/drm/nouveau/dispnv50/disp.c
+++ b/drivers/gpu/drm/nouveau/dispnv50/disp.c
@@ -45,6 +45,7 @@
#include <nvif/cl5070.h>
#include <nvif/cl507d.h>
#include <nvif/event.h>
+#include <nvif/timer.h>
#include "nouveau_drv.h"
#include "nouveau_dma.h"
diff --git a/drivers/gpu/drm/nouveau/dispnv50/ovly827e.c b/drivers/gpu/drm/nouveau/dispnv50/ovly827e.c
index 2e68fc736fe1..4f7ce57f2036 100644
--- a/drivers/gpu/drm/nouveau/dispnv50/ovly827e.c
+++ b/drivers/gpu/drm/nouveau/dispnv50/ovly827e.c
@@ -24,6 +24,8 @@
#include <nouveau_bo.h>
+#include <nvif/timer.h>
+
static void
ovly827e_image_set(struct nv50_wndw *wndw, struct nv50_wndw_atom *asyw)
{
diff --git a/drivers/gpu/drm/nouveau/dispnv50/wndw.h b/drivers/gpu/drm/nouveau/dispnv50/wndw.h
index caf397475918..a7412b9d3a98 100644
--- a/drivers/gpu/drm/nouveau/dispnv50/wndw.h
+++ b/drivers/gpu/drm/nouveau/dispnv50/wndw.h
@@ -97,6 +97,7 @@ struct nv50_wimm_func {
};
extern const struct nv50_wimm_func curs507a;
+bool curs507a_space(struct nv50_wndw *);
int wndwc37e_new(struct nouveau_drm *, enum drm_plane_type, int, s32,
struct nv50_wndw **);
diff --git a/drivers/gpu/drm/nouveau/include/nvif/device.h b/drivers/gpu/drm/nouveau/include/nvif/device.h
index 25d969dcf67d..c2a572c67a76 100644
--- a/drivers/gpu/drm/nouveau/include/nvif/device.h
+++ b/drivers/gpu/drm/nouveau/include/nvif/device.h
@@ -23,27 +23,6 @@ int nvif_device_init(struct nvif_object *, u32 handle, s32 oclass, void *, u32,
void nvif_device_fini(struct nvif_device *);
u64 nvif_device_time(struct nvif_device *);
-/* Delay based on GPU time (ie. PTIMER).
- *
- * Will return -ETIMEDOUT unless the loop was terminated with 'break',
- * where it will return the number of nanoseconds taken instead.
- */
-#define nvif_nsec(d,n,cond...) ({ \
- struct nvif_device *_device = (d); \
- u64 _nsecs = (n), _time0 = nvif_device_time(_device); \
- s64 _taken = 0; \
- \
- do { \
- cond \
- } while (_taken = nvif_device_time(_device) - _time0, _taken < _nsecs);\
- \
- if (_taken >= _nsecs) \
- _taken = -ETIMEDOUT; \
- _taken; \
-})
-#define nvif_usec(d,u,cond...) nvif_nsec((d), (u) * 1000, ##cond)
-#define nvif_msec(d,m,cond...) nvif_usec((d), (m) * 1000, ##cond)
-
/*XXX*/
#include <subdev/bios.h>
#include <subdev/fb.h>
diff --git a/drivers/gpu/drm/nouveau/include/nvif/timer.h b/drivers/gpu/drm/nouveau/include/nvif/timer.h
new file mode 100644
index 000000000000..57587a985c4b
--- /dev/null
+++ b/drivers/gpu/drm/nouveau/include/nvif/timer.h
@@ -0,0 +1,35 @@
+#ifndef __NVIF_TIMER_H__
+#define __NVIF_TIMER_H__
+#include <nvif/os.h>
+
+struct nvif_timer_wait {
+ struct nvif_device *device;
+ u64 limit;
+ u64 time0;
+ u64 time1;
+ int reads;
+};
+
+void nvif_timer_wait_init(struct nvif_device *, u64 nsec,
+ struct nvif_timer_wait *);
+s64 nvif_timer_wait_test(struct nvif_timer_wait *);
+
+/* Delay based on GPU time (ie. PTIMER).
+ *
+ * Will return -ETIMEDOUT unless the loop was terminated with 'break',
+ * where it will return the number of nanoseconds taken instead.
+ */
+#define nvif_nsec(d,n,cond...) ({ \
+ struct nvif_timer_wait _wait; \
+ s64 _taken = 0; \
+ \
+ nvif_timer_wait_init((d), (n), &_wait); \
+ do { \
+ cond \
+ } while ((_taken = nvif_timer_wait_test(&_wait)) >= 0); \
+ \
+ _taken; \
+})
+#define nvif_usec(d,u,cond...) nvif_nsec((d), (u) * 1000, ##cond)
+#define nvif_msec(d,m,cond...) nvif_usec((d), (m) * 1000, ##cond)
+#endif
diff --git a/drivers/gpu/drm/nouveau/include/nvif/user.h b/drivers/gpu/drm/nouveau/include/nvif/user.h
index 03c11826b693..6825574d93c2 100644
--- a/drivers/gpu/drm/nouveau/include/nvif/user.h
+++ b/drivers/gpu/drm/nouveau/include/nvif/user.h
@@ -10,6 +10,7 @@ struct nvif_user {
struct nvif_user_func {
void (*doorbell)(struct nvif_user *, u32 token);
+ u64 (*time)(struct nvif_user *);
};
int nvif_user_init(struct nvif_device *);
diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
index 2b4b21b02e40..c40f127de3d0 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -1494,8 +1494,13 @@ nouveau_ttm_io_mem_reserve(struct ttm_bo_device *bdev, struct ttm_mem_reg *reg)
ret = nvif_object_map_handle(&mem->mem.object,
&args, argc,
&handle, &length);
- if (ret != 1)
- return ret ? ret : -EINVAL;
+ if (ret != 1) {
+ if (WARN_ON(ret == 0))
+ return -EINVAL;
+ if (ret == -ENOSPC)
+ return -EAGAIN;
+ return ret;
+ }
reg->bus.base = 0;
reg->bus.offset = handle;
diff --git a/drivers/gpu/drm/nouveau/nouveau_debugfs.c b/drivers/gpu/drm/nouveau/nouveau_debugfs.c
index 7dfbbbc1beea..15a3d40edf02 100644
--- a/drivers/gpu/drm/nouveau/nouveau_debugfs.c
+++ b/drivers/gpu/drm/nouveau/nouveau_debugfs.c
@@ -222,22 +222,18 @@ nouveau_drm_debugfs_init(struct drm_minor *minor)
{
struct nouveau_drm *drm = nouveau_drm(minor->dev);
struct dentry *dentry;
- int i, ret;
+ int i;
for (i = 0; i < ARRAY_SIZE(nouveau_debugfs_files); i++) {
- dentry = debugfs_create_file(nouveau_debugfs_files[i].name,
- S_IRUGO | S_IWUSR,
- minor->debugfs_root, minor->dev,
- nouveau_debugfs_files[i].fops);
- if (!dentry)
- return -ENOMEM;
+ debugfs_create_file(nouveau_debugfs_files[i].name,
+ S_IRUGO | S_IWUSR,
+ minor->debugfs_root, minor->dev,
+ nouveau_debugfs_files[i].fops);
}
- ret = drm_debugfs_create_files(nouveau_debugfs_list,
- NOUVEAU_DEBUGFS_ENTRIES,
- minor->debugfs_root, minor);
- if (ret)
- return ret;
+ drm_debugfs_create_files(nouveau_debugfs_list,
+ NOUVEAU_DEBUGFS_ENTRIES,
+ minor->debugfs_root, minor);
/* Set the size of the vbios since we know it, and it's confusing to
* userspace if it wants to seek() but the file has a length of 0
diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c
index 6b1629c14dd7..ca4087f5a15b 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_drm.c
@@ -618,6 +618,64 @@ nouveau_drm_device_fini(struct drm_device *dev)
kfree(drm);
}
+/*
+ * On some Intel PCIe bridge controllers doing a
+ * D0 -> D3hot -> D3cold -> D0 sequence causes Nvidia GPUs to not reappear.
+ * Skipping the intermediate D3hot step seems to make it work again. This is
+ * probably caused by not meeting the expectation the involved AML code has
+ * when the GPU is put into D3hot state before invoking it.
+ *
+ * This leads to various manifestations of this issue:
+ * - AML code execution to power on the GPU hits an infinite loop (as the
+ * code waits on device memory to change).
+ * - kernel crashes, as all PCI reads return -1, which most code isn't able
+ * to handle well enough.
+ *
+ * In all cases dmesg will contain at least one line like this:
+ * 'nouveau 0000:01:00.0: Refused to change power state, currently in D3'
+ * followed by a lot of nouveau timeouts.
+ *
+ * In the \_SB.PCI0.PEG0.PG00._OFF code deeper down writes bit 0x80 to the not
+ * documented PCI config space register 0x248 of the Intel PCIe bridge
+ * controller (0x1901) in order to change the state of the PCIe link between
+ * the PCIe port and the GPU. There are alternative code paths using other
+ * registers, which seem to work fine (executed pre Windows 8):
+ * - 0xbc bit 0x20 (publicly available documentation claims 'reserved')
+ * - 0xb0 bit 0x10 (link disable)
+ * Changing the conditions inside the firmware by poking into the relevant
+ * addresses does resolve the issue, but it seemed to be ACPI private memory
+ * and not any device accessible memory at all, so there is no portable way of
+ * changing the conditions.
+ * On a XPS 9560 that means bits [0,3] on \CPEX need to be cleared.
+ *
+ * The only systems where this behavior can be seen are hybrid graphics laptops
+ * with a secondary Nvidia Maxwell, Pascal or Turing GPU. It's unclear whether
+ * this issue only occurs in combination with listed Intel PCIe bridge
+ * controllers and the mentioned GPUs or other devices as well.
+ *
+ * documentation on the PCIe bridge controller can be found in the
+ * "7th Generation Intel® Processor Families for H Platforms Datasheet Volume 2"
+ * Section "12 PCI Express* Controller (x16) Registers"
+ */
+
+static void quirk_broken_nv_runpm(struct pci_dev *pdev)
+{
+ struct drm_device *dev = pci_get_drvdata(pdev);
+ struct nouveau_drm *drm = nouveau_drm(dev);
+ struct pci_dev *bridge = pci_upstream_bridge(pdev);
+
+ if (!bridge || bridge->vendor != PCI_VENDOR_ID_INTEL)
+ return;
+
+ switch (bridge->device) {
+ case 0x1901:
+ drm->old_pm_cap = pdev->pm_cap;
+ pdev->pm_cap = 0;
+ NV_INFO(drm, "Disabling PCI power management to avoid bug\n");
+ break;
+ }
+}
+
static int nouveau_drm_probe(struct pci_dev *pdev,
const struct pci_device_id *pent)
{
@@ -699,6 +757,7 @@ static int nouveau_drm_probe(struct pci_dev *pdev,
if (ret)
goto fail_drm_dev_init;
+ quirk_broken_nv_runpm(pdev);
return 0;
fail_drm_dev_init:
@@ -734,7 +793,11 @@ static void
nouveau_drm_remove(struct pci_dev *pdev)
{
struct drm_device *dev = pci_get_drvdata(pdev);
+ struct nouveau_drm *drm = nouveau_drm(dev);
+ /* revert our workaround */
+ if (drm->old_pm_cap)
+ pdev->pm_cap = drm->old_pm_cap;
nouveau_drm_device_remove(dev);
pci_disable_device(pdev);
}
diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h
index c2c332fbde97..2a6519737800 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.h
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
@@ -140,6 +140,8 @@ struct nouveau_drm {
struct list_head clients;
+ u8 old_pm_cap;
+
struct {
struct agp_bridge_data *bridge;
u32 base;
diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c
index e3797b2d4d17..645fedd77e21 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -171,6 +171,11 @@ nouveau_svmm_bind(struct drm_device *dev, void *data,
mm = get_task_mm(current);
down_read(&mm->mmap_sem);
+ if (!cli->svm.svmm) {
+ up_read(&mm->mmap_sem);
+ return -EINVAL;
+ }
+
for (addr = args->va_start, end = args->va_start + size; addr < end;) {
struct vm_area_struct *vma;
unsigned long next;
@@ -179,6 +184,7 @@ nouveau_svmm_bind(struct drm_device *dev, void *data,
if (!vma)
break;
+ addr = max(addr, vma->vm_start);
next = min(vma->vm_end, end);
/* This is a best effort so we ignore errors */
nouveau_dmem_migrate_vma(cli->drm, vma, addr, next);
@@ -656,9 +662,6 @@ nouveau_svm_fault(struct nvif_notify *notify)
limit = start + (ARRAY_SIZE(args.phys) << PAGE_SHIFT);
if (start < svmm->unmanaged.limit)
limit = min_t(u64, limit, svmm->unmanaged.start);
- else
- if (limit > svmm->unmanaged.start)
- start = max_t(u64, start, svmm->unmanaged.limit);
SVMM_DBG(svmm, "wndw %016llx-%016llx", start, limit);
mm = svmm->notifier.mm;
diff --git a/drivers/gpu/drm/nouveau/nvif/Kbuild b/drivers/gpu/drm/nouveau/nvif/Kbuild
index 50d583d63807..f194d354c1f5 100644
--- a/drivers/gpu/drm/nouveau/nvif/Kbuild
+++ b/drivers/gpu/drm/nouveau/nvif/Kbuild
@@ -8,6 +8,7 @@ nvif-y += nvif/fifo.o
nvif-y += nvif/mem.o
nvif-y += nvif/mmu.o
nvif-y += nvif/notify.o
+nvif-y += nvif/timer.o
nvif-y += nvif/vmm.o
# Usermode classes
diff --git a/drivers/gpu/drm/nouveau/nvif/device.c b/drivers/gpu/drm/nouveau/nvif/device.c
index 1ec101ba3b42..0e92db44bbc8 100644
--- a/drivers/gpu/drm/nouveau/nvif/device.c
+++ b/drivers/gpu/drm/nouveau/nvif/device.c
@@ -27,11 +27,15 @@
u64
nvif_device_time(struct nvif_device *device)
{
- struct nv_device_time_v0 args = {};
- int ret = nvif_object_mthd(&device->object, NV_DEVICE_V0_TIME,
- &args, sizeof(args));
- WARN_ON_ONCE(ret != 0);
- return args.time;
+ if (!device->user.func) {
+ struct nv_device_time_v0 args = {};
+ int ret = nvif_object_mthd(&device->object, NV_DEVICE_V0_TIME,
+ &args, sizeof(args));
+ WARN_ON_ONCE(ret != 0);
+ return args.time;
+ }
+
+ return device->user.func->time(&device->user);
}
void
diff --git a/drivers/gpu/drm/nouveau/nvif/timer.c b/drivers/gpu/drm/nouveau/nvif/timer.c
new file mode 100644
index 000000000000..602c1a258d10
--- /dev/null
+++ b/drivers/gpu/drm/nouveau/nvif/timer.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2020 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include <nvif/timer.h>
+#include <nvif/device.h>
+
+s64
+nvif_timer_wait_test(struct nvif_timer_wait *wait)
+{
+ u64 time = nvif_device_time(wait->device);
+
+ if (wait->reads == 0) {
+ wait->time0 = time;
+ wait->time1 = time;
+ }
+
+ if (wait->time1 == time) {
+ if (WARN_ON(wait->reads++ == 16))
+ return -ETIMEDOUT;
+ } else {
+ wait->time1 = time;
+ wait->reads = 1;
+ }
+
+ if (wait->time1 - wait->time0 > wait->limit)
+ return -ETIMEDOUT;
+
+ return wait->time1 - wait->time0;
+}
+
+void
+nvif_timer_wait_init(struct nvif_device *device, u64 nsec,
+ struct nvif_timer_wait *wait)
+{
+ wait->device = device;
+ wait->limit = nsec;
+ wait->reads = 0;
+}
diff --git a/drivers/gpu/drm/nouveau/nvif/userc361.c b/drivers/gpu/drm/nouveau/nvif/userc361.c
index 19f9958e7e01..1116f871b272 100644
--- a/drivers/gpu/drm/nouveau/nvif/userc361.c
+++ b/drivers/gpu/drm/nouveau/nvif/userc361.c
@@ -21,6 +21,19 @@
*/
#include <nvif/user.h>
+static u64
+nvif_userc361_time(struct nvif_user *user)
+{
+ u32 hi, lo;
+
+ do {
+ hi = nvif_rd32(&user->object, 0x084);
+ lo = nvif_rd32(&user->object, 0x080);
+ } while (hi != nvif_rd32(&user->object, 0x084));
+
+ return ((u64)hi << 32 | lo);
+}
+
static void
nvif_userc361_doorbell(struct nvif_user *user, u32 token)
{
@@ -30,4 +43,5 @@ nvif_userc361_doorbell(struct nvif_user *user, u32 token)
const struct nvif_user_func
nvif_userc361 = {
.doorbell = nvif_userc361_doorbell,
+ .time = nvif_userc361_time,
};
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c
index dd8f85b8b3a7..f2f5636efac4 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c
@@ -1981,8 +1981,34 @@ gf100_gr_init_(struct nvkm_gr *base)
{
struct gf100_gr *gr = gf100_gr(base);
struct nvkm_subdev *subdev = &base->engine.subdev;
+ struct nvkm_device *device = subdev->device;
+ bool reset = device->chipset == 0x137 || device->chipset == 0x138;
u32 ret;
+ /* On certain GP107/GP108 boards, we trigger a weird issue where
+ * GR will stop responding to PRI accesses after we've asked the
+ * SEC2 RTOS to boot the GR falcons. This happens with far more
+ * frequency when cold-booting a board (ie. returning from D3).
+ *
+ * The root cause for this is not known and has proven difficult
+ * to isolate, with many avenues being dead-ends.
+ *
+ * A workaround was discovered by Karol, whereby putting GR into
+ * reset for an extended period right before initialisation
+ * prevents the problem from occuring.
+ *
+ * XXX: As RM does not require any such workaround, this is more
+ * of a hack than a true fix.
+ */
+ reset = nvkm_boolopt(device->cfgopt, "NvGrResetWar", reset);
+ if (reset) {
+ nvkm_mask(device, 0x000200, 0x00001000, 0x00000000);
+ nvkm_rd32(device, 0x000200);
+ msleep(50);
+ nvkm_mask(device, 0x000200, 0x00001000, 0x00001000);
+ nvkm_rd32(device, 0x000200);
+ }
+
nvkm_pmu_pgob(gr->base.engine.subdev.device->pmu, false);
ret = nvkm_falcon_get(&gr->fecs.falcon, subdev);
diff --git a/drivers/gpu/drm/panel/panel-simple.c b/drivers/gpu/drm/panel/panel-simple.c
index 0ce81b1f36af..3ad828eaefe1 100644
--- a/drivers/gpu/drm/panel/panel-simple.c
+++ b/drivers/gpu/drm/panel/panel-simple.c
@@ -361,7 +361,6 @@ static int panel_dpi_probe(struct device *dev,
struct panel_desc *desc;
unsigned int bus_flags;
struct videomode vm;
- const char *mapping;
int ret;
np = dev->of_node;
@@ -386,16 +385,6 @@ static int panel_dpi_probe(struct device *dev,
of_property_read_u32(np, "width-mm", &desc->size.width);
of_property_read_u32(np, "height-mm", &desc->size.height);
- of_property_read_string(np, "data-mapping", &mapping);
- if (!strcmp(mapping, "rgb24"))
- desc->bus_format = MEDIA_BUS_FMT_RGB888_1X24;
- else if (!strcmp(mapping, "rgb565"))
- desc->bus_format = MEDIA_BUS_FMT_RGB565_1X16;
- else if (!strcmp(mapping, "bgr666"))
- desc->bus_format = MEDIA_BUS_FMT_RGB666_1X18;
- else if (!strcmp(mapping, "lvds666"))
- desc->bus_format = MEDIA_BUS_FMT_RGB666_1X24_CPADHI;
-
/* Extract bus_flags from display_timing */
bus_flags = 0;
vm.flags = timing->flags;
diff --git a/drivers/gpu/drm/rockchip/analogix_dp-rockchip.c b/drivers/gpu/drm/rockchip/analogix_dp-rockchip.c
index f38f5e113c6b..ce98c08aa8b4 100644
--- a/drivers/gpu/drm/rockchip/analogix_dp-rockchip.c
+++ b/drivers/gpu/drm/rockchip/analogix_dp-rockchip.c
@@ -325,15 +325,9 @@ static int rockchip_dp_bind(struct device *dev, struct device *master,
void *data)
{
struct rockchip_dp_device *dp = dev_get_drvdata(dev);
- const struct rockchip_dp_chip_data *dp_data;
struct drm_device *drm_dev = data;
int ret;
- dp_data = of_device_get_match_data(dev);
- if (!dp_data)
- return -ENODEV;
-
- dp->data = dp_data;
dp->drm_dev = drm_dev;
ret = rockchip_dp_drm_create_encoder(dp);
@@ -344,16 +338,9 @@ static int rockchip_dp_bind(struct device *dev, struct device *master,
dp->plat_data.encoder = &dp->encoder;
- dp->plat_data.dev_type = dp->data->chip_type;
- dp->plat_data.power_on_start = rockchip_dp_poweron_start;
- dp->plat_data.power_off = rockchip_dp_powerdown;
- dp->plat_data.get_modes = rockchip_dp_get_modes;
-
- dp->adp = analogix_dp_bind(dev, dp->drm_dev, &dp->plat_data);
- if (IS_ERR(dp->adp)) {
- ret = PTR_ERR(dp->adp);
+ ret = analogix_dp_bind(dp->adp, drm_dev);
+ if (ret)
goto err_cleanup_encoder;
- }
return 0;
err_cleanup_encoder:
@@ -368,8 +355,6 @@ static void rockchip_dp_unbind(struct device *dev, struct device *master,
analogix_dp_unbind(dp->adp);
dp->encoder.funcs->destroy(&dp->encoder);
-
- dp->adp = ERR_PTR(-ENODEV);
}
static const struct component_ops rockchip_dp_component_ops = {
@@ -380,10 +365,15 @@ static const struct component_ops rockchip_dp_component_ops = {
static int rockchip_dp_probe(struct platform_device *pdev)
{
struct device *dev = &pdev->dev;
+ const struct rockchip_dp_chip_data *dp_data;
struct drm_panel *panel = NULL;
struct rockchip_dp_device *dp;
int ret;
+ dp_data = of_device_get_match_data(dev);
+ if (!dp_data)
+ return -ENODEV;
+
ret = drm_of_find_panel_or_bridge(dev->of_node, 1, 0, &panel, NULL);
if (ret < 0)
return ret;
@@ -394,7 +384,12 @@ static int rockchip_dp_probe(struct platform_device *pdev)
dp->dev = dev;
dp->adp = ERR_PTR(-ENODEV);
+ dp->data = dp_data;
dp->plat_data.panel = panel;
+ dp->plat_data.dev_type = dp->data->chip_type;
+ dp->plat_data.power_on_start = rockchip_dp_poweron_start;
+ dp->plat_data.power_off = rockchip_dp_powerdown;
+ dp->plat_data.get_modes = rockchip_dp_get_modes;
ret = rockchip_dp_of_probe(dp);
if (ret < 0)
@@ -402,12 +397,19 @@ static int rockchip_dp_probe(struct platform_device *pdev)
platform_set_drvdata(pdev, dp);
+ dp->adp = analogix_dp_probe(dev, &dp->plat_data);
+ if (IS_ERR(dp->adp))
+ return PTR_ERR(dp->adp);
+
return component_add(dev, &rockchip_dp_component_ops);
}
static int rockchip_dp_remove(struct platform_device *pdev)
{
+ struct rockchip_dp_device *dp = platform_get_drvdata(pdev);
+
component_del(&pdev->dev, &rockchip_dp_component_ops);
+ analogix_dp_remove(dp->adp);
return 0;
}
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index 6ee3b96f0d13..0ad30b112982 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -442,66 +442,6 @@ vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
}
EXPORT_SYMBOL(ttm_bo_vm_fault);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-/**
- * ttm_pgprot_is_wrprotecting - Is a page protection value write-protecting?
- * @prot: The page protection value
- *
- * Return: true if @prot is write-protecting. false otherwise.
- */
-static bool ttm_pgprot_is_wrprotecting(pgprot_t prot)
-{
- /*
- * This is meant to say "pgprot_wrprotect(prot) == prot" in a generic
- * way. Unfortunately there is no generic pgprot_wrprotect.
- */
- return pte_val(pte_wrprotect(__pte(pgprot_val(prot)))) ==
- pgprot_val(prot);
-}
-
-static vm_fault_t ttm_bo_vm_huge_fault(struct vm_fault *vmf,
- enum page_entry_size pe_size)
-{
- struct vm_area_struct *vma = vmf->vma;
- pgprot_t prot;
- struct ttm_buffer_object *bo = vma->vm_private_data;
- vm_fault_t ret;
- pgoff_t fault_page_size = 0;
- bool write = vmf->flags & FAULT_FLAG_WRITE;
-
- switch (pe_size) {
- case PE_SIZE_PMD:
- fault_page_size = HPAGE_PMD_SIZE >> PAGE_SHIFT;
- break;
-#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
- case PE_SIZE_PUD:
- fault_page_size = HPAGE_PUD_SIZE >> PAGE_SHIFT;
- break;
-#endif
- default:
- WARN_ON_ONCE(1);
- return VM_FAULT_FALLBACK;
- }
-
- /* Fallback on write dirty-tracking or COW */
- if (write && ttm_pgprot_is_wrprotecting(vma->vm_page_prot))
- return VM_FAULT_FALLBACK;
-
- ret = ttm_bo_vm_reserve(bo, vmf);
- if (ret)
- return ret;
-
- prot = vm_get_page_prot(vma->vm_flags);
- ret = ttm_bo_vm_fault_reserved(vmf, prot, 1, fault_page_size);
- if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
- return ret;
-
- dma_resv_unlock(bo->base.resv);
-
- return ret;
-}
-#endif
-
void ttm_bo_vm_open(struct vm_area_struct *vma)
{
struct ttm_buffer_object *bo = vma->vm_private_data;
@@ -604,9 +544,6 @@ static const struct vm_operations_struct ttm_bo_vm_ops = {
.open = ttm_bo_vm_open,
.close = ttm_bo_vm_close,
.access = ttm_bo_vm_access,
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- .huge_fault = ttm_bo_vm_huge_fault,
-#endif
};
static struct ttm_buffer_object *ttm_bo_vm_lookup(struct ttm_bo_device *bdev,
diff --git a/drivers/gpu/drm/vboxvideo/vbox_drv.c b/drivers/gpu/drm/vboxvideo/vbox_drv.c
index 8512d970a09f..ac8f75db2ecd 100644
--- a/drivers/gpu/drm/vboxvideo/vbox_drv.c
+++ b/drivers/gpu/drm/vboxvideo/vbox_drv.c
@@ -41,6 +41,10 @@ static int vbox_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
if (!vbox_check_supported(VBE_DISPI_ID_HGSMI))
return -ENODEV;
+ ret = drm_fb_helper_remove_conflicting_pci_framebuffers(pdev, "vboxvideodrmfb");
+ if (ret)
+ return ret;
+
vbox = kzalloc(sizeof(*vbox), GFP_KERNEL);
if (!vbox)
return -ENOMEM;
diff --git a/drivers/gpu/drm/vc4/vc4_hdmi.c b/drivers/gpu/drm/vc4/vc4_hdmi.c
index cea18dc15f77..340719238753 100644
--- a/drivers/gpu/drm/vc4/vc4_hdmi.c
+++ b/drivers/gpu/drm/vc4/vc4_hdmi.c
@@ -681,11 +681,23 @@ static enum drm_mode_status
vc4_hdmi_encoder_mode_valid(struct drm_encoder *crtc,
const struct drm_display_mode *mode)
{
- /* HSM clock must be 108% of the pixel clock. Additionally,
- * the AXI clock needs to be at least 25% of pixel clock, but
- * HSM ends up being the limiting factor.
+ /*
+ * As stated in RPi's vc4 firmware "HDMI state machine (HSM) clock must
+ * be faster than pixel clock, infinitesimally faster, tested in
+ * simulation. Otherwise, exact value is unimportant for HDMI
+ * operation." This conflicts with bcm2835's vc4 documentation, which
+ * states HSM's clock has to be at least 108% of the pixel clock.
+ *
+ * Real life tests reveal that vc4's firmware statement holds up, and
+ * users are able to use pixel clocks closer to HSM's, namely for
+ * 1920x1200@60Hz. So it was decided to have leave a 1% margin between
+ * both clocks. Which, for RPi0-3 implies a maximum pixel clock of
+ * 162MHz.
+ *
+ * Additionally, the AXI clock needs to be at least 25% of
+ * pixel clock, but HSM ends up being the limiting factor.
*/
- if (mode->clock > HSM_CLOCK_FREQ / (1000 * 108 / 100))
+ if (mode->clock > HSM_CLOCK_FREQ / (1000 * 101 / 100))
return MODE_CLOCK_HIGH;
return MODE_OK;
diff --git a/drivers/gpu/drm/virtio/virtgpu_object.c b/drivers/gpu/drm/virtio/virtgpu_object.c
index 2bfb13d1932e..d9039bb7c5e3 100644
--- a/drivers/gpu/drm/virtio/virtgpu_object.c
+++ b/drivers/gpu/drm/virtio/virtgpu_object.c
@@ -123,15 +123,17 @@ bool virtio_gpu_is_shmem(struct virtio_gpu_object *bo)
struct drm_gem_object *virtio_gpu_create_object(struct drm_device *dev,
size_t size)
{
- struct virtio_gpu_object *bo;
+ struct virtio_gpu_object_shmem *shmem;
+ struct drm_gem_shmem_object *dshmem;
- bo = kzalloc(sizeof(*bo), GFP_KERNEL);
- if (!bo)
+ shmem = kzalloc(sizeof(*shmem), GFP_KERNEL);
+ if (!shmem)
return NULL;
- bo->base.base.funcs = &virtio_gpu_shmem_funcs;
- bo->base.map_cached = true;
- return &bo->base.base;
+ dshmem = &shmem->base.base;
+ dshmem->base.funcs = &virtio_gpu_shmem_funcs;
+ dshmem->map_cached = true;
+ return &dshmem->base;
}
static int virtio_gpu_object_shmem_init(struct virtio_gpu_device *vgdev,
diff --git a/drivers/gpu/drm/xen/xen_drm_front.c b/drivers/gpu/drm/xen/xen_drm_front.c
index 4be49c1aef51..374142018171 100644
--- a/drivers/gpu/drm/xen/xen_drm_front.c
+++ b/drivers/gpu/drm/xen/xen_drm_front.c
@@ -401,7 +401,7 @@ static int xen_drm_drv_dumb_create(struct drm_file *filp,
obj = xen_drm_front_gem_create(dev, args->size);
if (IS_ERR_OR_NULL(obj)) {
- ret = PTR_ERR(obj);
+ ret = PTR_ERR_OR_ZERO(obj);
goto fail;
}
diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index a02ce43d778d..32e3bc0aa665 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -533,7 +533,6 @@ struct hv_dynmem_device {
* State to synchronize hot-add.
*/
struct completion ol_waitevent;
- bool ha_waiting;
/*
* This thread handles hot-add
* requests from the host as well as notifying
@@ -634,10 +633,7 @@ static int hv_memory_notifier(struct notifier_block *nb, unsigned long val,
switch (val) {
case MEM_ONLINE:
case MEM_CANCEL_ONLINE:
- if (dm_device.ha_waiting) {
- dm_device.ha_waiting = false;
- complete(&dm_device.ol_waitevent);
- }
+ complete(&dm_device.ol_waitevent);
break;
case MEM_OFFLINE:
@@ -726,8 +722,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size,
has->covered_end_pfn += processed_pfn;
spin_unlock_irqrestore(&dm_device.ha_lock, flags);
- init_completion(&dm_device.ol_waitevent);
- dm_device.ha_waiting = !memhp_auto_online;
+ reinit_completion(&dm_device.ol_waitevent);
nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn));
ret = add_memory(nid, PFN_PHYS((start_pfn)),
@@ -753,15 +748,14 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size,
}
/*
- * Wait for the memory block to be onlined when memory onlining
- * is done outside of kernel (memhp_auto_online). Since the hot
- * add has succeeded, it is ok to proceed even if the pages in
- * the hot added region have not been "onlined" within the
- * allowed time.
+ * Wait for memory to get onlined. If the kernel onlined the
+ * memory when adding it, this will return directly. Otherwise,
+ * it will wait for user space to online the memory. This helps
+ * to avoid adding memory faster than it is getting onlined. As
+ * adding succeeded, it is ok to proceed even if the memory was
+ * not onlined in time.
*/
- if (dm_device.ha_waiting)
- wait_for_completion_timeout(&dm_device.ol_waitevent,
- 5*HZ);
+ wait_for_completion_timeout(&dm_device.ol_waitevent, 5 * HZ);
post_status(&dm_device);
}
}
@@ -1706,6 +1700,7 @@ static int balloon_probe(struct hv_device *dev,
#ifdef CONFIG_MEMORY_HOTPLUG
set_online_page_callback(&hv_online_page);
+ init_completion(&dm_device.ol_waitevent);
register_memory_notifier(&hv_memory_nb);
#endif
diff --git a/drivers/hwmon/dell-smm-hwmon.c b/drivers/hwmon/dell-smm-hwmon.c
index d4c83009d625..ab719d372b0d 100644
--- a/drivers/hwmon/dell-smm-hwmon.c
+++ b/drivers/hwmon/dell-smm-hwmon.c
@@ -7,7 +7,7 @@
* Hwmon integration:
* Copyright (C) 2011 Jean Delvare <jdelvare@suse.de>
* Copyright (C) 2013, 2014 Guenter Roeck <linux@roeck-us.net>
- * Copyright (C) 2014, 2015 Pali Rohár <pali.rohar@gmail.com>
+ * Copyright (C) 2014, 2015 Pali Rohár <pali@kernel.org>
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -86,7 +86,7 @@ static unsigned int auto_fan;
#define I8K_HWMON_HAVE_FAN3 (1 << 12)
MODULE_AUTHOR("Massimo Dal Zotto (dz@debian.org)");
-MODULE_AUTHOR("Pali Rohár <pali.rohar@gmail.com>");
+MODULE_AUTHOR("Pali Rohár <pali@kernel.org>");
MODULE_DESCRIPTION("Dell laptop SMM BIOS hwmon driver");
MODULE_LICENSE("GPL");
MODULE_ALIAS("i8k");
diff --git a/drivers/ide/ide-scan-pci.c b/drivers/ide/ide-scan-pci.c
index acf874800ca4..b0411a1827a3 100644
--- a/drivers/ide/ide-scan-pci.c
+++ b/drivers/ide/ide-scan-pci.c
@@ -89,8 +89,7 @@ static int __init ide_scan_pcidev(struct pci_dev *dev)
static int __init ide_scan_pcibus(void)
{
struct pci_dev *dev = NULL;
- struct pci_driver *d;
- struct list_head *l, *n;
+ struct pci_driver *d, *tmp;
pre_init = 0;
for_each_pci_dev(dev)
@@ -101,9 +100,8 @@ static int __init ide_scan_pcibus(void)
* are post init.
*/
- list_for_each_safe(l, n, &ide_pci_drivers) {
- list_del(l);
- d = list_entry(l, struct pci_driver, node);
+ list_for_each_entry_safe(d, tmp, &ide_pci_drivers, node) {
+ list_del(&d->node);
if (__pci_register_driver(d, d->driver.owner,
d->driver.mod_name))
printk(KERN_ERR "%s: failed to register %s driver\n",
diff --git a/drivers/iio/Kconfig b/drivers/iio/Kconfig
index 5bd51853b15e..d5c073a8aa3e 100644
--- a/drivers/iio/Kconfig
+++ b/drivers/iio/Kconfig
@@ -88,6 +88,7 @@ source "drivers/iio/orientation/Kconfig"
if IIO_TRIGGER
source "drivers/iio/trigger/Kconfig"
endif #IIO_TRIGGER
+source "drivers/iio/position/Kconfig"
source "drivers/iio/potentiometer/Kconfig"
source "drivers/iio/potentiostat/Kconfig"
source "drivers/iio/pressure/Kconfig"
diff --git a/drivers/iio/Makefile b/drivers/iio/Makefile
index bff682ad1cfb..1712011c0f4a 100644
--- a/drivers/iio/Makefile
+++ b/drivers/iio/Makefile
@@ -31,6 +31,7 @@ obj-y += light/
obj-y += magnetometer/
obj-y += multiplexer/
obj-y += orientation/
+obj-y += position/
obj-y += potentiometer/
obj-y += potentiostat/
obj-y += pressure/
diff --git a/drivers/iio/accel/cros_ec_accel_legacy.c b/drivers/iio/accel/cros_ec_accel_legacy.c
index 68e847c6255e..2532b9ad3384 100644
--- a/drivers/iio/accel/cros_ec_accel_legacy.c
+++ b/drivers/iio/accel/cros_ec_accel_legacy.c
@@ -170,7 +170,8 @@ static int cros_ec_accel_legacy_probe(struct platform_device *pdev)
if (!indio_dev)
return -ENOMEM;
- ret = cros_ec_sensors_core_init(pdev, indio_dev, true);
+ ret = cros_ec_sensors_core_init(pdev, indio_dev, true,
+ cros_ec_sensors_capture, NULL);
if (ret)
return ret;
@@ -190,11 +191,6 @@ static int cros_ec_accel_legacy_probe(struct platform_device *pdev)
state->sign[CROS_EC_SENSOR_Z] = -1;
}
- ret = devm_iio_triggered_buffer_setup(dev, indio_dev, NULL,
- cros_ec_sensors_capture, NULL);
- if (ret)
- return ret;
-
return devm_iio_device_register(dev, indio_dev);
}
diff --git a/drivers/iio/adc/Kconfig b/drivers/iio/adc/Kconfig
index f4da821c4022..12bb8b7ca1ff 100644
--- a/drivers/iio/adc/Kconfig
+++ b/drivers/iio/adc/Kconfig
@@ -795,6 +795,16 @@ config RCAR_GYRO_ADC
To compile this driver as a module, choose M here: the
module will be called rcar-gyroadc.
+config RN5T618_ADC
+ tristate "ADC for the RN5T618/RC5T619 family of chips"
+ depends on MFD_RN5T618
+ help
+ Say yes here to build support for the integrated ADC inside the
+ RN5T618/619 series PMICs:
+
+ This driver can also be built as a module. If so, the module
+ will be called rn5t618-adc.
+
config ROCKCHIP_SARADC
tristate "Rockchip SARADC driver"
depends on ARCH_ROCKCHIP || (ARM && COMPILE_TEST)
diff --git a/drivers/iio/adc/Makefile b/drivers/iio/adc/Makefile
index 8462455b4228..637807861112 100644
--- a/drivers/iio/adc/Makefile
+++ b/drivers/iio/adc/Makefile
@@ -75,6 +75,7 @@ obj-$(CONFIG_QCOM_VADC_COMMON) += qcom-vadc-common.o
obj-$(CONFIG_QCOM_SPMI_VADC) += qcom-spmi-vadc.o
obj-$(CONFIG_QCOM_PM8XXX_XOADC) += qcom-pm8xxx-xoadc.o
obj-$(CONFIG_RCAR_GYRO_ADC) += rcar-gyroadc.o
+obj-$(CONFIG_RN5T618_ADC) += rn5t618-adc.o
obj-$(CONFIG_ROCKCHIP_SARADC) += rockchip_saradc.o
obj-$(CONFIG_SC27XX_ADC) += sc27xx_adc.o
obj-$(CONFIG_SPEAR_ADC) += spear_adc.o
diff --git a/drivers/iio/adc/rn5t618-adc.c b/drivers/iio/adc/rn5t618-adc.c
new file mode 100644
index 000000000000..f21027e4e26a
--- /dev/null
+++ b/drivers/iio/adc/rn5t618-adc.c
@@ -0,0 +1,256 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * ADC driver for the RICOH RN5T618 power management chip family
+ *
+ * Copyright (C) 2019 Andreas Kemnade
+ */
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mfd/rn5t618.h>
+#include <linux/platform_device.h>
+#include <linux/completion.h>
+#include <linux/regmap.h>
+#include <linux/iio/iio.h>
+#include <linux/slab.h>
+
+#define RN5T618_ADC_CONVERSION_TIMEOUT (msecs_to_jiffies(500))
+#define RN5T618_REFERENCE_VOLT 2500
+
+/* mask for selecting channels for single conversion */
+#define RN5T618_ADCCNT3_CHANNEL_MASK 0x7
+/* average 4-time conversion mode */
+#define RN5T618_ADCCNT3_AVG BIT(3)
+/* set for starting a single conversion, gets cleared by hw when done */
+#define RN5T618_ADCCNT3_GODONE BIT(4)
+/* automatic conversion, period is in ADCCNT2, selected channels are
+ * in ADCCNT1
+ */
+#define RN5T618_ADCCNT3_AUTO BIT(5)
+#define RN5T618_ADCEND_IRQ BIT(0)
+
+struct rn5t618_adc_data {
+ struct device *dev;
+ struct rn5t618 *rn5t618;
+ struct completion conv_completion;
+ int irq;
+};
+
+struct rn5t618_channel_ratios {
+ u16 numerator;
+ u16 denominator;
+};
+
+enum rn5t618_channels {
+ LIMMON = 0,
+ VBAT,
+ VADP,
+ VUSB,
+ VSYS,
+ VTHM,
+ AIN1,
+ AIN0
+};
+
+static const struct rn5t618_channel_ratios rn5t618_ratios[8] = {
+ [LIMMON] = {50, 32}, /* measured across 20mOhm, amplified by 32 */
+ [VBAT] = {2, 1},
+ [VADP] = {3, 1},
+ [VUSB] = {3, 1},
+ [VSYS] = {3, 1},
+ [VTHM] = {1, 1},
+ [AIN1] = {1, 1},
+ [AIN0] = {1, 1},
+};
+
+static int rn5t618_read_adc_reg(struct rn5t618 *rn5t618, int reg, u16 *val)
+{
+ u8 data[2];
+ int ret;
+
+ ret = regmap_bulk_read(rn5t618->regmap, reg, data, sizeof(data));
+ if (ret < 0)
+ return ret;
+
+ *val = (data[0] << 4) | (data[1] & 0xF);
+
+ return 0;
+}
+
+static irqreturn_t rn5t618_adc_irq(int irq, void *data)
+{
+ struct rn5t618_adc_data *adc = data;
+ unsigned int r = 0;
+ int ret;
+
+ /* clear low & high threshold irqs */
+ regmap_write(adc->rn5t618->regmap, RN5T618_IR_ADC1, 0);
+ regmap_write(adc->rn5t618->regmap, RN5T618_IR_ADC2, 0);
+
+ ret = regmap_read(adc->rn5t618->regmap, RN5T618_IR_ADC3, &r);
+ if (ret < 0)
+ dev_err(adc->dev, "failed to read IRQ status: %d\n", ret);
+
+ regmap_write(adc->rn5t618->regmap, RN5T618_IR_ADC3, 0);
+
+ if (r & RN5T618_ADCEND_IRQ)
+ complete(&adc->conv_completion);
+
+ return IRQ_HANDLED;
+}
+
+static int rn5t618_adc_read(struct iio_dev *iio_dev,
+ const struct iio_chan_spec *chan,
+ int *val, int *val2, long mask)
+{
+ struct rn5t618_adc_data *adc = iio_priv(iio_dev);
+ u16 raw;
+ int ret;
+
+ if (mask == IIO_CHAN_INFO_SCALE) {
+ *val = RN5T618_REFERENCE_VOLT *
+ rn5t618_ratios[chan->channel].numerator;
+ *val2 = rn5t618_ratios[chan->channel].denominator * 4095;
+
+ return IIO_VAL_FRACTIONAL;
+ }
+
+ /* select channel */
+ ret = regmap_update_bits(adc->rn5t618->regmap, RN5T618_ADCCNT3,
+ RN5T618_ADCCNT3_CHANNEL_MASK,
+ chan->channel);
+ if (ret < 0)
+ return ret;
+
+ ret = regmap_write(adc->rn5t618->regmap, RN5T618_EN_ADCIR3,
+ RN5T618_ADCEND_IRQ);
+ if (ret < 0)
+ return ret;
+
+ ret = regmap_update_bits(adc->rn5t618->regmap, RN5T618_ADCCNT3,
+ RN5T618_ADCCNT3_AVG,
+ mask == IIO_CHAN_INFO_AVERAGE_RAW ?
+ RN5T618_ADCCNT3_AVG : 0);
+ if (ret < 0)
+ return ret;
+
+ init_completion(&adc->conv_completion);
+ /* single conversion */
+ ret = regmap_update_bits(adc->rn5t618->regmap, RN5T618_ADCCNT3,
+ RN5T618_ADCCNT3_GODONE,
+ RN5T618_ADCCNT3_GODONE);
+ if (ret < 0)
+ return ret;
+
+ ret = wait_for_completion_timeout(&adc->conv_completion,
+ RN5T618_ADC_CONVERSION_TIMEOUT);
+ if (ret == 0) {
+ dev_warn(adc->dev, "timeout waiting for adc result\n");
+ return -ETIMEDOUT;
+ }
+
+ ret = rn5t618_read_adc_reg(adc->rn5t618,
+ RN5T618_ILIMDATAH + 2 * chan->channel,
+ &raw);
+ if (ret < 0)
+ return ret;
+
+ *val = raw;
+
+ return IIO_VAL_INT;
+}
+
+static const struct iio_info rn5t618_adc_iio_info = {
+ .read_raw = &rn5t618_adc_read,
+};
+
+#define RN5T618_ADC_CHANNEL(_channel, _type, _name) { \
+ .type = _type, \
+ .channel = _channel, \
+ .info_mask_separate = BIT(IIO_CHAN_INFO_RAW) | \
+ BIT(IIO_CHAN_INFO_AVERAGE_RAW) | \
+ BIT(IIO_CHAN_INFO_SCALE), \
+ .datasheet_name = _name, \
+ .indexed = 1. \
+}
+
+static const struct iio_chan_spec rn5t618_adc_iio_channels[] = {
+ RN5T618_ADC_CHANNEL(LIMMON, IIO_CURRENT, "LIMMON"),
+ RN5T618_ADC_CHANNEL(VBAT, IIO_VOLTAGE, "VBAT"),
+ RN5T618_ADC_CHANNEL(VADP, IIO_VOLTAGE, "VADP"),
+ RN5T618_ADC_CHANNEL(VUSB, IIO_VOLTAGE, "VUSB"),
+ RN5T618_ADC_CHANNEL(VSYS, IIO_VOLTAGE, "VSYS"),
+ RN5T618_ADC_CHANNEL(VTHM, IIO_VOLTAGE, "VTHM"),
+ RN5T618_ADC_CHANNEL(AIN1, IIO_VOLTAGE, "AIN1"),
+ RN5T618_ADC_CHANNEL(AIN0, IIO_VOLTAGE, "AIN0")
+};
+
+static int rn5t618_adc_probe(struct platform_device *pdev)
+{
+ int ret;
+ struct iio_dev *iio_dev;
+ struct rn5t618_adc_data *adc;
+ struct rn5t618 *rn5t618 = dev_get_drvdata(pdev->dev.parent);
+
+ iio_dev = devm_iio_device_alloc(&pdev->dev, sizeof(*adc));
+ if (!iio_dev) {
+ dev_err(&pdev->dev, "failed allocating iio device\n");
+ return -ENOMEM;
+ }
+
+ adc = iio_priv(iio_dev);
+ adc->dev = &pdev->dev;
+ adc->rn5t618 = rn5t618;
+
+ if (rn5t618->irq_data)
+ adc->irq = regmap_irq_get_virq(rn5t618->irq_data,
+ RN5T618_IRQ_ADC);
+
+ if (adc->irq <= 0) {
+ dev_err(&pdev->dev, "get virq failed\n");
+ return -EINVAL;
+ }
+
+ init_completion(&adc->conv_completion);
+
+ iio_dev->name = dev_name(&pdev->dev);
+ iio_dev->dev.parent = &pdev->dev;
+ iio_dev->info = &rn5t618_adc_iio_info;
+ iio_dev->modes = INDIO_DIRECT_MODE;
+ iio_dev->channels = rn5t618_adc_iio_channels;
+ iio_dev->num_channels = ARRAY_SIZE(rn5t618_adc_iio_channels);
+
+ /* stop any auto-conversion */
+ ret = regmap_write(rn5t618->regmap, RN5T618_ADCCNT3, 0);
+ if (ret < 0)
+ return ret;
+
+ platform_set_drvdata(pdev, iio_dev);
+
+ ret = devm_request_threaded_irq(adc->dev, adc->irq, NULL,
+ rn5t618_adc_irq,
+ IRQF_ONESHOT, dev_name(adc->dev),
+ adc);
+ if (ret < 0) {
+ dev_err(adc->dev, "request irq %d failed: %d\n", adc->irq, ret);
+ return ret;
+ }
+
+ return devm_iio_device_register(adc->dev, iio_dev);
+}
+
+static struct platform_driver rn5t618_adc_driver = {
+ .driver = {
+ .name = "rn5t618-adc",
+ },
+ .probe = rn5t618_adc_probe,
+};
+
+module_platform_driver(rn5t618_adc_driver);
+MODULE_ALIAS("platform:rn5t618-adc");
+MODULE_DESCRIPTION("RICOH RN5T618 ADC driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/iio/common/cros_ec_sensors/cros_ec_lid_angle.c b/drivers/iio/common/cros_ec_sensors/cros_ec_lid_angle.c
index 1dcc2a16ab2d..af801e203623 100644
--- a/drivers/iio/common/cros_ec_sensors/cros_ec_lid_angle.c
+++ b/drivers/iio/common/cros_ec_sensors/cros_ec_lid_angle.c
@@ -97,7 +97,7 @@ static int cros_ec_lid_angle_probe(struct platform_device *pdev)
if (!indio_dev)
return -ENOMEM;
- ret = cros_ec_sensors_core_init(pdev, indio_dev, false);
+ ret = cros_ec_sensors_core_init(pdev, indio_dev, false, NULL, NULL);
if (ret)
return ret;
@@ -127,7 +127,6 @@ MODULE_DEVICE_TABLE(platform, cros_ec_lid_angle_ids);
static struct platform_driver cros_ec_lid_angle_platform_driver = {
.driver = {
.name = DRV_NAME,
- .pm = &cros_ec_sensors_pm_ops,
},
.probe = cros_ec_lid_angle_probe,
.id_table = cros_ec_lid_angle_ids,
diff --git a/drivers/iio/common/cros_ec_sensors/cros_ec_sensors.c b/drivers/iio/common/cros_ec_sensors/cros_ec_sensors.c
index 576e45faafaf..a66941fdb385 100644
--- a/drivers/iio/common/cros_ec_sensors/cros_ec_sensors.c
+++ b/drivers/iio/common/cros_ec_sensors/cros_ec_sensors.c
@@ -230,10 +230,14 @@ static int cros_ec_sensors_probe(struct platform_device *pdev)
if (!indio_dev)
return -ENOMEM;
- ret = cros_ec_sensors_core_init(pdev, indio_dev, true);
+ ret = cros_ec_sensors_core_init(pdev, indio_dev, true,
+ cros_ec_sensors_capture,
+ cros_ec_sensors_push_data);
if (ret)
return ret;
+ iio_buffer_set_attrs(indio_dev->buffer, cros_ec_sensor_fifo_attributes);
+
indio_dev->info = &ec_sensors_info;
state = iio_priv(indio_dev);
for (channel = state->channels, i = CROS_EC_SENSOR_X;
@@ -245,7 +249,6 @@ static int cros_ec_sensors_probe(struct platform_device *pdev)
BIT(IIO_CHAN_INFO_CALIBSCALE);
channel->info_mask_shared_by_all =
BIT(IIO_CHAN_INFO_SCALE) |
- BIT(IIO_CHAN_INFO_FREQUENCY) |
BIT(IIO_CHAN_INFO_SAMP_FREQ);
channel->info_mask_shared_by_all_available =
BIT(IIO_CHAN_INFO_SAMP_FREQ);
@@ -292,11 +295,6 @@ static int cros_ec_sensors_probe(struct platform_device *pdev)
else
state->core.read_ec_sensors_data = cros_ec_sensors_read_cmd;
- ret = devm_iio_triggered_buffer_setup(dev, indio_dev, NULL,
- cros_ec_sensors_capture, NULL);
- if (ret)
- return ret;
-
return devm_iio_device_register(dev, indio_dev);
}
@@ -317,7 +315,6 @@ MODULE_DEVICE_TABLE(platform, cros_ec_sensors_ids);
static struct platform_driver cros_ec_sensors_platform_driver = {
.driver = {
.name = "cros-ec-sensors",
- .pm = &cros_ec_sensors_pm_ops,
},
.probe = cros_ec_sensors_probe,
.id_table = cros_ec_sensors_ids,
diff --git a/drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c b/drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c
index d3a3626c7cd8..c831915ca7e5 100644
--- a/drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c
+++ b/drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c
@@ -11,7 +11,9 @@
#include <linux/iio/common/cros_ec_sensors_core.h>
#include <linux/iio/iio.h>
#include <linux/iio/kfifo_buf.h>
+#include <linux/iio/sysfs.h>
#include <linux/iio/trigger_consumer.h>
+#include <linux/iio/triggered_buffer.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
@@ -20,6 +22,12 @@
#include <linux/platform_data/cros_ec_sensorhub.h>
#include <linux/platform_device.h>
+/*
+ * Hard coded to the first device to support sensor fifo. The EC has a 2048
+ * byte fifo and will trigger an interrupt when fifo is 2/3 full.
+ */
+#define CROS_EC_FIFO_SIZE (2048 * 2 / 3)
+
static char *cros_ec_loc[] = {
[MOTIONSENSE_LOC_BASE] = "base",
[MOTIONSENSE_LOC_LID] = "lid",
@@ -53,8 +61,15 @@ static int cros_ec_get_host_cmd_version_mask(struct cros_ec_device *ec_dev,
static void get_default_min_max_freq(enum motionsensor_type type,
u32 *min_freq,
- u32 *max_freq)
+ u32 *max_freq,
+ u32 *max_fifo_events)
{
+ /*
+ * We don't know fifo size, set to size previously used by older
+ * hardware.
+ */
+ *max_fifo_events = CROS_EC_FIFO_SIZE;
+
switch (type) {
case MOTIONSENSE_TYPE_ACCEL:
case MOTIONSENSE_TYPE_GYRO:
@@ -82,9 +97,155 @@ static void get_default_min_max_freq(enum motionsensor_type type,
}
}
+static int cros_ec_sensor_set_ec_rate(struct cros_ec_sensors_core_state *st,
+ int rate)
+{
+ int ret;
+
+ if (rate > U16_MAX)
+ rate = U16_MAX;
+
+ mutex_lock(&st->cmd_lock);
+ st->param.cmd = MOTIONSENSE_CMD_EC_RATE;
+ st->param.ec_rate.data = rate;
+ ret = cros_ec_motion_send_host_cmd(st, 0);
+ mutex_unlock(&st->cmd_lock);
+ return ret;
+}
+
+static ssize_t cros_ec_sensor_set_report_latency(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct iio_dev *indio_dev = dev_to_iio_dev(dev);
+ struct cros_ec_sensors_core_state *st = iio_priv(indio_dev);
+ int integer, fract, ret;
+ int latency;
+
+ ret = iio_str_to_fixpoint(buf, 100000, &integer, &fract);
+ if (ret)
+ return ret;
+
+ /* EC rate is in ms. */
+ latency = integer * 1000 + fract / 1000;
+ ret = cros_ec_sensor_set_ec_rate(st, latency);
+ if (ret < 0)
+ return ret;
+
+ return len;
+}
+
+static ssize_t cros_ec_sensor_get_report_latency(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct iio_dev *indio_dev = dev_to_iio_dev(dev);
+ struct cros_ec_sensors_core_state *st = iio_priv(indio_dev);
+ int latency, ret;
+
+ mutex_lock(&st->cmd_lock);
+ st->param.cmd = MOTIONSENSE_CMD_EC_RATE;
+ st->param.ec_rate.data = EC_MOTION_SENSE_NO_VALUE;
+
+ ret = cros_ec_motion_send_host_cmd(st, 0);
+ latency = st->resp->ec_rate.ret;
+ mutex_unlock(&st->cmd_lock);
+ if (ret < 0)
+ return ret;
+
+ return sprintf(buf, "%d.%06u\n",
+ latency / 1000,
+ (latency % 1000) * 1000);
+}
+
+static IIO_DEVICE_ATTR(hwfifo_timeout, 0644,
+ cros_ec_sensor_get_report_latency,
+ cros_ec_sensor_set_report_latency, 0);
+
+static ssize_t hwfifo_watermark_max_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct iio_dev *indio_dev = dev_to_iio_dev(dev);
+ struct cros_ec_sensors_core_state *st = iio_priv(indio_dev);
+
+ return sprintf(buf, "%d\n", st->fifo_max_event_count);
+}
+
+static IIO_DEVICE_ATTR_RO(hwfifo_watermark_max, 0);
+
+const struct attribute *cros_ec_sensor_fifo_attributes[] = {
+ &iio_dev_attr_hwfifo_timeout.dev_attr.attr,
+ &iio_dev_attr_hwfifo_watermark_max.dev_attr.attr,
+ NULL,
+};
+EXPORT_SYMBOL_GPL(cros_ec_sensor_fifo_attributes);
+
+int cros_ec_sensors_push_data(struct iio_dev *indio_dev,
+ s16 *data,
+ s64 timestamp)
+{
+ struct cros_ec_sensors_core_state *st = iio_priv(indio_dev);
+ s16 *out;
+ s64 delta;
+ unsigned int i;
+
+ /*
+ * Ignore samples if the buffer is not set: it is needed if the ODR is
+ * set but the buffer is not enabled yet.
+ */
+ if (!iio_buffer_enabled(indio_dev))
+ return 0;
+
+ out = (s16 *)st->samples;
+ for_each_set_bit(i,
+ indio_dev->active_scan_mask,
+ indio_dev->masklength) {
+ *out = data[i];
+ out++;
+ }
+
+ if (iio_device_get_clock(indio_dev) != CLOCK_BOOTTIME)
+ delta = iio_get_time_ns(indio_dev) - cros_ec_get_time_ns();
+ else
+ delta = 0;
+
+ iio_push_to_buffers_with_timestamp(indio_dev, st->samples,
+ timestamp + delta);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(cros_ec_sensors_push_data);
+
+static void cros_ec_sensors_core_clean(void *arg)
+{
+ struct platform_device *pdev = (struct platform_device *)arg;
+ struct cros_ec_sensorhub *sensor_hub =
+ dev_get_drvdata(pdev->dev.parent);
+ struct iio_dev *indio_dev = platform_get_drvdata(pdev);
+ struct cros_ec_sensors_core_state *st = iio_priv(indio_dev);
+ u8 sensor_num = st->param.info.sensor_num;
+
+ cros_ec_sensorhub_unregister_push_data(sensor_hub, sensor_num);
+}
+
+/**
+ * cros_ec_sensors_core_init() - basic initialization of the core structure
+ * @pdev: platform device created for the sensors
+ * @indio_dev: iio device structure of the device
+ * @physical_device: true if the device refers to a physical device
+ * @trigger_capture: function pointer to call buffer is triggered,
+ * for backward compatibility.
+ * @push_data: function to call when cros_ec_sensorhub receives
+ * a sample for that sensor.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
int cros_ec_sensors_core_init(struct platform_device *pdev,
struct iio_dev *indio_dev,
- bool physical_device)
+ bool physical_device,
+ cros_ec_sensors_capture_t trigger_capture,
+ cros_ec_sensorhub_push_data_cb_t push_data)
{
struct device *dev = &pdev->dev;
struct cros_ec_sensors_core_state *state = iio_priv(indio_dev);
@@ -92,6 +253,7 @@ int cros_ec_sensors_core_init(struct platform_device *pdev,
struct cros_ec_dev *ec = sensor_hub->ec;
struct cros_ec_sensor_platform *sensor_platform = dev_get_platdata(dev);
u32 ver_mask;
+ int frequencies[ARRAY_SIZE(state->frequencies) / 2] = { 0 };
int ret, i;
platform_set_drvdata(pdev, indio_dev);
@@ -123,8 +285,6 @@ int cros_ec_sensors_core_init(struct platform_device *pdev,
indio_dev->name = pdev->name;
if (physical_device) {
- indio_dev->modes = INDIO_DIRECT_MODE;
-
state->param.cmd = MOTIONSENSE_CMD_INFO;
state->param.info.sensor_num = sensor_platform->sensor_num;
ret = cros_ec_motion_send_host_cmd(state, 0);
@@ -142,16 +302,63 @@ int cros_ec_sensors_core_init(struct platform_device *pdev,
state->calib[i].scale = MOTION_SENSE_DEFAULT_SCALE;
/* 0 is a correct value used to stop the device */
- state->frequencies[0] = 0;
if (state->msg->version < 3) {
get_default_min_max_freq(state->resp->info.type,
- &state->frequencies[1],
- &state->frequencies[2]);
+ &frequencies[1],
+ &frequencies[2],
+ &state->fifo_max_event_count);
} else {
- state->frequencies[1] =
- state->resp->info_3.min_frequency;
- state->frequencies[2] =
- state->resp->info_3.max_frequency;
+ frequencies[1] = state->resp->info_3.min_frequency;
+ frequencies[2] = state->resp->info_3.max_frequency;
+ state->fifo_max_event_count =
+ state->resp->info_3.fifo_max_event_count;
+ }
+ for (i = 0; i < ARRAY_SIZE(frequencies); i++) {
+ state->frequencies[2 * i] = frequencies[i] / 1000;
+ state->frequencies[2 * i + 1] =
+ (frequencies[i] % 1000) * 1000;
+ }
+
+ if (cros_ec_check_features(ec, EC_FEATURE_MOTION_SENSE_FIFO)) {
+ /*
+ * Create a software buffer, feed by the EC FIFO.
+ * We can not use trigger here, as events are generated
+ * as soon as sample_frequency is set.
+ */
+ struct iio_buffer *buffer;
+
+ buffer = devm_iio_kfifo_allocate(dev);
+ if (!buffer)
+ return -ENOMEM;
+
+ iio_device_attach_buffer(indio_dev, buffer);
+ indio_dev->modes = INDIO_BUFFER_SOFTWARE;
+
+ ret = cros_ec_sensorhub_register_push_data(
+ sensor_hub, sensor_platform->sensor_num,
+ indio_dev, push_data);
+ if (ret)
+ return ret;
+
+ ret = devm_add_action_or_reset(
+ dev, cros_ec_sensors_core_clean, pdev);
+ if (ret)
+ return ret;
+
+ /* Timestamp coming from FIFO are in ns since boot. */
+ ret = iio_device_set_clock(indio_dev, CLOCK_BOOTTIME);
+ if (ret)
+ return ret;
+ } else {
+ /*
+ * The only way to get samples in buffer is to set a
+ * software tigger (systrig, hrtimer).
+ */
+ ret = devm_iio_triggered_buffer_setup(
+ dev, indio_dev, NULL, trigger_capture,
+ NULL);
+ if (ret)
+ return ret;
}
}
@@ -159,6 +366,16 @@ int cros_ec_sensors_core_init(struct platform_device *pdev,
}
EXPORT_SYMBOL_GPL(cros_ec_sensors_core_init);
+/**
+ * cros_ec_motion_send_host_cmd() - send motion sense host command
+ * @state: pointer to state information for device
+ * @opt_length: optional length to reduce the response size, useful on the data
+ * path. Otherwise, the maximal allowed response size is used
+ *
+ * When called, the sub-command is assumed to be set in param->cmd.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
int cros_ec_motion_send_host_cmd(struct cros_ec_sensors_core_state *state,
u16 opt_length)
{
@@ -421,6 +638,14 @@ int cros_ec_sensors_read_lpc(struct iio_dev *indio_dev,
}
EXPORT_SYMBOL_GPL(cros_ec_sensors_read_lpc);
+/**
+ * cros_ec_sensors_read_cmd() - retrieve data using the EC command protocol
+ * @indio_dev: pointer to IIO device
+ * @scan_mask: bitmap of the sensor indices to scan
+ * @data: location to store data
+ *
+ * Return: 0 on success, -errno on failure.
+ */
int cros_ec_sensors_read_cmd(struct iio_dev *indio_dev,
unsigned long scan_mask, s16 *data)
{
@@ -445,6 +670,18 @@ int cros_ec_sensors_read_cmd(struct iio_dev *indio_dev,
}
EXPORT_SYMBOL_GPL(cros_ec_sensors_read_cmd);
+/**
+ * cros_ec_sensors_capture() - the trigger handler function
+ * @irq: the interrupt number.
+ * @p: a pointer to the poll function.
+ *
+ * On a trigger event occurring, if the pollfunc is attached then this
+ * handler is called as a threaded interrupt (and hence may sleep). It
+ * is responsible for grabbing data from the device and pushing it into
+ * the associated buffer.
+ *
+ * Return: IRQ_HANDLED
+ */
irqreturn_t cros_ec_sensors_capture(int irq, void *p)
{
struct iio_poll_func *pf = p;
@@ -480,26 +717,24 @@ done:
}
EXPORT_SYMBOL_GPL(cros_ec_sensors_capture);
+/**
+ * cros_ec_sensors_core_read() - function to request a value from the sensor
+ * @st: pointer to state information for device
+ * @chan: channel specification structure table
+ * @val: will contain one element making up the returned value
+ * @val2: will contain another element making up the returned value
+ * @mask: specifies which values to be requested
+ *
+ * Return: the type of value returned by the device
+ */
int cros_ec_sensors_core_read(struct cros_ec_sensors_core_state *st,
struct iio_chan_spec const *chan,
int *val, int *val2, long mask)
{
- int ret;
+ int ret, frequency;
switch (mask) {
case IIO_CHAN_INFO_SAMP_FREQ:
- st->param.cmd = MOTIONSENSE_CMD_EC_RATE;
- st->param.ec_rate.data =
- EC_MOTION_SENSE_NO_VALUE;
-
- ret = cros_ec_motion_send_host_cmd(st, 0);
- if (ret)
- break;
-
- *val = st->resp->ec_rate.ret;
- ret = IIO_VAL_INT;
- break;
- case IIO_CHAN_INFO_FREQUENCY:
st->param.cmd = MOTIONSENSE_CMD_SENSOR_ODR;
st->param.sensor_odr.data =
EC_MOTION_SENSE_NO_VALUE;
@@ -508,8 +743,10 @@ int cros_ec_sensors_core_read(struct cros_ec_sensors_core_state *st,
if (ret)
break;
- *val = st->resp->sensor_odr.ret;
- ret = IIO_VAL_INT;
+ frequency = st->resp->sensor_odr.ret;
+ *val = frequency / 1000;
+ *val2 = (frequency % 1000) * 1000;
+ ret = IIO_VAL_INT_PLUS_MICRO;
break;
default:
ret = -EINVAL;
@@ -520,6 +757,17 @@ int cros_ec_sensors_core_read(struct cros_ec_sensors_core_state *st,
}
EXPORT_SYMBOL_GPL(cros_ec_sensors_core_read);
+/**
+ * cros_ec_sensors_core_read_avail() - get available values
+ * @indio_dev: pointer to state information for device
+ * @chan: channel specification structure table
+ * @vals: list of available values
+ * @type: type of data returned
+ * @length: number of data returned in the array
+ * @mask: specifies which values to be requested
+ *
+ * Return: an error code, IIO_AVAIL_RANGE or IIO_AVAIL_LIST
+ */
int cros_ec_sensors_core_read_avail(struct iio_dev *indio_dev,
struct iio_chan_spec const *chan,
const int **vals,
@@ -533,7 +781,7 @@ int cros_ec_sensors_core_read_avail(struct iio_dev *indio_dev,
case IIO_CHAN_INFO_SAMP_FREQ:
*length = ARRAY_SIZE(state->frequencies);
*vals = (const int *)&state->frequencies;
- *type = IIO_VAL_INT;
+ *type = IIO_VAL_INT_PLUS_MICRO;
return IIO_AVAIL_LIST;
}
@@ -541,31 +789,33 @@ int cros_ec_sensors_core_read_avail(struct iio_dev *indio_dev,
}
EXPORT_SYMBOL_GPL(cros_ec_sensors_core_read_avail);
+/**
+ * cros_ec_sensors_core_write() - function to write a value to the sensor
+ * @st: pointer to state information for device
+ * @chan: channel specification structure table
+ * @val: first part of value to write
+ * @val2: second part of value to write
+ * @mask: specifies which values to write
+ *
+ * Return: the type of value returned by the device
+ */
int cros_ec_sensors_core_write(struct cros_ec_sensors_core_state *st,
struct iio_chan_spec const *chan,
int val, int val2, long mask)
{
- int ret;
+ int ret, frequency;
switch (mask) {
- case IIO_CHAN_INFO_FREQUENCY:
+ case IIO_CHAN_INFO_SAMP_FREQ:
+ frequency = val * 1000 + val2 / 1000;
st->param.cmd = MOTIONSENSE_CMD_SENSOR_ODR;
- st->param.sensor_odr.data = val;
+ st->param.sensor_odr.data = frequency;
/* Always roundup, so caller gets at least what it asks for. */
st->param.sensor_odr.roundup = 1;
ret = cros_ec_motion_send_host_cmd(st, 0);
break;
- case IIO_CHAN_INFO_SAMP_FREQ:
- st->param.cmd = MOTIONSENSE_CMD_EC_RATE;
- st->param.ec_rate.data = val;
-
- ret = cros_ec_motion_send_host_cmd(st, 0);
- if (ret)
- break;
- st->curr_sampl_freq = val;
- break;
default:
ret = -EINVAL;
break;
@@ -574,52 +824,5 @@ int cros_ec_sensors_core_write(struct cros_ec_sensors_core_state *st,
}
EXPORT_SYMBOL_GPL(cros_ec_sensors_core_write);
-static int __maybe_unused cros_ec_sensors_prepare(struct device *dev)
-{
- struct iio_dev *indio_dev = dev_get_drvdata(dev);
- struct cros_ec_sensors_core_state *st = iio_priv(indio_dev);
-
- if (st->curr_sampl_freq == 0)
- return 0;
-
- /*
- * If the sensors are sampled at high frequency, we will not be able to
- * sleep. Set sampling to a long period if necessary.
- */
- if (st->curr_sampl_freq < CROS_EC_MIN_SUSPEND_SAMPLING_FREQUENCY) {
- mutex_lock(&st->cmd_lock);
- st->param.cmd = MOTIONSENSE_CMD_EC_RATE;
- st->param.ec_rate.data = CROS_EC_MIN_SUSPEND_SAMPLING_FREQUENCY;
- cros_ec_motion_send_host_cmd(st, 0);
- mutex_unlock(&st->cmd_lock);
- }
- return 0;
-}
-
-static void __maybe_unused cros_ec_sensors_complete(struct device *dev)
-{
- struct iio_dev *indio_dev = dev_get_drvdata(dev);
- struct cros_ec_sensors_core_state *st = iio_priv(indio_dev);
-
- if (st->curr_sampl_freq == 0)
- return;
-
- if (st->curr_sampl_freq < CROS_EC_MIN_SUSPEND_SAMPLING_FREQUENCY) {
- mutex_lock(&st->cmd_lock);
- st->param.cmd = MOTIONSENSE_CMD_EC_RATE;
- st->param.ec_rate.data = st->curr_sampl_freq;
- cros_ec_motion_send_host_cmd(st, 0);
- mutex_unlock(&st->cmd_lock);
- }
-}
-
-const struct dev_pm_ops cros_ec_sensors_pm_ops = {
-#ifdef CONFIG_PM_SLEEP
- .prepare = cros_ec_sensors_prepare,
- .complete = cros_ec_sensors_complete
-#endif
-};
-EXPORT_SYMBOL_GPL(cros_ec_sensors_pm_ops);
-
MODULE_DESCRIPTION("ChromeOS EC sensor hub core functions");
MODULE_LICENSE("GPL v2");
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index eac63c1bb8da..2352c426bfb5 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -189,7 +189,12 @@ ssize_t iio_read_const_attr(struct device *dev,
}
EXPORT_SYMBOL(iio_read_const_attr);
-static int iio_device_set_clock(struct iio_dev *indio_dev, clockid_t clock_id)
+/**
+ * iio_device_set_clock() - Set current timestamping clock for the device
+ * @indio_dev: IIO device structure containing the device
+ * @clock_id: timestamping clock posix identifier to set.
+ */
+int iio_device_set_clock(struct iio_dev *indio_dev, clockid_t clock_id)
{
int ret;
const struct iio_event_interface *ev_int = indio_dev->event_interface;
@@ -207,6 +212,7 @@ static int iio_device_set_clock(struct iio_dev *indio_dev, clockid_t clock_id)
return 0;
}
+EXPORT_SYMBOL(iio_device_set_clock);
/**
* iio_get_time_ns() - utility function to get a time stamp for events etc
diff --git a/drivers/iio/light/Kconfig b/drivers/iio/light/Kconfig
index 74970f18a93b..b27719cefcf9 100644
--- a/drivers/iio/light/Kconfig
+++ b/drivers/iio/light/Kconfig
@@ -194,6 +194,16 @@ config GP2AP020A00F
To compile this driver as a module, choose M here: the
module will be called gp2ap020a00f.
+config IQS621_ALS
+ tristate "Azoteq IQS621/622 ambient light sensors"
+ depends on MFD_IQS62X || COMPILE_TEST
+ help
+ Say Y here if you want to build support for the Azoteq IQS621
+ and IQS622 ambient light sensors.
+
+ To compile this driver as a module, choose M here: the module
+ will be called iqs621-als.
+
config SENSORS_ISL29018
tristate "Intersil 29018 light and proximity sensor"
depends on I2C
diff --git a/drivers/iio/light/Makefile b/drivers/iio/light/Makefile
index 5c1ebaf578fb..d1c8aa30b9a8 100644
--- a/drivers/iio/light/Makefile
+++ b/drivers/iio/light/Makefile
@@ -23,6 +23,7 @@ obj-$(CONFIG_GP2AP002) += gp2ap002.o
obj-$(CONFIG_GP2AP020A00F) += gp2ap020a00f.o
obj-$(CONFIG_HID_SENSOR_ALS) += hid-sensor-als.o
obj-$(CONFIG_HID_SENSOR_PROX) += hid-sensor-prox.o
+obj-$(CONFIG_IQS621_ALS) += iqs621-als.o
obj-$(CONFIG_SENSORS_ISL29018) += isl29018.o
obj-$(CONFIG_SENSORS_ISL29028) += isl29028.o
obj-$(CONFIG_ISL29125) += isl29125.o
diff --git a/drivers/iio/light/cros_ec_light_prox.c b/drivers/iio/light/cros_ec_light_prox.c
index 7a838e2956f4..2198b50909ed 100644
--- a/drivers/iio/light/cros_ec_light_prox.c
+++ b/drivers/iio/light/cros_ec_light_prox.c
@@ -177,10 +177,14 @@ static int cros_ec_light_prox_probe(struct platform_device *pdev)
if (!indio_dev)
return -ENOMEM;
- ret = cros_ec_sensors_core_init(pdev, indio_dev, true);
+ ret = cros_ec_sensors_core_init(pdev, indio_dev, true,
+ cros_ec_sensors_capture,
+ cros_ec_sensors_push_data);
if (ret)
return ret;
+ iio_buffer_set_attrs(indio_dev->buffer, cros_ec_sensor_fifo_attributes);
+
indio_dev->info = &cros_ec_light_prox_info;
state = iio_priv(indio_dev);
state->core.type = state->core.resp->info.type;
@@ -189,8 +193,7 @@ static int cros_ec_light_prox_probe(struct platform_device *pdev)
/* Common part */
channel->info_mask_shared_by_all =
- BIT(IIO_CHAN_INFO_SAMP_FREQ) |
- BIT(IIO_CHAN_INFO_FREQUENCY);
+ BIT(IIO_CHAN_INFO_SAMP_FREQ);
channel->info_mask_shared_by_all_available =
BIT(IIO_CHAN_INFO_SAMP_FREQ);
channel->scan_type.realbits = CROS_EC_SENSOR_BITS;
@@ -236,11 +239,6 @@ static int cros_ec_light_prox_probe(struct platform_device *pdev)
state->core.read_ec_sensors_data = cros_ec_sensors_read_cmd;
- ret = devm_iio_triggered_buffer_setup(dev, indio_dev, NULL,
- cros_ec_sensors_capture, NULL);
- if (ret)
- return ret;
-
return devm_iio_device_register(dev, indio_dev);
}
@@ -258,7 +256,6 @@ MODULE_DEVICE_TABLE(platform, cros_ec_light_prox_ids);
static struct platform_driver cros_ec_light_prox_platform_driver = {
.driver = {
.name = "cros-ec-light-prox",
- .pm = &cros_ec_sensors_pm_ops,
},
.probe = cros_ec_light_prox_probe,
.id_table = cros_ec_light_prox_ids,
diff --git a/drivers/iio/light/iqs621-als.c b/drivers/iio/light/iqs621-als.c
new file mode 100644
index 000000000000..b2988a782bd0
--- /dev/null
+++ b/drivers/iio/light/iqs621-als.c
@@ -0,0 +1,617 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Azoteq IQS621/622 Ambient Light Sensors
+ *
+ * Copyright (C) 2019 Jeff LaBundy <jeff@labundy.com>
+ */
+
+#include <linux/device.h>
+#include <linux/iio/events.h>
+#include <linux/iio/iio.h>
+#include <linux/kernel.h>
+#include <linux/mfd/iqs62x.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+
+#define IQS621_ALS_FLAGS_LIGHT BIT(7)
+#define IQS621_ALS_FLAGS_RANGE GENMASK(3, 0)
+
+#define IQS621_ALS_UI_OUT 0x17
+
+#define IQS621_ALS_THRESH_DARK 0x80
+#define IQS621_ALS_THRESH_LIGHT 0x81
+
+#define IQS622_IR_RANGE 0x15
+#define IQS622_IR_FLAGS 0x16
+#define IQS622_IR_FLAGS_TOUCH BIT(1)
+#define IQS622_IR_FLAGS_PROX BIT(0)
+
+#define IQS622_IR_UI_OUT 0x17
+
+#define IQS622_IR_THRESH_PROX 0x91
+#define IQS622_IR_THRESH_TOUCH 0x92
+
+struct iqs621_als_private {
+ struct iqs62x_core *iqs62x;
+ struct notifier_block notifier;
+ struct mutex lock;
+ bool light_en;
+ bool range_en;
+ bool prox_en;
+ u8 als_flags;
+ u8 ir_flags_mask;
+ u8 ir_flags;
+ u8 thresh_light;
+ u8 thresh_dark;
+ u8 thresh_prox;
+};
+
+static int iqs621_als_init(struct iqs621_als_private *iqs621_als)
+{
+ struct iqs62x_core *iqs62x = iqs621_als->iqs62x;
+ unsigned int event_mask = 0;
+ int ret;
+
+ switch (iqs621_als->ir_flags_mask) {
+ case IQS622_IR_FLAGS_TOUCH:
+ ret = regmap_write(iqs62x->regmap, IQS622_IR_THRESH_TOUCH,
+ iqs621_als->thresh_prox);
+ break;
+
+ case IQS622_IR_FLAGS_PROX:
+ ret = regmap_write(iqs62x->regmap, IQS622_IR_THRESH_PROX,
+ iqs621_als->thresh_prox);
+ break;
+
+ default:
+ ret = regmap_write(iqs62x->regmap, IQS621_ALS_THRESH_LIGHT,
+ iqs621_als->thresh_light);
+ if (ret)
+ return ret;
+
+ ret = regmap_write(iqs62x->regmap, IQS621_ALS_THRESH_DARK,
+ iqs621_als->thresh_dark);
+ }
+
+ if (ret)
+ return ret;
+
+ if (iqs621_als->light_en || iqs621_als->range_en)
+ event_mask |= iqs62x->dev_desc->als_mask;
+
+ if (iqs621_als->prox_en)
+ event_mask |= iqs62x->dev_desc->ir_mask;
+
+ return regmap_update_bits(iqs62x->regmap, IQS620_GLBL_EVENT_MASK,
+ event_mask, 0);
+}
+
+static int iqs621_als_notifier(struct notifier_block *notifier,
+ unsigned long event_flags, void *context)
+{
+ struct iqs62x_event_data *event_data = context;
+ struct iqs621_als_private *iqs621_als;
+ struct iio_dev *indio_dev;
+ bool light_new, light_old;
+ bool prox_new, prox_old;
+ u8 range_new, range_old;
+ s64 timestamp;
+ int ret;
+
+ iqs621_als = container_of(notifier, struct iqs621_als_private,
+ notifier);
+ indio_dev = iio_priv_to_dev(iqs621_als);
+ timestamp = iio_get_time_ns(indio_dev);
+
+ mutex_lock(&iqs621_als->lock);
+
+ if (event_flags & BIT(IQS62X_EVENT_SYS_RESET)) {
+ ret = iqs621_als_init(iqs621_als);
+ if (ret) {
+ dev_err(indio_dev->dev.parent,
+ "Failed to re-initialize device: %d\n", ret);
+ ret = NOTIFY_BAD;
+ } else {
+ ret = NOTIFY_OK;
+ }
+
+ goto err_mutex;
+ }
+
+ if (!iqs621_als->light_en && !iqs621_als->range_en &&
+ !iqs621_als->prox_en) {
+ ret = NOTIFY_DONE;
+ goto err_mutex;
+ }
+
+ /* IQS621 only */
+ light_new = event_data->als_flags & IQS621_ALS_FLAGS_LIGHT;
+ light_old = iqs621_als->als_flags & IQS621_ALS_FLAGS_LIGHT;
+
+ if (iqs621_als->light_en && light_new && !light_old)
+ iio_push_event(indio_dev,
+ IIO_UNMOD_EVENT_CODE(IIO_LIGHT, 0,
+ IIO_EV_TYPE_THRESH,
+ IIO_EV_DIR_RISING),
+ timestamp);
+ else if (iqs621_als->light_en && !light_new && light_old)
+ iio_push_event(indio_dev,
+ IIO_UNMOD_EVENT_CODE(IIO_LIGHT, 0,
+ IIO_EV_TYPE_THRESH,
+ IIO_EV_DIR_FALLING),
+ timestamp);
+
+ /* IQS621 and IQS622 */
+ range_new = event_data->als_flags & IQS621_ALS_FLAGS_RANGE;
+ range_old = iqs621_als->als_flags & IQS621_ALS_FLAGS_RANGE;
+
+ if (iqs621_als->range_en && (range_new > range_old))
+ iio_push_event(indio_dev,
+ IIO_UNMOD_EVENT_CODE(IIO_INTENSITY, 0,
+ IIO_EV_TYPE_CHANGE,
+ IIO_EV_DIR_RISING),
+ timestamp);
+ else if (iqs621_als->range_en && (range_new < range_old))
+ iio_push_event(indio_dev,
+ IIO_UNMOD_EVENT_CODE(IIO_INTENSITY, 0,
+ IIO_EV_TYPE_CHANGE,
+ IIO_EV_DIR_FALLING),
+ timestamp);
+
+ /* IQS622 only */
+ prox_new = event_data->ir_flags & iqs621_als->ir_flags_mask;
+ prox_old = iqs621_als->ir_flags & iqs621_als->ir_flags_mask;
+
+ if (iqs621_als->prox_en && prox_new && !prox_old)
+ iio_push_event(indio_dev,
+ IIO_UNMOD_EVENT_CODE(IIO_PROXIMITY, 0,
+ IIO_EV_TYPE_THRESH,
+ IIO_EV_DIR_RISING),
+ timestamp);
+ else if (iqs621_als->prox_en && !prox_new && prox_old)
+ iio_push_event(indio_dev,
+ IIO_UNMOD_EVENT_CODE(IIO_PROXIMITY, 0,
+ IIO_EV_TYPE_THRESH,
+ IIO_EV_DIR_FALLING),
+ timestamp);
+
+ iqs621_als->als_flags = event_data->als_flags;
+ iqs621_als->ir_flags = event_data->ir_flags;
+ ret = NOTIFY_OK;
+
+err_mutex:
+ mutex_unlock(&iqs621_als->lock);
+
+ return ret;
+}
+
+static void iqs621_als_notifier_unregister(void *context)
+{
+ struct iqs621_als_private *iqs621_als = context;
+ struct iio_dev *indio_dev = iio_priv_to_dev(iqs621_als);
+ int ret;
+
+ ret = blocking_notifier_chain_unregister(&iqs621_als->iqs62x->nh,
+ &iqs621_als->notifier);
+ if (ret)
+ dev_err(indio_dev->dev.parent,
+ "Failed to unregister notifier: %d\n", ret);
+}
+
+static int iqs621_als_read_raw(struct iio_dev *indio_dev,
+ struct iio_chan_spec const *chan,
+ int *val, int *val2, long mask)
+{
+ struct iqs621_als_private *iqs621_als = iio_priv(indio_dev);
+ struct iqs62x_core *iqs62x = iqs621_als->iqs62x;
+ int ret;
+ __le16 val_buf;
+
+ switch (chan->type) {
+ case IIO_INTENSITY:
+ ret = regmap_read(iqs62x->regmap, chan->address, val);
+ if (ret)
+ return ret;
+
+ *val &= IQS621_ALS_FLAGS_RANGE;
+ return IIO_VAL_INT;
+
+ case IIO_PROXIMITY:
+ case IIO_LIGHT:
+ ret = regmap_raw_read(iqs62x->regmap, chan->address, &val_buf,
+ sizeof(val_buf));
+ if (ret)
+ return ret;
+
+ *val = le16_to_cpu(val_buf);
+ return IIO_VAL_INT;
+
+ default:
+ return -EINVAL;
+ }
+}
+
+static int iqs621_als_read_event_config(struct iio_dev *indio_dev,
+ const struct iio_chan_spec *chan,
+ enum iio_event_type type,
+ enum iio_event_direction dir)
+{
+ struct iqs621_als_private *iqs621_als = iio_priv(indio_dev);
+ int ret;
+
+ mutex_lock(&iqs621_als->lock);
+
+ switch (chan->type) {
+ case IIO_LIGHT:
+ ret = iqs621_als->light_en;
+ break;
+
+ case IIO_INTENSITY:
+ ret = iqs621_als->range_en;
+ break;
+
+ case IIO_PROXIMITY:
+ ret = iqs621_als->prox_en;
+ break;
+
+ default:
+ ret = -EINVAL;
+ }
+
+ mutex_unlock(&iqs621_als->lock);
+
+ return ret;
+}
+
+static int iqs621_als_write_event_config(struct iio_dev *indio_dev,
+ const struct iio_chan_spec *chan,
+ enum iio_event_type type,
+ enum iio_event_direction dir,
+ int state)
+{
+ struct iqs621_als_private *iqs621_als = iio_priv(indio_dev);
+ struct iqs62x_core *iqs62x = iqs621_als->iqs62x;
+ unsigned int val;
+ int ret;
+
+ mutex_lock(&iqs621_als->lock);
+
+ ret = regmap_read(iqs62x->regmap, iqs62x->dev_desc->als_flags, &val);
+ if (ret)
+ goto err_mutex;
+ iqs621_als->als_flags = val;
+
+ switch (chan->type) {
+ case IIO_LIGHT:
+ ret = regmap_update_bits(iqs62x->regmap, IQS620_GLBL_EVENT_MASK,
+ iqs62x->dev_desc->als_mask,
+ iqs621_als->range_en || state ? 0 :
+ 0xFF);
+ if (!ret)
+ iqs621_als->light_en = state;
+ break;
+
+ case IIO_INTENSITY:
+ ret = regmap_update_bits(iqs62x->regmap, IQS620_GLBL_EVENT_MASK,
+ iqs62x->dev_desc->als_mask,
+ iqs621_als->light_en || state ? 0 :
+ 0xFF);
+ if (!ret)
+ iqs621_als->range_en = state;
+ break;
+
+ case IIO_PROXIMITY:
+ ret = regmap_read(iqs62x->regmap, IQS622_IR_FLAGS, &val);
+ if (ret)
+ goto err_mutex;
+ iqs621_als->ir_flags = val;
+
+ ret = regmap_update_bits(iqs62x->regmap, IQS620_GLBL_EVENT_MASK,
+ iqs62x->dev_desc->ir_mask,
+ state ? 0 : 0xFF);
+ if (!ret)
+ iqs621_als->prox_en = state;
+ break;
+
+ default:
+ ret = -EINVAL;
+ }
+
+err_mutex:
+ mutex_unlock(&iqs621_als->lock);
+
+ return ret;
+}
+
+static int iqs621_als_read_event_value(struct iio_dev *indio_dev,
+ const struct iio_chan_spec *chan,
+ enum iio_event_type type,
+ enum iio_event_direction dir,
+ enum iio_event_info info,
+ int *val, int *val2)
+{
+ struct iqs621_als_private *iqs621_als = iio_priv(indio_dev);
+ int ret = IIO_VAL_INT;
+
+ mutex_lock(&iqs621_als->lock);
+
+ switch (dir) {
+ case IIO_EV_DIR_RISING:
+ *val = iqs621_als->thresh_light * 16;
+ break;
+
+ case IIO_EV_DIR_FALLING:
+ *val = iqs621_als->thresh_dark * 4;
+ break;
+
+ case IIO_EV_DIR_EITHER:
+ if (iqs621_als->ir_flags_mask == IQS622_IR_FLAGS_TOUCH)
+ *val = iqs621_als->thresh_prox * 4;
+ else
+ *val = iqs621_als->thresh_prox;
+ break;
+
+ default:
+ ret = -EINVAL;
+ }
+
+ mutex_unlock(&iqs621_als->lock);
+
+ return ret;
+}
+
+static int iqs621_als_write_event_value(struct iio_dev *indio_dev,
+ const struct iio_chan_spec *chan,
+ enum iio_event_type type,
+ enum iio_event_direction dir,
+ enum iio_event_info info,
+ int val, int val2)
+{
+ struct iqs621_als_private *iqs621_als = iio_priv(indio_dev);
+ struct iqs62x_core *iqs62x = iqs621_als->iqs62x;
+ unsigned int thresh_reg, thresh_val;
+ u8 ir_flags_mask, *thresh_cache;
+ int ret = -EINVAL;
+
+ mutex_lock(&iqs621_als->lock);
+
+ switch (dir) {
+ case IIO_EV_DIR_RISING:
+ thresh_reg = IQS621_ALS_THRESH_LIGHT;
+ thresh_val = val / 16;
+
+ thresh_cache = &iqs621_als->thresh_light;
+ ir_flags_mask = 0;
+ break;
+
+ case IIO_EV_DIR_FALLING:
+ thresh_reg = IQS621_ALS_THRESH_DARK;
+ thresh_val = val / 4;
+
+ thresh_cache = &iqs621_als->thresh_dark;
+ ir_flags_mask = 0;
+ break;
+
+ case IIO_EV_DIR_EITHER:
+ /*
+ * The IQS622 supports two detection thresholds, both measured
+ * in the same arbitrary units reported by read_raw: proximity
+ * (0 through 255 in steps of 1), and touch (0 through 1020 in
+ * steps of 4).
+ *
+ * Based on the single detection threshold chosen by the user,
+ * select the hardware threshold that gives the best trade-off
+ * between range and resolution.
+ *
+ * By default, the close-range (but coarse) touch threshold is
+ * chosen during probe.
+ */
+ switch (val) {
+ case 0 ... 255:
+ thresh_reg = IQS622_IR_THRESH_PROX;
+ thresh_val = val;
+
+ ir_flags_mask = IQS622_IR_FLAGS_PROX;
+ break;
+
+ case 256 ... 1020:
+ thresh_reg = IQS622_IR_THRESH_TOUCH;
+ thresh_val = val / 4;
+
+ ir_flags_mask = IQS622_IR_FLAGS_TOUCH;
+ break;
+
+ default:
+ goto err_mutex;
+ }
+
+ thresh_cache = &iqs621_als->thresh_prox;
+ break;
+
+ default:
+ goto err_mutex;
+ }
+
+ if (thresh_val > 0xFF)
+ goto err_mutex;
+
+ ret = regmap_write(iqs62x->regmap, thresh_reg, thresh_val);
+ if (ret)
+ goto err_mutex;
+
+ *thresh_cache = thresh_val;
+ iqs621_als->ir_flags_mask = ir_flags_mask;
+
+err_mutex:
+ mutex_unlock(&iqs621_als->lock);
+
+ return ret;
+}
+
+static const struct iio_info iqs621_als_info = {
+ .read_raw = &iqs621_als_read_raw,
+ .read_event_config = iqs621_als_read_event_config,
+ .write_event_config = iqs621_als_write_event_config,
+ .read_event_value = iqs621_als_read_event_value,
+ .write_event_value = iqs621_als_write_event_value,
+};
+
+static const struct iio_event_spec iqs621_als_range_events[] = {
+ {
+ .type = IIO_EV_TYPE_CHANGE,
+ .dir = IIO_EV_DIR_EITHER,
+ .mask_separate = BIT(IIO_EV_INFO_ENABLE),
+ },
+};
+
+static const struct iio_event_spec iqs621_als_light_events[] = {
+ {
+ .type = IIO_EV_TYPE_THRESH,
+ .dir = IIO_EV_DIR_EITHER,
+ .mask_separate = BIT(IIO_EV_INFO_ENABLE),
+ },
+ {
+ .type = IIO_EV_TYPE_THRESH,
+ .dir = IIO_EV_DIR_RISING,
+ .mask_separate = BIT(IIO_EV_INFO_VALUE),
+ },
+ {
+ .type = IIO_EV_TYPE_THRESH,
+ .dir = IIO_EV_DIR_FALLING,
+ .mask_separate = BIT(IIO_EV_INFO_VALUE),
+ },
+};
+
+static const struct iio_chan_spec iqs621_als_channels[] = {
+ {
+ .type = IIO_INTENSITY,
+ .address = IQS621_ALS_FLAGS,
+ .info_mask_separate = BIT(IIO_CHAN_INFO_RAW),
+ .event_spec = iqs621_als_range_events,
+ .num_event_specs = ARRAY_SIZE(iqs621_als_range_events),
+ },
+ {
+ .type = IIO_LIGHT,
+ .address = IQS621_ALS_UI_OUT,
+ .info_mask_separate = BIT(IIO_CHAN_INFO_PROCESSED),
+ .event_spec = iqs621_als_light_events,
+ .num_event_specs = ARRAY_SIZE(iqs621_als_light_events),
+ },
+};
+
+static const struct iio_event_spec iqs622_als_prox_events[] = {
+ {
+ .type = IIO_EV_TYPE_THRESH,
+ .dir = IIO_EV_DIR_EITHER,
+ .mask_separate = BIT(IIO_EV_INFO_ENABLE) |
+ BIT(IIO_EV_INFO_VALUE),
+ },
+};
+
+static const struct iio_chan_spec iqs622_als_channels[] = {
+ {
+ .type = IIO_INTENSITY,
+ .channel2 = IIO_MOD_LIGHT_BOTH,
+ .address = IQS622_ALS_FLAGS,
+ .info_mask_separate = BIT(IIO_CHAN_INFO_RAW),
+ .event_spec = iqs621_als_range_events,
+ .num_event_specs = ARRAY_SIZE(iqs621_als_range_events),
+ .modified = true,
+ },
+ {
+ .type = IIO_INTENSITY,
+ .channel2 = IIO_MOD_LIGHT_IR,
+ .address = IQS622_IR_RANGE,
+ .info_mask_separate = BIT(IIO_CHAN_INFO_RAW),
+ .modified = true,
+ },
+ {
+ .type = IIO_PROXIMITY,
+ .address = IQS622_IR_UI_OUT,
+ .info_mask_separate = BIT(IIO_CHAN_INFO_RAW),
+ .event_spec = iqs622_als_prox_events,
+ .num_event_specs = ARRAY_SIZE(iqs622_als_prox_events),
+ },
+};
+
+static int iqs621_als_probe(struct platform_device *pdev)
+{
+ struct iqs62x_core *iqs62x = dev_get_drvdata(pdev->dev.parent);
+ struct iqs621_als_private *iqs621_als;
+ struct iio_dev *indio_dev;
+ unsigned int val;
+ int ret;
+
+ indio_dev = devm_iio_device_alloc(&pdev->dev, sizeof(*iqs621_als));
+ if (!indio_dev)
+ return -ENOMEM;
+
+ iqs621_als = iio_priv(indio_dev);
+ iqs621_als->iqs62x = iqs62x;
+
+ if (iqs62x->dev_desc->prod_num == IQS622_PROD_NUM) {
+ ret = regmap_read(iqs62x->regmap, IQS622_IR_THRESH_TOUCH,
+ &val);
+ if (ret)
+ return ret;
+ iqs621_als->thresh_prox = val;
+ iqs621_als->ir_flags_mask = IQS622_IR_FLAGS_TOUCH;
+
+ indio_dev->channels = iqs622_als_channels;
+ indio_dev->num_channels = ARRAY_SIZE(iqs622_als_channels);
+ } else {
+ ret = regmap_read(iqs62x->regmap, IQS621_ALS_THRESH_LIGHT,
+ &val);
+ if (ret)
+ return ret;
+ iqs621_als->thresh_light = val;
+
+ ret = regmap_read(iqs62x->regmap, IQS621_ALS_THRESH_DARK,
+ &val);
+ if (ret)
+ return ret;
+ iqs621_als->thresh_dark = val;
+
+ indio_dev->channels = iqs621_als_channels;
+ indio_dev->num_channels = ARRAY_SIZE(iqs621_als_channels);
+ }
+
+ indio_dev->modes = INDIO_DIRECT_MODE;
+ indio_dev->dev.parent = &pdev->dev;
+ indio_dev->name = iqs62x->dev_desc->dev_name;
+ indio_dev->info = &iqs621_als_info;
+
+ mutex_init(&iqs621_als->lock);
+
+ iqs621_als->notifier.notifier_call = iqs621_als_notifier;
+ ret = blocking_notifier_chain_register(&iqs621_als->iqs62x->nh,
+ &iqs621_als->notifier);
+ if (ret) {
+ dev_err(&pdev->dev, "Failed to register notifier: %d\n", ret);
+ return ret;
+ }
+
+ ret = devm_add_action_or_reset(&pdev->dev,
+ iqs621_als_notifier_unregister,
+ iqs621_als);
+ if (ret)
+ return ret;
+
+ return devm_iio_device_register(&pdev->dev, indio_dev);
+}
+
+static struct platform_driver iqs621_als_platform_driver = {
+ .driver = {
+ .name = "iqs621-als",
+ },
+ .probe = iqs621_als_probe,
+};
+module_platform_driver(iqs621_als_platform_driver);
+
+MODULE_AUTHOR("Jeff LaBundy <jeff@labundy.com>");
+MODULE_DESCRIPTION("Azoteq IQS621/622 Ambient Light Sensors");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:iqs621-als");
diff --git a/drivers/iio/position/Kconfig b/drivers/iio/position/Kconfig
new file mode 100644
index 000000000000..eda67f008c5b
--- /dev/null
+++ b/drivers/iio/position/Kconfig
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Linear and angular position sensors
+#
+# When adding new entries keep the list in alphabetical order
+
+menu "Linear and angular position sensors"
+
+config IQS624_POS
+ tristate "Azoteq IQS624/625 angular position sensors"
+ depends on MFD_IQS62X || COMPILE_TEST
+ help
+ Say Y here if you want to build support for the Azoteq IQS624
+ and IQS625 angular position sensors.
+
+ To compile this driver as a module, choose M here: the module
+ will be called iqs624-pos.
+
+endmenu
diff --git a/drivers/iio/position/Makefile b/drivers/iio/position/Makefile
new file mode 100644
index 000000000000..3cbe7a734352
--- /dev/null
+++ b/drivers/iio/position/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for IIO linear and angular position sensors
+#
+
+# When adding new entries keep the list in alphabetical order
+
+obj-$(CONFIG_IQS624_POS) += iqs624-pos.o
diff --git a/drivers/iio/position/iqs624-pos.c b/drivers/iio/position/iqs624-pos.c
new file mode 100644
index 000000000000..77096c31c2ba
--- /dev/null
+++ b/drivers/iio/position/iqs624-pos.c
@@ -0,0 +1,284 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Azoteq IQS624/625 Angular Position Sensors
+ *
+ * Copyright (C) 2019 Jeff LaBundy <jeff@labundy.com>
+ */
+
+#include <linux/device.h>
+#include <linux/iio/events.h>
+#include <linux/iio/iio.h>
+#include <linux/kernel.h>
+#include <linux/mfd/iqs62x.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+
+#define IQS624_POS_DEG_OUT 0x16
+
+#define IQS624_POS_SCALE1 (314159 / 180)
+#define IQS624_POS_SCALE2 100000
+
+struct iqs624_pos_private {
+ struct iqs62x_core *iqs62x;
+ struct notifier_block notifier;
+ struct mutex lock;
+ bool angle_en;
+ u16 angle;
+};
+
+static int iqs624_pos_angle_en(struct iqs62x_core *iqs62x, bool angle_en)
+{
+ unsigned int event_mask = IQS624_HALL_UI_WHL_EVENT;
+
+ /*
+ * The IQS625 reports angular position in the form of coarse intervals,
+ * so only interval change events are unmasked. Conversely, the IQS624
+ * reports angular position down to one degree of resolution, so wheel
+ * movement events are unmasked instead.
+ */
+ if (iqs62x->dev_desc->prod_num == IQS625_PROD_NUM)
+ event_mask = IQS624_HALL_UI_INT_EVENT;
+
+ return regmap_update_bits(iqs62x->regmap, IQS624_HALL_UI, event_mask,
+ angle_en ? 0 : 0xFF);
+}
+
+static int iqs624_pos_notifier(struct notifier_block *notifier,
+ unsigned long event_flags, void *context)
+{
+ struct iqs62x_event_data *event_data = context;
+ struct iqs624_pos_private *iqs624_pos;
+ struct iqs62x_core *iqs62x;
+ struct iio_dev *indio_dev;
+ u16 angle = event_data->ui_data;
+ s64 timestamp;
+ int ret;
+
+ iqs624_pos = container_of(notifier, struct iqs624_pos_private,
+ notifier);
+ indio_dev = iio_priv_to_dev(iqs624_pos);
+ timestamp = iio_get_time_ns(indio_dev);
+
+ iqs62x = iqs624_pos->iqs62x;
+ if (iqs62x->dev_desc->prod_num == IQS625_PROD_NUM)
+ angle = event_data->interval;
+
+ mutex_lock(&iqs624_pos->lock);
+
+ if (event_flags & BIT(IQS62X_EVENT_SYS_RESET)) {
+ ret = iqs624_pos_angle_en(iqs62x, iqs624_pos->angle_en);
+ if (ret) {
+ dev_err(indio_dev->dev.parent,
+ "Failed to re-initialize device: %d\n", ret);
+ ret = NOTIFY_BAD;
+ } else {
+ ret = NOTIFY_OK;
+ }
+ } else if (iqs624_pos->angle_en && (angle != iqs624_pos->angle)) {
+ iio_push_event(indio_dev,
+ IIO_UNMOD_EVENT_CODE(IIO_ANGL, 0,
+ IIO_EV_TYPE_CHANGE,
+ IIO_EV_DIR_NONE),
+ timestamp);
+
+ iqs624_pos->angle = angle;
+ ret = NOTIFY_OK;
+ } else {
+ ret = NOTIFY_DONE;
+ }
+
+ mutex_unlock(&iqs624_pos->lock);
+
+ return ret;
+}
+
+static void iqs624_pos_notifier_unregister(void *context)
+{
+ struct iqs624_pos_private *iqs624_pos = context;
+ struct iio_dev *indio_dev = iio_priv_to_dev(iqs624_pos);
+ int ret;
+
+ ret = blocking_notifier_chain_unregister(&iqs624_pos->iqs62x->nh,
+ &iqs624_pos->notifier);
+ if (ret)
+ dev_err(indio_dev->dev.parent,
+ "Failed to unregister notifier: %d\n", ret);
+}
+
+static int iqs624_pos_angle_get(struct iqs62x_core *iqs62x, unsigned int *val)
+{
+ int ret;
+ __le16 val_buf;
+
+ if (iqs62x->dev_desc->prod_num == IQS625_PROD_NUM)
+ return regmap_read(iqs62x->regmap, iqs62x->dev_desc->interval,
+ val);
+
+ ret = regmap_raw_read(iqs62x->regmap, IQS624_POS_DEG_OUT, &val_buf,
+ sizeof(val_buf));
+ if (ret)
+ return ret;
+
+ *val = le16_to_cpu(val_buf);
+
+ return 0;
+}
+
+static int iqs624_pos_read_raw(struct iio_dev *indio_dev,
+ struct iio_chan_spec const *chan,
+ int *val, int *val2, long mask)
+{
+ struct iqs624_pos_private *iqs624_pos = iio_priv(indio_dev);
+ struct iqs62x_core *iqs62x = iqs624_pos->iqs62x;
+ unsigned int scale = 1;
+ int ret;
+
+ switch (mask) {
+ case IIO_CHAN_INFO_RAW:
+ ret = iqs624_pos_angle_get(iqs62x, val);
+ if (ret)
+ return ret;
+
+ return IIO_VAL_INT;
+
+ case IIO_CHAN_INFO_SCALE:
+ if (iqs62x->dev_desc->prod_num == IQS625_PROD_NUM) {
+ ret = regmap_read(iqs62x->regmap, IQS624_INTERVAL_DIV,
+ &scale);
+ if (ret)
+ return ret;
+ }
+
+ *val = scale * IQS624_POS_SCALE1;
+ *val2 = IQS624_POS_SCALE2;
+ return IIO_VAL_FRACTIONAL;
+
+ default:
+ return -EINVAL;
+ }
+}
+
+static int iqs624_pos_read_event_config(struct iio_dev *indio_dev,
+ const struct iio_chan_spec *chan,
+ enum iio_event_type type,
+ enum iio_event_direction dir)
+{
+ struct iqs624_pos_private *iqs624_pos = iio_priv(indio_dev);
+ int ret;
+
+ mutex_lock(&iqs624_pos->lock);
+ ret = iqs624_pos->angle_en;
+ mutex_unlock(&iqs624_pos->lock);
+
+ return ret;
+}
+
+static int iqs624_pos_write_event_config(struct iio_dev *indio_dev,
+ const struct iio_chan_spec *chan,
+ enum iio_event_type type,
+ enum iio_event_direction dir,
+ int state)
+{
+ struct iqs624_pos_private *iqs624_pos = iio_priv(indio_dev);
+ struct iqs62x_core *iqs62x = iqs624_pos->iqs62x;
+ unsigned int val;
+ int ret;
+
+ mutex_lock(&iqs624_pos->lock);
+
+ ret = iqs624_pos_angle_get(iqs62x, &val);
+ if (ret)
+ goto err_mutex;
+
+ ret = iqs624_pos_angle_en(iqs62x, state);
+ if (ret)
+ goto err_mutex;
+
+ iqs624_pos->angle = val;
+ iqs624_pos->angle_en = state;
+
+err_mutex:
+ mutex_unlock(&iqs624_pos->lock);
+
+ return ret;
+}
+
+static const struct iio_info iqs624_pos_info = {
+ .read_raw = &iqs624_pos_read_raw,
+ .read_event_config = iqs624_pos_read_event_config,
+ .write_event_config = iqs624_pos_write_event_config,
+};
+
+static const struct iio_event_spec iqs624_pos_events[] = {
+ {
+ .type = IIO_EV_TYPE_CHANGE,
+ .dir = IIO_EV_DIR_NONE,
+ .mask_separate = BIT(IIO_EV_INFO_ENABLE),
+ },
+};
+
+static const struct iio_chan_spec iqs624_pos_channels[] = {
+ {
+ .type = IIO_ANGL,
+ .info_mask_separate = BIT(IIO_CHAN_INFO_RAW) |
+ BIT(IIO_CHAN_INFO_SCALE),
+ .event_spec = iqs624_pos_events,
+ .num_event_specs = ARRAY_SIZE(iqs624_pos_events),
+ },
+};
+
+static int iqs624_pos_probe(struct platform_device *pdev)
+{
+ struct iqs62x_core *iqs62x = dev_get_drvdata(pdev->dev.parent);
+ struct iqs624_pos_private *iqs624_pos;
+ struct iio_dev *indio_dev;
+ int ret;
+
+ indio_dev = devm_iio_device_alloc(&pdev->dev, sizeof(*iqs624_pos));
+ if (!indio_dev)
+ return -ENOMEM;
+
+ iqs624_pos = iio_priv(indio_dev);
+ iqs624_pos->iqs62x = iqs62x;
+
+ indio_dev->modes = INDIO_DIRECT_MODE;
+ indio_dev->dev.parent = &pdev->dev;
+ indio_dev->channels = iqs624_pos_channels;
+ indio_dev->num_channels = ARRAY_SIZE(iqs624_pos_channels);
+ indio_dev->name = iqs62x->dev_desc->dev_name;
+ indio_dev->info = &iqs624_pos_info;
+
+ mutex_init(&iqs624_pos->lock);
+
+ iqs624_pos->notifier.notifier_call = iqs624_pos_notifier;
+ ret = blocking_notifier_chain_register(&iqs624_pos->iqs62x->nh,
+ &iqs624_pos->notifier);
+ if (ret) {
+ dev_err(&pdev->dev, "Failed to register notifier: %d\n", ret);
+ return ret;
+ }
+
+ ret = devm_add_action_or_reset(&pdev->dev,
+ iqs624_pos_notifier_unregister,
+ iqs624_pos);
+ if (ret)
+ return ret;
+
+ return devm_iio_device_register(&pdev->dev, indio_dev);
+}
+
+static struct platform_driver iqs624_pos_platform_driver = {
+ .driver = {
+ .name = "iqs624-pos",
+ },
+ .probe = iqs624_pos_probe,
+};
+module_platform_driver(iqs624_pos_platform_driver);
+
+MODULE_AUTHOR("Jeff LaBundy <jeff@labundy.com>");
+MODULE_DESCRIPTION("Azoteq IQS624/625 Angular Position Sensors");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:iqs624-pos");
diff --git a/drivers/iio/pressure/cros_ec_baro.c b/drivers/iio/pressure/cros_ec_baro.c
index b521bebd551c..c079b8960082 100644
--- a/drivers/iio/pressure/cros_ec_baro.c
+++ b/drivers/iio/pressure/cros_ec_baro.c
@@ -134,10 +134,14 @@ static int cros_ec_baro_probe(struct platform_device *pdev)
if (!indio_dev)
return -ENOMEM;
- ret = cros_ec_sensors_core_init(pdev, indio_dev, true);
+ ret = cros_ec_sensors_core_init(pdev, indio_dev, true,
+ cros_ec_sensors_capture,
+ cros_ec_sensors_push_data);
if (ret)
return ret;
+ iio_buffer_set_attrs(indio_dev->buffer, cros_ec_sensor_fifo_attributes);
+
indio_dev->info = &cros_ec_baro_info;
state = iio_priv(indio_dev);
state->core.type = state->core.resp->info.type;
@@ -147,8 +151,7 @@ static int cros_ec_baro_probe(struct platform_device *pdev)
channel->info_mask_separate = BIT(IIO_CHAN_INFO_RAW);
channel->info_mask_shared_by_all =
BIT(IIO_CHAN_INFO_SCALE) |
- BIT(IIO_CHAN_INFO_SAMP_FREQ) |
- BIT(IIO_CHAN_INFO_FREQUENCY);
+ BIT(IIO_CHAN_INFO_SAMP_FREQ);
channel->info_mask_shared_by_all_available =
BIT(IIO_CHAN_INFO_SAMP_FREQ);
channel->scan_type.realbits = CROS_EC_SENSOR_BITS;
@@ -182,11 +185,6 @@ static int cros_ec_baro_probe(struct platform_device *pdev)
state->core.read_ec_sensors_data = cros_ec_sensors_read_cmd;
- ret = devm_iio_triggered_buffer_setup(dev, indio_dev, NULL,
- cros_ec_sensors_capture, NULL);
- if (ret)
- return ret;
-
return devm_iio_device_register(dev, indio_dev);
}
diff --git a/drivers/iio/temperature/Kconfig b/drivers/iio/temperature/Kconfig
index e1ccb4003015..f1f2a1499c9e 100644
--- a/drivers/iio/temperature/Kconfig
+++ b/drivers/iio/temperature/Kconfig
@@ -4,6 +4,16 @@
#
menu "Temperature sensors"
+config IQS620AT_TEMP
+ tristate "Azoteq IQS620AT temperature sensor"
+ depends on MFD_IQS62X || COMPILE_TEST
+ help
+ Say Y here if you want to build support for the Azoteq IQS620AT
+ temperature sensor.
+
+ To compile this driver as a module, choose M here: the module
+ will be called iqs620at-temp.
+
config LTC2983
tristate "Analog Devices Multi-Sensor Digital Temperature Measurement System"
depends on SPI
diff --git a/drivers/iio/temperature/Makefile b/drivers/iio/temperature/Makefile
index d6b850b0cf63..90c113115422 100644
--- a/drivers/iio/temperature/Makefile
+++ b/drivers/iio/temperature/Makefile
@@ -3,6 +3,7 @@
# Makefile for industrial I/O temperature drivers
#
+obj-$(CONFIG_IQS620AT_TEMP) += iqs620at-temp.o
obj-$(CONFIG_LTC2983) += ltc2983.o
obj-$(CONFIG_HID_SENSOR_TEMP) += hid-sensor-temperature.o
obj-$(CONFIG_MAXIM_THERMOCOUPLE) += maxim_thermocouple.o
diff --git a/drivers/iio/temperature/iqs620at-temp.c b/drivers/iio/temperature/iqs620at-temp.c
new file mode 100644
index 000000000000..3fd52b3eb030
--- /dev/null
+++ b/drivers/iio/temperature/iqs620at-temp.c
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Azoteq IQS620AT Temperature Sensor
+ *
+ * Copyright (C) 2019 Jeff LaBundy <jeff@labundy.com>
+ */
+
+#include <linux/device.h>
+#include <linux/iio/iio.h>
+#include <linux/kernel.h>
+#include <linux/mfd/iqs62x.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+
+#define IQS620_TEMP_UI_OUT 0x1A
+
+#define IQS620_TEMP_SCALE 1000
+#define IQS620_TEMP_OFFSET (-100)
+
+static int iqs620_temp_read_raw(struct iio_dev *indio_dev,
+ struct iio_chan_spec const *chan,
+ int *val, int *val2, long mask)
+{
+ struct iqs62x_core *iqs62x = iio_device_get_drvdata(indio_dev);
+ int ret;
+ __le16 val_buf;
+
+ switch (mask) {
+ case IIO_CHAN_INFO_RAW:
+ ret = regmap_raw_read(iqs62x->regmap, IQS620_TEMP_UI_OUT,
+ &val_buf, sizeof(val_buf));
+ if (ret)
+ return ret;
+
+ *val = le16_to_cpu(val_buf);
+ return IIO_VAL_INT;
+
+ case IIO_CHAN_INFO_SCALE:
+ *val = IQS620_TEMP_SCALE;
+ return IIO_VAL_INT;
+
+ case IIO_CHAN_INFO_OFFSET:
+ *val = IQS620_TEMP_OFFSET;
+ return IIO_VAL_INT;
+
+ default:
+ return -EINVAL;
+ }
+}
+
+static const struct iio_info iqs620_temp_info = {
+ .read_raw = &iqs620_temp_read_raw,
+};
+
+static const struct iio_chan_spec iqs620_temp_channels[] = {
+ {
+ .type = IIO_TEMP,
+ .info_mask_separate = BIT(IIO_CHAN_INFO_RAW) |
+ BIT(IIO_CHAN_INFO_SCALE) |
+ BIT(IIO_CHAN_INFO_OFFSET),
+ },
+};
+
+static int iqs620_temp_probe(struct platform_device *pdev)
+{
+ struct iqs62x_core *iqs62x = dev_get_drvdata(pdev->dev.parent);
+ struct iio_dev *indio_dev;
+
+ indio_dev = devm_iio_device_alloc(&pdev->dev, 0);
+ if (!indio_dev)
+ return -ENOMEM;
+
+ iio_device_set_drvdata(indio_dev, iqs62x);
+
+ indio_dev->modes = INDIO_DIRECT_MODE;
+ indio_dev->dev.parent = &pdev->dev;
+ indio_dev->channels = iqs620_temp_channels;
+ indio_dev->num_channels = ARRAY_SIZE(iqs620_temp_channels);
+ indio_dev->name = iqs62x->dev_desc->dev_name;
+ indio_dev->info = &iqs620_temp_info;
+
+ return devm_iio_device_register(&pdev->dev, indio_dev);
+}
+
+static struct platform_driver iqs620_temp_platform_driver = {
+ .driver = {
+ .name = "iqs620at-temp",
+ },
+ .probe = iqs620_temp_probe,
+};
+module_platform_driver(iqs620_temp_platform_driver);
+
+MODULE_AUTHOR("Jeff LaBundy <jeff@labundy.com>");
+MODULE_DESCRIPTION("Azoteq IQS620AT Temperature Sensor");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:iqs620at-temp");
diff --git a/drivers/input/keyboard/Kconfig b/drivers/input/keyboard/Kconfig
index 4706ff09f0e8..28de965a08d5 100644
--- a/drivers/input/keyboard/Kconfig
+++ b/drivers/input/keyboard/Kconfig
@@ -663,6 +663,16 @@ config KEYBOARD_IPAQ_MICRO
To compile this driver as a module, choose M here: the
module will be called ipaq-micro-keys.
+config KEYBOARD_IQS62X
+ tristate "Azoteq IQS620A/621/622/624/625 keys and switches"
+ depends on MFD_IQS62X
+ help
+ Say Y here to enable key and switch support for the Azoteq IQS620A,
+ IQS621, IQS622, IQS624 and IQS625 multi-function sensors.
+
+ To compile this driver as a module, choose M here: the module will
+ be called iqs62x-keys.
+
config KEYBOARD_OMAP
tristate "TI OMAP keypad support"
depends on ARCH_OMAP1
diff --git a/drivers/input/keyboard/Makefile b/drivers/input/keyboard/Makefile
index f5b17524adf2..1d689fdd5c00 100644
--- a/drivers/input/keyboard/Makefile
+++ b/drivers/input/keyboard/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_KEYBOARD_TCA8418) += tca8418_keypad.o
obj-$(CONFIG_KEYBOARD_HIL) += hil_kbd.o
obj-$(CONFIG_KEYBOARD_HIL_OLD) += hilkbd.o
obj-$(CONFIG_KEYBOARD_IPAQ_MICRO) += ipaq-micro-keys.o
+obj-$(CONFIG_KEYBOARD_IQS62X) += iqs62x-keys.o
obj-$(CONFIG_KEYBOARD_IMX) += imx_keypad.o
obj-$(CONFIG_KEYBOARD_IMX_SC_KEY) += imx_sc_key.o
obj-$(CONFIG_KEYBOARD_HP6XX) += jornada680_kbd.o
diff --git a/drivers/input/keyboard/iqs62x-keys.c b/drivers/input/keyboard/iqs62x-keys.c
new file mode 100644
index 000000000000..93446b21f98f
--- /dev/null
+++ b/drivers/input/keyboard/iqs62x-keys.c
@@ -0,0 +1,335 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Azoteq IQS620A/621/622/624/625 Keys and Switches
+ *
+ * Copyright (C) 2019 Jeff LaBundy <jeff@labundy.com>
+ */
+
+#include <linux/device.h>
+#include <linux/input.h>
+#include <linux/kernel.h>
+#include <linux/mfd/iqs62x.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/platform_device.h>
+#include <linux/property.h>
+#include <linux/regmap.h>
+#include <linux/slab.h>
+
+enum {
+ IQS62X_SW_HALL_N,
+ IQS62X_SW_HALL_S,
+};
+
+static const char * const iqs62x_switch_names[] = {
+ [IQS62X_SW_HALL_N] = "hall-switch-north",
+ [IQS62X_SW_HALL_S] = "hall-switch-south",
+};
+
+struct iqs62x_switch_desc {
+ enum iqs62x_event_flag flag;
+ unsigned int code;
+ bool enabled;
+};
+
+struct iqs62x_keys_private {
+ struct iqs62x_core *iqs62x;
+ struct input_dev *input;
+ struct notifier_block notifier;
+ struct iqs62x_switch_desc switches[ARRAY_SIZE(iqs62x_switch_names)];
+ unsigned int keycode[IQS62X_NUM_KEYS];
+ unsigned int keycodemax;
+ u8 interval;
+};
+
+static int iqs62x_keys_parse_prop(struct platform_device *pdev,
+ struct iqs62x_keys_private *iqs62x_keys)
+{
+ struct fwnode_handle *child;
+ unsigned int val;
+ int ret, i;
+
+ ret = device_property_count_u32(&pdev->dev, "linux,keycodes");
+ if (ret > IQS62X_NUM_KEYS) {
+ dev_err(&pdev->dev, "Too many keycodes present\n");
+ return -EINVAL;
+ } else if (ret < 0) {
+ dev_err(&pdev->dev, "Failed to count keycodes: %d\n", ret);
+ return ret;
+ }
+ iqs62x_keys->keycodemax = ret;
+
+ ret = device_property_read_u32_array(&pdev->dev, "linux,keycodes",
+ iqs62x_keys->keycode,
+ iqs62x_keys->keycodemax);
+ if (ret) {
+ dev_err(&pdev->dev, "Failed to read keycodes: %d\n", ret);
+ return ret;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(iqs62x_keys->switches); i++) {
+ child = device_get_named_child_node(&pdev->dev,
+ iqs62x_switch_names[i]);
+ if (!child)
+ continue;
+
+ ret = fwnode_property_read_u32(child, "linux,code", &val);
+ if (ret) {
+ dev_err(&pdev->dev, "Failed to read switch code: %d\n",
+ ret);
+ return ret;
+ }
+ iqs62x_keys->switches[i].code = val;
+ iqs62x_keys->switches[i].enabled = true;
+
+ if (fwnode_property_present(child, "azoteq,use-prox"))
+ iqs62x_keys->switches[i].flag = (i == IQS62X_SW_HALL_N ?
+ IQS62X_EVENT_HALL_N_P :
+ IQS62X_EVENT_HALL_S_P);
+ else
+ iqs62x_keys->switches[i].flag = (i == IQS62X_SW_HALL_N ?
+ IQS62X_EVENT_HALL_N_T :
+ IQS62X_EVENT_HALL_S_T);
+ }
+
+ return 0;
+}
+
+static int iqs62x_keys_init(struct iqs62x_keys_private *iqs62x_keys)
+{
+ struct iqs62x_core *iqs62x = iqs62x_keys->iqs62x;
+ enum iqs62x_event_flag flag;
+ unsigned int event_reg, val;
+ unsigned int event_mask = 0;
+ int ret, i;
+
+ switch (iqs62x->dev_desc->prod_num) {
+ case IQS620_PROD_NUM:
+ case IQS621_PROD_NUM:
+ case IQS622_PROD_NUM:
+ event_reg = IQS620_GLBL_EVENT_MASK;
+
+ /*
+ * Discreet button, hysteresis and SAR UI flags represent keys
+ * and are unmasked if mapped to a valid keycode.
+ */
+ for (i = 0; i < iqs62x_keys->keycodemax; i++) {
+ if (iqs62x_keys->keycode[i] == KEY_RESERVED)
+ continue;
+
+ if (iqs62x_events[i].reg == IQS62X_EVENT_PROX)
+ event_mask |= iqs62x->dev_desc->prox_mask;
+ else if (iqs62x_events[i].reg == IQS62X_EVENT_HYST)
+ event_mask |= (iqs62x->dev_desc->hyst_mask |
+ iqs62x->dev_desc->sar_mask);
+ }
+
+ ret = regmap_read(iqs62x->regmap, iqs62x->dev_desc->hall_flags,
+ &val);
+ if (ret)
+ return ret;
+
+ /*
+ * Hall UI flags represent switches and are unmasked if their
+ * corresponding child nodes are present.
+ */
+ for (i = 0; i < ARRAY_SIZE(iqs62x_keys->switches); i++) {
+ if (!(iqs62x_keys->switches[i].enabled))
+ continue;
+
+ flag = iqs62x_keys->switches[i].flag;
+
+ if (iqs62x_events[flag].reg != IQS62X_EVENT_HALL)
+ continue;
+
+ event_mask |= iqs62x->dev_desc->hall_mask;
+
+ input_report_switch(iqs62x_keys->input,
+ iqs62x_keys->switches[i].code,
+ (val & iqs62x_events[flag].mask) ==
+ iqs62x_events[flag].val);
+ }
+
+ input_sync(iqs62x_keys->input);
+ break;
+
+ case IQS624_PROD_NUM:
+ event_reg = IQS624_HALL_UI;
+
+ /*
+ * Interval change events represent keys and are unmasked if
+ * either wheel movement flag is mapped to a valid keycode.
+ */
+ if (iqs62x_keys->keycode[IQS62X_EVENT_WHEEL_UP] != KEY_RESERVED)
+ event_mask |= IQS624_HALL_UI_INT_EVENT;
+
+ if (iqs62x_keys->keycode[IQS62X_EVENT_WHEEL_DN] != KEY_RESERVED)
+ event_mask |= IQS624_HALL_UI_INT_EVENT;
+
+ ret = regmap_read(iqs62x->regmap, iqs62x->dev_desc->interval,
+ &val);
+ if (ret)
+ return ret;
+
+ iqs62x_keys->interval = val;
+ break;
+
+ default:
+ return 0;
+ }
+
+ return regmap_update_bits(iqs62x->regmap, event_reg, event_mask, 0);
+}
+
+static int iqs62x_keys_notifier(struct notifier_block *notifier,
+ unsigned long event_flags, void *context)
+{
+ struct iqs62x_event_data *event_data = context;
+ struct iqs62x_keys_private *iqs62x_keys;
+ int ret, i;
+
+ iqs62x_keys = container_of(notifier, struct iqs62x_keys_private,
+ notifier);
+
+ if (event_flags & BIT(IQS62X_EVENT_SYS_RESET)) {
+ ret = iqs62x_keys_init(iqs62x_keys);
+ if (ret) {
+ dev_err(iqs62x_keys->input->dev.parent,
+ "Failed to re-initialize device: %d\n", ret);
+ return NOTIFY_BAD;
+ }
+
+ return NOTIFY_OK;
+ }
+
+ for (i = 0; i < iqs62x_keys->keycodemax; i++) {
+ if (iqs62x_events[i].reg == IQS62X_EVENT_WHEEL &&
+ event_data->interval == iqs62x_keys->interval)
+ continue;
+
+ input_report_key(iqs62x_keys->input, iqs62x_keys->keycode[i],
+ event_flags & BIT(i));
+ }
+
+ for (i = 0; i < ARRAY_SIZE(iqs62x_keys->switches); i++)
+ if (iqs62x_keys->switches[i].enabled)
+ input_report_switch(iqs62x_keys->input,
+ iqs62x_keys->switches[i].code,
+ event_flags &
+ BIT(iqs62x_keys->switches[i].flag));
+
+ input_sync(iqs62x_keys->input);
+
+ if (event_data->interval == iqs62x_keys->interval)
+ return NOTIFY_OK;
+
+ /*
+ * Each frame contains at most one wheel event (up or down), in which
+ * case a complementary release cycle is emulated.
+ */
+ if (event_flags & BIT(IQS62X_EVENT_WHEEL_UP)) {
+ input_report_key(iqs62x_keys->input,
+ iqs62x_keys->keycode[IQS62X_EVENT_WHEEL_UP],
+ 0);
+ input_sync(iqs62x_keys->input);
+ } else if (event_flags & BIT(IQS62X_EVENT_WHEEL_DN)) {
+ input_report_key(iqs62x_keys->input,
+ iqs62x_keys->keycode[IQS62X_EVENT_WHEEL_DN],
+ 0);
+ input_sync(iqs62x_keys->input);
+ }
+
+ iqs62x_keys->interval = event_data->interval;
+
+ return NOTIFY_OK;
+}
+
+static int iqs62x_keys_probe(struct platform_device *pdev)
+{
+ struct iqs62x_core *iqs62x = dev_get_drvdata(pdev->dev.parent);
+ struct iqs62x_keys_private *iqs62x_keys;
+ struct input_dev *input;
+ int ret, i;
+
+ iqs62x_keys = devm_kzalloc(&pdev->dev, sizeof(*iqs62x_keys),
+ GFP_KERNEL);
+ if (!iqs62x_keys)
+ return -ENOMEM;
+
+ platform_set_drvdata(pdev, iqs62x_keys);
+
+ ret = iqs62x_keys_parse_prop(pdev, iqs62x_keys);
+ if (ret)
+ return ret;
+
+ input = devm_input_allocate_device(&pdev->dev);
+ if (!input)
+ return -ENOMEM;
+
+ input->keycodemax = iqs62x_keys->keycodemax;
+ input->keycode = iqs62x_keys->keycode;
+ input->keycodesize = sizeof(*iqs62x_keys->keycode);
+
+ input->name = iqs62x->dev_desc->dev_name;
+ input->id.bustype = BUS_I2C;
+
+ for (i = 0; i < iqs62x_keys->keycodemax; i++)
+ if (iqs62x_keys->keycode[i] != KEY_RESERVED)
+ input_set_capability(input, EV_KEY,
+ iqs62x_keys->keycode[i]);
+
+ for (i = 0; i < ARRAY_SIZE(iqs62x_keys->switches); i++)
+ if (iqs62x_keys->switches[i].enabled)
+ input_set_capability(input, EV_SW,
+ iqs62x_keys->switches[i].code);
+
+ iqs62x_keys->iqs62x = iqs62x;
+ iqs62x_keys->input = input;
+
+ ret = iqs62x_keys_init(iqs62x_keys);
+ if (ret) {
+ dev_err(&pdev->dev, "Failed to initialize device: %d\n", ret);
+ return ret;
+ }
+
+ ret = input_register_device(iqs62x_keys->input);
+ if (ret) {
+ dev_err(&pdev->dev, "Failed to register device: %d\n", ret);
+ return ret;
+ }
+
+ iqs62x_keys->notifier.notifier_call = iqs62x_keys_notifier;
+ ret = blocking_notifier_chain_register(&iqs62x_keys->iqs62x->nh,
+ &iqs62x_keys->notifier);
+ if (ret)
+ dev_err(&pdev->dev, "Failed to register notifier: %d\n", ret);
+
+ return ret;
+}
+
+static int iqs62x_keys_remove(struct platform_device *pdev)
+{
+ struct iqs62x_keys_private *iqs62x_keys = platform_get_drvdata(pdev);
+ int ret;
+
+ ret = blocking_notifier_chain_unregister(&iqs62x_keys->iqs62x->nh,
+ &iqs62x_keys->notifier);
+ if (ret)
+ dev_err(&pdev->dev, "Failed to unregister notifier: %d\n", ret);
+
+ return ret;
+}
+
+static struct platform_driver iqs62x_keys_platform_driver = {
+ .driver = {
+ .name = "iqs62x-keys",
+ },
+ .probe = iqs62x_keys_probe,
+ .remove = iqs62x_keys_remove,
+};
+module_platform_driver(iqs62x_keys_platform_driver);
+
+MODULE_AUTHOR("Jeff LaBundy <jeff@labundy.com>");
+MODULE_DESCRIPTION("Azoteq IQS620A/621/622/624/625 Keys and Switches");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:iqs62x-keys");
diff --git a/drivers/input/serio/i8042-x86ia64io.h b/drivers/input/serio/i8042-x86ia64io.h
index dc974c288e88..08e919dbeb5d 100644
--- a/drivers/input/serio/i8042-x86ia64io.h
+++ b/drivers/input/serio/i8042-x86ia64io.h
@@ -530,6 +530,17 @@ static const struct dmi_system_id __initconst i8042_dmi_nomux_table[] = {
DMI_MATCH(DMI_PRODUCT_VERSION, "Lenovo LaVie Z"),
},
},
+ {
+ /*
+ * Acer Aspire 5738z
+ * Touchpad stops working in mux mode when dis- + re-enabled
+ * with the touchpad enable/disable toggle hotkey
+ */
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Aspire 5738"),
+ },
+ },
{ }
};
diff --git a/drivers/input/touchscreen/elants_i2c.c b/drivers/input/touchscreen/elants_i2c.c
index 491179967b29..14c577c16b16 100644
--- a/drivers/input/touchscreen/elants_i2c.c
+++ b/drivers/input/touchscreen/elants_i2c.c
@@ -1309,6 +1309,7 @@ static int elants_i2c_probe(struct i2c_client *client,
input_set_abs_params(ts->input, ABS_MT_PRESSURE, 0, 255, 0, 0);
input_abs_set_res(ts->input, ABS_MT_POSITION_X, ts->x_res);
input_abs_set_res(ts->input, ABS_MT_POSITION_Y, ts->y_res);
+ input_abs_set_res(ts->input, ABS_MT_TOUCH_MAJOR, 1);
error = input_register_device(ts->input);
if (error) {
diff --git a/drivers/input/touchscreen/goodix.c b/drivers/input/touchscreen/goodix.c
index 0403102e807e..02c75ea385e0 100644
--- a/drivers/input/touchscreen/goodix.c
+++ b/drivers/input/touchscreen/goodix.c
@@ -29,33 +29,6 @@
#include <linux/of.h>
#include <asm/unaligned.h>
-struct goodix_ts_data;
-
-struct goodix_chip_data {
- u16 config_addr;
- int config_len;
- int (*check_config)(struct goodix_ts_data *, const struct firmware *);
-};
-
-struct goodix_ts_data {
- struct i2c_client *client;
- struct input_dev *input_dev;
- const struct goodix_chip_data *chip;
- struct touchscreen_properties prop;
- unsigned int max_touch_num;
- unsigned int int_trigger_type;
- struct regulator *avdd28;
- struct regulator *vddio;
- struct gpio_desc *gpiod_int;
- struct gpio_desc *gpiod_rst;
- u16 id;
- u16 version;
- const char *cfg_name;
- struct completion firmware_loading_complete;
- unsigned long irq_flags;
- unsigned int contact_size;
-};
-
#define GOODIX_GPIO_INT_NAME "irq"
#define GOODIX_GPIO_RST_NAME "reset"
@@ -65,10 +38,13 @@ struct goodix_ts_data {
#define GOODIX_CONTACT_SIZE 8
#define GOODIX_MAX_CONTACT_SIZE 9
#define GOODIX_MAX_CONTACTS 10
+#define GOODIX_MAX_KEYS 7
-#define GOODIX_CONFIG_MAX_LENGTH 240
+#define GOODIX_CONFIG_MIN_LENGTH 186
#define GOODIX_CONFIG_911_LENGTH 186
#define GOODIX_CONFIG_967_LENGTH 228
+#define GOODIX_CONFIG_GT9X_LENGTH 240
+#define GOODIX_CONFIG_MAX_LENGTH 240
/* Register defines */
#define GOODIX_REG_COMMAND 0x8040
@@ -80,39 +56,118 @@ struct goodix_ts_data {
#define GOODIX_REG_ID 0x8140
#define GOODIX_BUFFER_STATUS_READY BIT(7)
+#define GOODIX_HAVE_KEY BIT(4)
#define GOODIX_BUFFER_STATUS_TIMEOUT 20
#define RESOLUTION_LOC 1
#define MAX_CONTACTS_LOC 5
#define TRIGGER_LOC 6
+/* Our special handling for GPIO accesses through ACPI is x86 specific */
+#if defined CONFIG_X86 && defined CONFIG_ACPI
+#define ACPI_GPIO_SUPPORT
+#endif
+
+struct goodix_ts_data;
+
+enum goodix_irq_pin_access_method {
+ IRQ_PIN_ACCESS_NONE,
+ IRQ_PIN_ACCESS_GPIO,
+ IRQ_PIN_ACCESS_ACPI_GPIO,
+ IRQ_PIN_ACCESS_ACPI_METHOD,
+};
+
+struct goodix_chip_data {
+ u16 config_addr;
+ int config_len;
+ int (*check_config)(struct goodix_ts_data *ts, const u8 *cfg, int len);
+ void (*calc_config_checksum)(struct goodix_ts_data *ts);
+};
+
+struct goodix_chip_id {
+ const char *id;
+ const struct goodix_chip_data *data;
+};
+
+#define GOODIX_ID_MAX_LEN 4
+
+struct goodix_ts_data {
+ struct i2c_client *client;
+ struct input_dev *input_dev;
+ const struct goodix_chip_data *chip;
+ struct touchscreen_properties prop;
+ unsigned int max_touch_num;
+ unsigned int int_trigger_type;
+ struct regulator *avdd28;
+ struct regulator *vddio;
+ struct gpio_desc *gpiod_int;
+ struct gpio_desc *gpiod_rst;
+ int gpio_count;
+ int gpio_int_idx;
+ char id[GOODIX_ID_MAX_LEN + 1];
+ u16 version;
+ const char *cfg_name;
+ bool reset_controller_at_probe;
+ bool load_cfg_from_disk;
+ struct completion firmware_loading_complete;
+ unsigned long irq_flags;
+ enum goodix_irq_pin_access_method irq_pin_access_method;
+ unsigned int contact_size;
+ u8 config[GOODIX_CONFIG_MAX_LENGTH];
+ unsigned short keymap[GOODIX_MAX_KEYS];
+};
+
static int goodix_check_cfg_8(struct goodix_ts_data *ts,
- const struct firmware *cfg);
+ const u8 *cfg, int len);
static int goodix_check_cfg_16(struct goodix_ts_data *ts,
- const struct firmware *cfg);
+ const u8 *cfg, int len);
+static void goodix_calc_cfg_checksum_8(struct goodix_ts_data *ts);
+static void goodix_calc_cfg_checksum_16(struct goodix_ts_data *ts);
static const struct goodix_chip_data gt1x_chip_data = {
.config_addr = GOODIX_GT1X_REG_CONFIG_DATA,
- .config_len = GOODIX_CONFIG_MAX_LENGTH,
+ .config_len = GOODIX_CONFIG_GT9X_LENGTH,
.check_config = goodix_check_cfg_16,
+ .calc_config_checksum = goodix_calc_cfg_checksum_16,
};
static const struct goodix_chip_data gt911_chip_data = {
.config_addr = GOODIX_GT9X_REG_CONFIG_DATA,
.config_len = GOODIX_CONFIG_911_LENGTH,
.check_config = goodix_check_cfg_8,
+ .calc_config_checksum = goodix_calc_cfg_checksum_8,
};
static const struct goodix_chip_data gt967_chip_data = {
.config_addr = GOODIX_GT9X_REG_CONFIG_DATA,
.config_len = GOODIX_CONFIG_967_LENGTH,
.check_config = goodix_check_cfg_8,
+ .calc_config_checksum = goodix_calc_cfg_checksum_8,
};
static const struct goodix_chip_data gt9x_chip_data = {
.config_addr = GOODIX_GT9X_REG_CONFIG_DATA,
- .config_len = GOODIX_CONFIG_MAX_LENGTH,
+ .config_len = GOODIX_CONFIG_GT9X_LENGTH,
.check_config = goodix_check_cfg_8,
+ .calc_config_checksum = goodix_calc_cfg_checksum_8,
+};
+
+static const struct goodix_chip_id goodix_chip_ids[] = {
+ { .id = "1151", .data = &gt1x_chip_data },
+ { .id = "5663", .data = &gt1x_chip_data },
+ { .id = "5688", .data = &gt1x_chip_data },
+ { .id = "917S", .data = &gt1x_chip_data },
+
+ { .id = "911", .data = &gt911_chip_data },
+ { .id = "9271", .data = &gt911_chip_data },
+ { .id = "9110", .data = &gt911_chip_data },
+ { .id = "927", .data = &gt911_chip_data },
+ { .id = "928", .data = &gt911_chip_data },
+
+ { .id = "912", .data = &gt967_chip_data },
+ { .id = "9147", .data = &gt967_chip_data },
+ { .id = "967", .data = &gt967_chip_data },
+ { }
};
static const unsigned long goodix_irq_flags[] = {
@@ -168,6 +223,22 @@ static const struct dmi_system_id nine_bytes_report[] = {
{}
};
+/*
+ * Those tablets have their x coordinate inverted
+ */
+static const struct dmi_system_id inverted_x_screen[] = {
+#if defined(CONFIG_DMI) && defined(CONFIG_X86)
+ {
+ .ident = "Cube I15-TC",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Cube"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "I15-TC")
+ },
+ },
+#endif
+ {}
+};
+
/**
* goodix_i2c_read - read data from a register of the i2c slave device.
*
@@ -235,28 +306,16 @@ static int goodix_i2c_write_u8(struct i2c_client *client, u16 reg, u8 value)
return goodix_i2c_write(client, reg, &value, sizeof(value));
}
-static const struct goodix_chip_data *goodix_get_chip_data(u16 id)
+static const struct goodix_chip_data *goodix_get_chip_data(const char *id)
{
- switch (id) {
- case 1151:
- case 5663:
- case 5688:
- return &gt1x_chip_data;
-
- case 911:
- case 9271:
- case 9110:
- case 927:
- case 928:
- return &gt911_chip_data;
-
- case 912:
- case 967:
- return &gt967_chip_data;
+ unsigned int i;
- default:
- return &gt9x_chip_data;
+ for (i = 0; goodix_chip_ids[i].id; i++) {
+ if (!strcmp(goodix_chip_ids[i].id, id))
+ return goodix_chip_ids[i].data;
}
+
+ return &gt9x_chip_data;
}
static int goodix_ts_read_input_report(struct goodix_ts_data *ts, u8 *data)
@@ -264,6 +323,13 @@ static int goodix_ts_read_input_report(struct goodix_ts_data *ts, u8 *data)
unsigned long max_timeout;
int touch_num;
int error;
+ u16 addr = GOODIX_READ_COOR_ADDR;
+ /*
+ * We are going to read 1-byte header,
+ * ts->contact_size * max(1, touch_num) bytes of coordinates
+ * and 1-byte footer which contains the touch-key code.
+ */
+ const int header_contact_keycode_size = 1 + ts->contact_size + 1;
/*
* The 'buffer status' bit, which indicates that the data is valid, is
@@ -272,8 +338,8 @@ static int goodix_ts_read_input_report(struct goodix_ts_data *ts, u8 *data)
*/
max_timeout = jiffies + msecs_to_jiffies(GOODIX_BUFFER_STATUS_TIMEOUT);
do {
- error = goodix_i2c_read(ts->client, GOODIX_READ_COOR_ADDR,
- data, ts->contact_size + 1);
+ error = goodix_i2c_read(ts->client, addr, data,
+ header_contact_keycode_size);
if (error) {
dev_err(&ts->client->dev, "I2C transfer error: %d\n",
error);
@@ -286,11 +352,10 @@ static int goodix_ts_read_input_report(struct goodix_ts_data *ts, u8 *data)
return -EPROTO;
if (touch_num > 1) {
- data += 1 + ts->contact_size;
+ addr += header_contact_keycode_size;
+ data += header_contact_keycode_size;
error = goodix_i2c_read(ts->client,
- GOODIX_READ_COOR_ADDR +
- 1 + ts->contact_size,
- data,
+ addr, data,
ts->contact_size *
(touch_num - 1));
if (error)
@@ -307,7 +372,7 @@ static int goodix_ts_read_input_report(struct goodix_ts_data *ts, u8 *data)
* The Goodix panel will send spurious interrupts after a
* 'finger up' event, which will always cause a timeout.
*/
- return 0;
+ return -ENOMSG;
}
static void goodix_ts_report_touch_8b(struct goodix_ts_data *ts, u8 *coor_data)
@@ -340,6 +405,25 @@ static void goodix_ts_report_touch_9b(struct goodix_ts_data *ts, u8 *coor_data)
input_report_abs(ts->input_dev, ABS_MT_WIDTH_MAJOR, input_w);
}
+static void goodix_ts_report_key(struct goodix_ts_data *ts, u8 *data)
+{
+ int touch_num;
+ u8 key_value;
+ int i;
+
+ if (data[0] & GOODIX_HAVE_KEY) {
+ touch_num = data[0] & 0x0f;
+ key_value = data[1 + ts->contact_size * touch_num];
+ for (i = 0; i < GOODIX_MAX_KEYS; i++)
+ if (key_value & BIT(i))
+ input_report_key(ts->input_dev,
+ ts->keymap[i], 1);
+ } else {
+ for (i = 0; i < GOODIX_MAX_KEYS; i++)
+ input_report_key(ts->input_dev, ts->keymap[i], 0);
+ }
+}
+
/**
* goodix_process_events - Process incoming events
*
@@ -350,7 +434,7 @@ static void goodix_ts_report_touch_9b(struct goodix_ts_data *ts, u8 *coor_data)
*/
static void goodix_process_events(struct goodix_ts_data *ts)
{
- u8 point_data[1 + GOODIX_MAX_CONTACT_SIZE * GOODIX_MAX_CONTACTS];
+ u8 point_data[2 + GOODIX_MAX_CONTACT_SIZE * GOODIX_MAX_CONTACTS];
int touch_num;
int i;
@@ -358,11 +442,7 @@ static void goodix_process_events(struct goodix_ts_data *ts)
if (touch_num < 0)
return;
- /*
- * Bit 4 of the first byte reports the status of the capacitive
- * Windows/Home button.
- */
- input_report_key(ts->input_dev, KEY_LEFTMETA, point_data[0] & BIT(4));
+ goodix_ts_report_key(ts, point_data);
for (i = 0; i < touch_num; i++)
if (ts->contact_size == 9)
@@ -406,22 +486,21 @@ static int goodix_request_irq(struct goodix_ts_data *ts)
ts->irq_flags, ts->client->name, ts);
}
-static int goodix_check_cfg_8(struct goodix_ts_data *ts,
- const struct firmware *cfg)
+static int goodix_check_cfg_8(struct goodix_ts_data *ts, const u8 *cfg, int len)
{
- int i, raw_cfg_len = cfg->size - 2;
+ int i, raw_cfg_len = len - 2;
u8 check_sum = 0;
for (i = 0; i < raw_cfg_len; i++)
- check_sum += cfg->data[i];
+ check_sum += cfg[i];
check_sum = (~check_sum) + 1;
- if (check_sum != cfg->data[raw_cfg_len]) {
+ if (check_sum != cfg[raw_cfg_len]) {
dev_err(&ts->client->dev,
"The checksum of the config fw is not correct");
return -EINVAL;
}
- if (cfg->data[raw_cfg_len + 1] != 1) {
+ if (cfg[raw_cfg_len + 1] != 1) {
dev_err(&ts->client->dev,
"Config fw must have Config_Fresh register set");
return -EINVAL;
@@ -430,22 +509,35 @@ static int goodix_check_cfg_8(struct goodix_ts_data *ts,
return 0;
}
-static int goodix_check_cfg_16(struct goodix_ts_data *ts,
- const struct firmware *cfg)
+static void goodix_calc_cfg_checksum_8(struct goodix_ts_data *ts)
{
- int i, raw_cfg_len = cfg->size - 3;
+ int i, raw_cfg_len = ts->chip->config_len - 2;
+ u8 check_sum = 0;
+
+ for (i = 0; i < raw_cfg_len; i++)
+ check_sum += ts->config[i];
+ check_sum = (~check_sum) + 1;
+
+ ts->config[raw_cfg_len] = check_sum;
+ ts->config[raw_cfg_len + 1] = 1; /* Set "config_fresh" bit */
+}
+
+static int goodix_check_cfg_16(struct goodix_ts_data *ts, const u8 *cfg,
+ int len)
+{
+ int i, raw_cfg_len = len - 3;
u16 check_sum = 0;
for (i = 0; i < raw_cfg_len; i += 2)
- check_sum += get_unaligned_be16(&cfg->data[i]);
+ check_sum += get_unaligned_be16(&cfg[i]);
check_sum = (~check_sum) + 1;
- if (check_sum != get_unaligned_be16(&cfg->data[raw_cfg_len])) {
+ if (check_sum != get_unaligned_be16(&cfg[raw_cfg_len])) {
dev_err(&ts->client->dev,
"The checksum of the config fw is not correct");
return -EINVAL;
}
- if (cfg->data[raw_cfg_len + 2] != 1) {
+ if (cfg[raw_cfg_len + 2] != 1) {
dev_err(&ts->client->dev,
"Config fw must have Config_Fresh register set");
return -EINVAL;
@@ -454,22 +546,35 @@ static int goodix_check_cfg_16(struct goodix_ts_data *ts,
return 0;
}
+static void goodix_calc_cfg_checksum_16(struct goodix_ts_data *ts)
+{
+ int i, raw_cfg_len = ts->chip->config_len - 3;
+ u16 check_sum = 0;
+
+ for (i = 0; i < raw_cfg_len; i += 2)
+ check_sum += get_unaligned_be16(&ts->config[i]);
+ check_sum = (~check_sum) + 1;
+
+ put_unaligned_be16(check_sum, &ts->config[raw_cfg_len]);
+ ts->config[raw_cfg_len + 2] = 1; /* Set "config_fresh" bit */
+}
+
/**
* goodix_check_cfg - Checks if config fw is valid
*
* @ts: goodix_ts_data pointer
* @cfg: firmware config data
*/
-static int goodix_check_cfg(struct goodix_ts_data *ts,
- const struct firmware *cfg)
+static int goodix_check_cfg(struct goodix_ts_data *ts, const u8 *cfg, int len)
{
- if (cfg->size > GOODIX_CONFIG_MAX_LENGTH) {
+ if (len < GOODIX_CONFIG_MIN_LENGTH ||
+ len > GOODIX_CONFIG_MAX_LENGTH) {
dev_err(&ts->client->dev,
"The length of the config fw is not correct");
return -EINVAL;
}
- return ts->chip->check_config(ts, cfg);
+ return ts->chip->check_config(ts, cfg, len);
}
/**
@@ -478,17 +583,15 @@ static int goodix_check_cfg(struct goodix_ts_data *ts,
* @ts: goodix_ts_data pointer
* @cfg: config firmware to write to device
*/
-static int goodix_send_cfg(struct goodix_ts_data *ts,
- const struct firmware *cfg)
+static int goodix_send_cfg(struct goodix_ts_data *ts, const u8 *cfg, int len)
{
int error;
- error = goodix_check_cfg(ts, cfg);
+ error = goodix_check_cfg(ts, cfg, len);
if (error)
return error;
- error = goodix_i2c_write(ts->client, ts->chip->config_addr, cfg->data,
- cfg->size);
+ error = goodix_i2c_write(ts->client, ts->chip->config_addr, cfg, len);
if (error) {
dev_err(&ts->client->dev, "Failed to write config data: %d",
error);
@@ -502,17 +605,93 @@ static int goodix_send_cfg(struct goodix_ts_data *ts,
return 0;
}
+#ifdef ACPI_GPIO_SUPPORT
+static int goodix_pin_acpi_direction_input(struct goodix_ts_data *ts)
+{
+ acpi_handle handle = ACPI_HANDLE(&ts->client->dev);
+ acpi_status status;
+
+ status = acpi_evaluate_object(handle, "INTI", NULL, NULL);
+ return ACPI_SUCCESS(status) ? 0 : -EIO;
+}
+
+static int goodix_pin_acpi_output_method(struct goodix_ts_data *ts, int value)
+{
+ acpi_handle handle = ACPI_HANDLE(&ts->client->dev);
+ acpi_status status;
+
+ status = acpi_execute_simple_method(handle, "INTO", value);
+ return ACPI_SUCCESS(status) ? 0 : -EIO;
+}
+#else
+static int goodix_pin_acpi_direction_input(struct goodix_ts_data *ts)
+{
+ dev_err(&ts->client->dev,
+ "%s called on device without ACPI support\n", __func__);
+ return -EINVAL;
+}
+
+static int goodix_pin_acpi_output_method(struct goodix_ts_data *ts, int value)
+{
+ dev_err(&ts->client->dev,
+ "%s called on device without ACPI support\n", __func__);
+ return -EINVAL;
+}
+#endif
+
+static int goodix_irq_direction_output(struct goodix_ts_data *ts, int value)
+{
+ switch (ts->irq_pin_access_method) {
+ case IRQ_PIN_ACCESS_NONE:
+ dev_err(&ts->client->dev,
+ "%s called without an irq_pin_access_method set\n",
+ __func__);
+ return -EINVAL;
+ case IRQ_PIN_ACCESS_GPIO:
+ return gpiod_direction_output(ts->gpiod_int, value);
+ case IRQ_PIN_ACCESS_ACPI_GPIO:
+ /*
+ * The IRQ pin triggers on a falling edge, so its gets marked
+ * as active-low, use output_raw to avoid the value inversion.
+ */
+ return gpiod_direction_output_raw(ts->gpiod_int, value);
+ case IRQ_PIN_ACCESS_ACPI_METHOD:
+ return goodix_pin_acpi_output_method(ts, value);
+ }
+
+ return -EINVAL; /* Never reached */
+}
+
+static int goodix_irq_direction_input(struct goodix_ts_data *ts)
+{
+ switch (ts->irq_pin_access_method) {
+ case IRQ_PIN_ACCESS_NONE:
+ dev_err(&ts->client->dev,
+ "%s called without an irq_pin_access_method set\n",
+ __func__);
+ return -EINVAL;
+ case IRQ_PIN_ACCESS_GPIO:
+ return gpiod_direction_input(ts->gpiod_int);
+ case IRQ_PIN_ACCESS_ACPI_GPIO:
+ return gpiod_direction_input(ts->gpiod_int);
+ case IRQ_PIN_ACCESS_ACPI_METHOD:
+ return goodix_pin_acpi_direction_input(ts);
+ }
+
+ return -EINVAL; /* Never reached */
+}
+
static int goodix_int_sync(struct goodix_ts_data *ts)
{
int error;
- error = gpiod_direction_output(ts->gpiod_int, 0);
+ error = goodix_irq_direction_output(ts, 0);
if (error)
return error;
msleep(50); /* T5: 50ms */
- error = gpiod_direction_input(ts->gpiod_int);
+ error = goodix_irq_direction_input(ts);
if (error)
return error;
@@ -536,7 +715,7 @@ static int goodix_reset(struct goodix_ts_data *ts)
msleep(20); /* T2: > 10ms */
/* HIGH: 0x28/0x29, LOW: 0xBA/0xBB */
- error = gpiod_direction_output(ts->gpiod_int, ts->client->addr == 0x14);
+ error = goodix_irq_direction_output(ts, ts->client->addr == 0x14);
if (error)
return error;
@@ -560,6 +739,124 @@ static int goodix_reset(struct goodix_ts_data *ts)
return 0;
}
+#ifdef ACPI_GPIO_SUPPORT
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+
+static const struct x86_cpu_id baytrail_cpu_ids[] = {
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT, X86_FEATURE_ANY, },
+ {}
+};
+
+static inline bool is_byt(void)
+{
+ const struct x86_cpu_id *id = x86_match_cpu(baytrail_cpu_ids);
+
+ return !!id;
+}
+
+static const struct acpi_gpio_params first_gpio = { 0, 0, false };
+static const struct acpi_gpio_params second_gpio = { 1, 0, false };
+
+static const struct acpi_gpio_mapping acpi_goodix_int_first_gpios[] = {
+ { GOODIX_GPIO_INT_NAME "-gpios", &first_gpio, 1 },
+ { GOODIX_GPIO_RST_NAME "-gpios", &second_gpio, 1 },
+ { },
+};
+
+static const struct acpi_gpio_mapping acpi_goodix_int_last_gpios[] = {
+ { GOODIX_GPIO_RST_NAME "-gpios", &first_gpio, 1 },
+ { GOODIX_GPIO_INT_NAME "-gpios", &second_gpio, 1 },
+ { },
+};
+
+static const struct acpi_gpio_mapping acpi_goodix_reset_only_gpios[] = {
+ { GOODIX_GPIO_RST_NAME "-gpios", &first_gpio, 1 },
+ { },
+};
+
+static int goodix_resource(struct acpi_resource *ares, void *data)
+{
+ struct goodix_ts_data *ts = data;
+ struct device *dev = &ts->client->dev;
+ struct acpi_resource_gpio *gpio;
+
+ switch (ares->type) {
+ case ACPI_RESOURCE_TYPE_GPIO:
+ gpio = &ares->data.gpio;
+ if (gpio->connection_type == ACPI_RESOURCE_GPIO_TYPE_INT) {
+ if (ts->gpio_int_idx == -1) {
+ ts->gpio_int_idx = ts->gpio_count;
+ } else {
+ dev_err(dev, "More then one GpioInt resource, ignoring ACPI GPIO resources\n");
+ ts->gpio_int_idx = -2;
+ }
+ }
+ ts->gpio_count++;
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * This function gets called in case we fail to get the irq GPIO directly
+ * because the ACPI tables lack GPIO-name to APCI _CRS index mappings
+ * (no _DSD UUID daffd814-6eba-4d8c-8a91-bc9bbf4aa301 data).
+ * In that case we add our own mapping and then goodix_get_gpio_config()
+ * retries to get the GPIOs based on the added mapping.
+ */
+static int goodix_add_acpi_gpio_mappings(struct goodix_ts_data *ts)
+{
+ const struct acpi_gpio_mapping *gpio_mapping = NULL;
+ struct device *dev = &ts->client->dev;
+ LIST_HEAD(resources);
+ int ret;
+
+ ts->gpio_count = 0;
+ ts->gpio_int_idx = -1;
+ ret = acpi_dev_get_resources(ACPI_COMPANION(dev), &resources,
+ goodix_resource, ts);
+ if (ret < 0) {
+ dev_err(dev, "Error getting ACPI resources: %d\n", ret);
+ return ret;
+ }
+
+ acpi_dev_free_resource_list(&resources);
+
+ if (ts->gpio_count == 2 && ts->gpio_int_idx == 0) {
+ ts->irq_pin_access_method = IRQ_PIN_ACCESS_ACPI_GPIO;
+ gpio_mapping = acpi_goodix_int_first_gpios;
+ } else if (ts->gpio_count == 2 && ts->gpio_int_idx == 1) {
+ ts->irq_pin_access_method = IRQ_PIN_ACCESS_ACPI_GPIO;
+ gpio_mapping = acpi_goodix_int_last_gpios;
+ } else if (ts->gpio_count == 1 && ts->gpio_int_idx == -1 &&
+ acpi_has_method(ACPI_HANDLE(dev), "INTI") &&
+ acpi_has_method(ACPI_HANDLE(dev), "INTO")) {
+ dev_info(dev, "Using ACPI INTI and INTO methods for IRQ pin access\n");
+ ts->irq_pin_access_method = IRQ_PIN_ACCESS_ACPI_METHOD;
+ gpio_mapping = acpi_goodix_reset_only_gpios;
+ } else if (is_byt() && ts->gpio_count == 2 && ts->gpio_int_idx == -1) {
+ dev_info(dev, "No ACPI GpioInt resource, assuming that the GPIO order is reset, int\n");
+ ts->irq_pin_access_method = IRQ_PIN_ACCESS_ACPI_GPIO;
+ gpio_mapping = acpi_goodix_int_last_gpios;
+ } else {
+ dev_warn(dev, "Unexpected ACPI resources: gpio_count %d, gpio_int_idx %d\n",
+ ts->gpio_count, ts->gpio_int_idx);
+ return -EINVAL;
+ }
+
+ return devm_acpi_dev_add_driver_gpios(dev, gpio_mapping);
+}
+#else
+static int goodix_add_acpi_gpio_mappings(struct goodix_ts_data *ts)
+{
+ return -EINVAL;
+}
+#endif /* CONFIG_X86 && CONFIG_ACPI */
+
/**
* goodix_get_gpio_config - Get GPIO config from ACPI/DT
*
@@ -570,6 +867,7 @@ static int goodix_get_gpio_config(struct goodix_ts_data *ts)
int error;
struct device *dev;
struct gpio_desc *gpiod;
+ bool added_acpi_mappings = false;
if (!ts->client)
return -EINVAL;
@@ -593,6 +891,7 @@ static int goodix_get_gpio_config(struct goodix_ts_data *ts)
return error;
}
+retry_get_irq_gpio:
/* Get the interrupt GPIO pin number */
gpiod = devm_gpiod_get_optional(dev, GOODIX_GPIO_INT_NAME, GPIOD_IN);
if (IS_ERR(gpiod)) {
@@ -602,6 +901,11 @@ static int goodix_get_gpio_config(struct goodix_ts_data *ts)
GOODIX_GPIO_INT_NAME, error);
return error;
}
+ if (!gpiod && has_acpi_companion(dev) && !added_acpi_mappings) {
+ added_acpi_mappings = true;
+ if (goodix_add_acpi_gpio_mappings(ts) == 0)
+ goto retry_get_irq_gpio;
+ }
ts->gpiod_int = gpiod;
@@ -617,6 +921,31 @@ static int goodix_get_gpio_config(struct goodix_ts_data *ts)
ts->gpiod_rst = gpiod;
+ switch (ts->irq_pin_access_method) {
+ case IRQ_PIN_ACCESS_ACPI_GPIO:
+ /*
+ * We end up here if goodix_add_acpi_gpio_mappings() has
+ * called devm_acpi_dev_add_driver_gpios() because the ACPI
+ * tables did not contain name to index mappings.
+ * Check that we successfully got both GPIOs after we've
+ * added our own acpi_gpio_mapping and if we did not get both
+ * GPIOs reset irq_pin_access_method to IRQ_PIN_ACCESS_NONE.
+ */
+ if (!ts->gpiod_int || !ts->gpiod_rst)
+ ts->irq_pin_access_method = IRQ_PIN_ACCESS_NONE;
+ break;
+ case IRQ_PIN_ACCESS_ACPI_METHOD:
+ if (!ts->gpiod_rst)
+ ts->irq_pin_access_method = IRQ_PIN_ACCESS_NONE;
+ break;
+ default:
+ if (ts->gpiod_int && ts->gpiod_rst) {
+ ts->reset_controller_at_probe = true;
+ ts->load_cfg_from_disk = true;
+ ts->irq_pin_access_method = IRQ_PIN_ACCESS_GPIO;
+ }
+ }
+
return 0;
}
@@ -629,12 +958,11 @@ static int goodix_get_gpio_config(struct goodix_ts_data *ts)
*/
static void goodix_read_config(struct goodix_ts_data *ts)
{
- u8 config[GOODIX_CONFIG_MAX_LENGTH];
int x_max, y_max;
int error;
error = goodix_i2c_read(ts->client, ts->chip->config_addr,
- config, ts->chip->config_len);
+ ts->config, ts->chip->config_len);
if (error) {
dev_warn(&ts->client->dev, "Error reading config: %d\n",
error);
@@ -643,15 +971,17 @@ static void goodix_read_config(struct goodix_ts_data *ts)
return;
}
- ts->int_trigger_type = config[TRIGGER_LOC] & 0x03;
- ts->max_touch_num = config[MAX_CONTACTS_LOC] & 0x0f;
+ ts->int_trigger_type = ts->config[TRIGGER_LOC] & 0x03;
+ ts->max_touch_num = ts->config[MAX_CONTACTS_LOC] & 0x0f;
- x_max = get_unaligned_le16(&config[RESOLUTION_LOC]);
- y_max = get_unaligned_le16(&config[RESOLUTION_LOC + 2]);
+ x_max = get_unaligned_le16(&ts->config[RESOLUTION_LOC]);
+ y_max = get_unaligned_le16(&ts->config[RESOLUTION_LOC + 2]);
if (x_max && y_max) {
input_abs_set_max(ts->input_dev, ABS_MT_POSITION_X, x_max - 1);
input_abs_set_max(ts->input_dev, ABS_MT_POSITION_Y, y_max - 1);
}
+
+ ts->chip->calc_config_checksum(ts);
}
/**
@@ -663,7 +993,7 @@ static int goodix_read_version(struct goodix_ts_data *ts)
{
int error;
u8 buf[6];
- char id_str[5];
+ char id_str[GOODIX_ID_MAX_LEN + 1];
error = goodix_i2c_read(ts->client, GOODIX_REG_ID, buf, sizeof(buf));
if (error) {
@@ -671,14 +1001,13 @@ static int goodix_read_version(struct goodix_ts_data *ts)
return error;
}
- memcpy(id_str, buf, 4);
- id_str[4] = 0;
- if (kstrtou16(id_str, 10, &ts->id))
- ts->id = 0x1001;
+ memcpy(id_str, buf, GOODIX_ID_MAX_LEN);
+ id_str[GOODIX_ID_MAX_LEN] = 0;
+ strscpy(ts->id, id_str, GOODIX_ID_MAX_LEN + 1);
ts->version = get_unaligned_le16(&buf[4]);
- dev_info(&ts->client->dev, "ID %d, version: %04x\n", ts->id,
+ dev_info(&ts->client->dev, "ID %s, version: %04x\n", ts->id,
ts->version);
return 0;
@@ -722,6 +1051,7 @@ static int goodix_i2c_test(struct i2c_client *client)
static int goodix_configure_dev(struct goodix_ts_data *ts)
{
int error;
+ int i;
ts->int_trigger_type = GOODIX_INT_TRIGGER;
ts->max_touch_num = GOODIX_MAX_CONTACTS;
@@ -736,11 +1066,23 @@ static int goodix_configure_dev(struct goodix_ts_data *ts)
ts->input_dev->phys = "input/ts";
ts->input_dev->id.bustype = BUS_I2C;
ts->input_dev->id.vendor = 0x0416;
- ts->input_dev->id.product = ts->id;
+ if (kstrtou16(ts->id, 10, &ts->input_dev->id.product))
+ ts->input_dev->id.product = 0x1001;
ts->input_dev->id.version = ts->version;
+ ts->input_dev->keycode = ts->keymap;
+ ts->input_dev->keycodesize = sizeof(ts->keymap[0]);
+ ts->input_dev->keycodemax = GOODIX_MAX_KEYS;
+
/* Capacitive Windows/Home button on some devices */
- input_set_capability(ts->input_dev, EV_KEY, KEY_LEFTMETA);
+ for (i = 0; i < GOODIX_MAX_KEYS; ++i) {
+ if (i == 0)
+ ts->keymap[i] = KEY_LEFTMETA;
+ else
+ ts->keymap[i] = KEY_F1 + (i - 1);
+
+ input_set_capability(ts->input_dev, EV_KEY, ts->keymap[i]);
+ }
input_set_capability(ts->input_dev, EV_ABS, ABS_MT_POSITION_X);
input_set_capability(ts->input_dev, EV_ABS, ABS_MT_POSITION_Y);
@@ -780,6 +1122,12 @@ static int goodix_configure_dev(struct goodix_ts_data *ts)
"Non-standard 9-bytes report format quirk\n");
}
+ if (dmi_check_system(inverted_x_screen)) {
+ ts->prop.invert_x = true;
+ dev_dbg(&ts->client->dev,
+ "Applying 'inverted x screen' quirk\n");
+ }
+
error = input_mt_init_slots(ts->input_dev, ts->max_touch_num,
INPUT_MT_DIRECT | INPUT_MT_DROP_UNUSED);
if (error) {
@@ -820,7 +1168,7 @@ static void goodix_config_cb(const struct firmware *cfg, void *ctx)
if (cfg) {
/* send device configuration to the firmware */
- error = goodix_send_cfg(ts, cfg);
+ error = goodix_send_cfg(ts, cfg->data, cfg->size);
if (error)
goto err_release_cfg;
}
@@ -889,7 +1237,8 @@ static int goodix_ts_probe(struct i2c_client *client,
if (error)
return error;
- if (ts->gpiod_int && ts->gpiod_rst) {
+reset:
+ if (ts->reset_controller_at_probe) {
/* reset the controller */
error = goodix_reset(ts);
if (error) {
@@ -900,6 +1249,12 @@ static int goodix_ts_probe(struct i2c_client *client,
error = goodix_i2c_test(client);
if (error) {
+ if (!ts->reset_controller_at_probe &&
+ ts->irq_pin_access_method != IRQ_PIN_ACCESS_NONE) {
+ /* Retry after a controller reset */
+ ts->reset_controller_at_probe = true;
+ goto reset;
+ }
dev_err(&client->dev, "I2C communication failure: %d\n", error);
return error;
}
@@ -912,10 +1267,10 @@ static int goodix_ts_probe(struct i2c_client *client,
ts->chip = goodix_get_chip_data(ts->id);
- if (ts->gpiod_int && ts->gpiod_rst) {
+ if (ts->load_cfg_from_disk) {
/* update device config */
ts->cfg_name = devm_kasprintf(&client->dev, GFP_KERNEL,
- "goodix_%d_cfg.bin", ts->id);
+ "goodix_%s_cfg.bin", ts->id);
if (!ts->cfg_name)
return -ENOMEM;
@@ -943,7 +1298,7 @@ static int goodix_ts_remove(struct i2c_client *client)
{
struct goodix_ts_data *ts = i2c_get_clientdata(client);
- if (ts->gpiod_int && ts->gpiod_rst)
+ if (ts->load_cfg_from_disk)
wait_for_completion(&ts->firmware_loading_complete);
return 0;
@@ -955,19 +1310,20 @@ static int __maybe_unused goodix_suspend(struct device *dev)
struct goodix_ts_data *ts = i2c_get_clientdata(client);
int error;
+ if (ts->load_cfg_from_disk)
+ wait_for_completion(&ts->firmware_loading_complete);
+
/* We need gpio pins to suspend/resume */
- if (!ts->gpiod_int || !ts->gpiod_rst) {
+ if (ts->irq_pin_access_method == IRQ_PIN_ACCESS_NONE) {
disable_irq(client->irq);
return 0;
}
- wait_for_completion(&ts->firmware_loading_complete);
-
/* Free IRQ as IRQ pin is used as output in the suspend sequence */
goodix_free_irq(ts);
/* Output LOW on the INT pin for 5 ms */
- error = gpiod_direction_output(ts->gpiod_int, 0);
+ error = goodix_irq_direction_output(ts, 0);
if (error) {
goodix_request_irq(ts);
return error;
@@ -979,7 +1335,7 @@ static int __maybe_unused goodix_suspend(struct device *dev)
GOODIX_CMD_SCREEN_OFF);
if (error) {
dev_err(&ts->client->dev, "Screen off command failed\n");
- gpiod_direction_input(ts->gpiod_int);
+ goodix_irq_direction_input(ts);
goodix_request_irq(ts);
return -EAGAIN;
}
@@ -997,9 +1353,10 @@ static int __maybe_unused goodix_resume(struct device *dev)
{
struct i2c_client *client = to_i2c_client(dev);
struct goodix_ts_data *ts = i2c_get_clientdata(client);
+ u8 config_ver;
int error;
- if (!ts->gpiod_int || !ts->gpiod_rst) {
+ if (ts->irq_pin_access_method == IRQ_PIN_ACCESS_NONE) {
enable_irq(client->irq);
return 0;
}
@@ -1008,7 +1365,7 @@ static int __maybe_unused goodix_resume(struct device *dev)
* Exit sleep mode by outputting HIGH level to INT pin
* for 2ms~5ms.
*/
- error = gpiod_direction_output(ts->gpiod_int, 1);
+ error = goodix_irq_direction_output(ts, 1);
if (error)
return error;
@@ -1018,6 +1375,27 @@ static int __maybe_unused goodix_resume(struct device *dev)
if (error)
return error;
+ error = goodix_i2c_read(ts->client, ts->chip->config_addr,
+ &config_ver, 1);
+ if (error)
+ dev_warn(dev, "Error reading config version: %d, resetting controller\n",
+ error);
+ else if (config_ver != ts->config[0])
+ dev_info(dev, "Config version mismatch %d != %d, resetting controller\n",
+ config_ver, ts->config[0]);
+
+ if (error != 0 || config_ver != ts->config[0]) {
+ error = goodix_reset(ts);
+ if (error) {
+ dev_err(dev, "Controller reset failed.\n");
+ return error;
+ }
+
+ error = goodix_send_cfg(ts, ts->config, ts->chip->config_len);
+ if (error)
+ return error;
+ }
+
error = goodix_request_irq(ts);
if (error)
return error;
@@ -1050,6 +1428,8 @@ static const struct of_device_id goodix_of_match[] = {
{ .compatible = "goodix,gt911" },
{ .compatible = "goodix,gt9110" },
{ .compatible = "goodix,gt912" },
+ { .compatible = "goodix,gt9147" },
+ { .compatible = "goodix,gt917s" },
{ .compatible = "goodix,gt927" },
{ .compatible = "goodix,gt9271" },
{ .compatible = "goodix,gt928" },
diff --git a/drivers/input/touchscreen/of_touchscreen.c b/drivers/input/touchscreen/of_touchscreen.c
index e16ec4c7043a..97342e14b4f1 100644
--- a/drivers/input/touchscreen/of_touchscreen.c
+++ b/drivers/input/touchscreen/of_touchscreen.c
@@ -66,7 +66,7 @@ void touchscreen_parse_properties(struct input_dev *input, bool multitouch,
{
struct device *dev = input->dev.parent;
struct input_absinfo *absinfo;
- unsigned int axis;
+ unsigned int axis, axis_x, axis_y;
unsigned int minimum, maximum, fuzz;
bool data_present;
@@ -74,33 +74,34 @@ void touchscreen_parse_properties(struct input_dev *input, bool multitouch,
if (!input->absinfo)
return;
- axis = multitouch ? ABS_MT_POSITION_X : ABS_X;
+ axis_x = multitouch ? ABS_MT_POSITION_X : ABS_X;
+ axis_y = multitouch ? ABS_MT_POSITION_Y : ABS_Y;
+
data_present = touchscreen_get_prop_u32(dev, "touchscreen-min-x",
- input_abs_get_min(input, axis),
+ input_abs_get_min(input, axis_x),
&minimum) |
touchscreen_get_prop_u32(dev, "touchscreen-size-x",
input_abs_get_max(input,
- axis) + 1,
+ axis_x) + 1,
&maximum) |
touchscreen_get_prop_u32(dev, "touchscreen-fuzz-x",
- input_abs_get_fuzz(input, axis),
+ input_abs_get_fuzz(input, axis_x),
&fuzz);
if (data_present)
- touchscreen_set_params(input, axis, minimum, maximum - 1, fuzz);
+ touchscreen_set_params(input, axis_x, minimum, maximum - 1, fuzz);
- axis = multitouch ? ABS_MT_POSITION_Y : ABS_Y;
data_present = touchscreen_get_prop_u32(dev, "touchscreen-min-y",
- input_abs_get_min(input, axis),
+ input_abs_get_min(input, axis_y),
&minimum) |
touchscreen_get_prop_u32(dev, "touchscreen-size-y",
input_abs_get_max(input,
- axis) + 1,
+ axis_y) + 1,
&maximum) |
touchscreen_get_prop_u32(dev, "touchscreen-fuzz-y",
- input_abs_get_fuzz(input, axis),
+ input_abs_get_fuzz(input, axis_y),
&fuzz);
if (data_present)
- touchscreen_set_params(input, axis, minimum, maximum - 1, fuzz);
+ touchscreen_set_params(input, axis_y, minimum, maximum - 1, fuzz);
axis = multitouch ? ABS_MT_PRESSURE : ABS_PRESSURE;
data_present = touchscreen_get_prop_u32(dev,
@@ -117,15 +118,13 @@ void touchscreen_parse_properties(struct input_dev *input, bool multitouch,
if (!prop)
return;
- axis = multitouch ? ABS_MT_POSITION_X : ABS_X;
-
- prop->max_x = input_abs_get_max(input, axis);
- prop->max_y = input_abs_get_max(input, axis + 1);
+ prop->max_x = input_abs_get_max(input, axis_x);
+ prop->max_y = input_abs_get_max(input, axis_y);
prop->invert_x =
device_property_read_bool(dev, "touchscreen-inverted-x");
if (prop->invert_x) {
- absinfo = &input->absinfo[axis];
+ absinfo = &input->absinfo[axis_x];
absinfo->maximum -= absinfo->minimum;
absinfo->minimum = 0;
}
@@ -133,7 +132,7 @@ void touchscreen_parse_properties(struct input_dev *input, bool multitouch,
prop->invert_y =
device_property_read_bool(dev, "touchscreen-inverted-y");
if (prop->invert_y) {
- absinfo = &input->absinfo[axis + 1];
+ absinfo = &input->absinfo[axis_y];
absinfo->maximum -= absinfo->minimum;
absinfo->minimum = 0;
}
@@ -141,7 +140,7 @@ void touchscreen_parse_properties(struct input_dev *input, bool multitouch,
prop->swap_x_y =
device_property_read_bool(dev, "touchscreen-swapped-x-y");
if (prop->swap_x_y)
- swap(input->absinfo[axis], input->absinfo[axis + 1]);
+ swap(input->absinfo[axis_x], input->absinfo[axis_y]);
}
EXPORT_SYMBOL(touchscreen_parse_properties);
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index d2fade984999..58b4a4dbfc78 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -188,6 +188,7 @@ config INTEL_IOMMU
select NEED_DMA_MAP_STATE
select DMAR_TABLE
select SWIOTLB
+ select IOASID
help
DMA remapping (DMAR) devices support enables independent address
translations for Direct Memory Access (DMA) from devices.
@@ -273,7 +274,7 @@ config IRQ_REMAP
# OMAP IOMMU support
config OMAP_IOMMU
bool "OMAP IOMMU Support"
- depends on ARM && MMU
+ depends on ARM && MMU || (COMPILE_TEST && (ARM || ARM64 || IA64 || SPARC))
depends on ARCH_OMAP2PLUS || COMPILE_TEST
select IOMMU_API
---help---
@@ -291,7 +292,7 @@ config OMAP_IOMMU_DEBUG
config ROCKCHIP_IOMMU
bool "Rockchip IOMMU Support"
- depends on ARM || ARM64
+ depends on ARM || ARM64 || (COMPILE_TEST && (ARM64 || IA64 || SPARC))
depends on ARCH_ROCKCHIP || COMPILE_TEST
select IOMMU_API
select ARM_DMA_USE_IOMMU
@@ -325,7 +326,7 @@ config TEGRA_IOMMU_SMMU
config EXYNOS_IOMMU
bool "Exynos IOMMU Support"
- depends on ARCH_EXYNOS && MMU
+ depends on ARCH_EXYNOS && MMU || (COMPILE_TEST && (ARM || ARM64 || IA64 || SPARC))
depends on !CPU_BIG_ENDIAN # revisit driver if we can enable big-endian ptes
select IOMMU_API
select ARM_DMA_USE_IOMMU
@@ -361,7 +362,7 @@ config IPMMU_VMSA
config SPAPR_TCE_IOMMU
bool "sPAPR TCE IOMMU Support"
- depends on PPC_POWERNV || PPC_PSERIES
+ depends on PPC_POWERNV || PPC_PSERIES || (PPC && COMPILE_TEST)
select IOMMU_API
help
Enables bits of IOMMU API required by VFIO. The iommu_ops
@@ -370,7 +371,7 @@ config SPAPR_TCE_IOMMU
# ARM IOMMU support
config ARM_SMMU
tristate "ARM Ltd. System MMU (SMMU) Support"
- depends on (ARM64 || ARM) && MMU
+ depends on (ARM64 || ARM || (COMPILE_TEST && !GENERIC_ATOMIC64)) && MMU
select IOMMU_API
select IOMMU_IO_PGTABLE_LPAE
select ARM_DMA_USE_IOMMU if ARM
@@ -440,7 +441,7 @@ config S390_IOMMU
config S390_CCW_IOMMU
bool "S390 CCW IOMMU Support"
- depends on S390 && CCW
+ depends on S390 && CCW || COMPILE_TEST
select IOMMU_API
help
Enables bits of IOMMU API required by VFIO. The iommu_ops
@@ -448,7 +449,7 @@ config S390_CCW_IOMMU
config S390_AP_IOMMU
bool "S390 AP IOMMU Support"
- depends on S390 && ZCRYPT
+ depends on S390 && ZCRYPT || COMPILE_TEST
select IOMMU_API
help
Enables bits of IOMMU API required by VFIO. The iommu_ops
@@ -456,7 +457,7 @@ config S390_AP_IOMMU
config MTK_IOMMU
bool "MTK IOMMU Support"
- depends on ARM || ARM64
+ depends on ARM || ARM64 || COMPILE_TEST
depends on ARCH_MEDIATEK || COMPILE_TEST
select ARM_DMA_USE_IOMMU
select IOMMU_API
@@ -506,8 +507,8 @@ config HYPERV_IOMMU
guests to run with x2APIC mode enabled.
config VIRTIO_IOMMU
- bool "Virtio IOMMU driver"
- depends on VIRTIO=y
+ tristate "Virtio IOMMU driver"
+ depends on VIRTIO
depends on ARM64
select IOMMU_API
select INTERVAL_TREE
diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
index f8d01d6b00da..ca8c4522045b 100644
--- a/drivers/iommu/amd_iommu_types.h
+++ b/drivers/iommu/amd_iommu_types.h
@@ -348,7 +348,7 @@
#define DTE_GCR3_VAL_A(x) (((x) >> 12) & 0x00007ULL)
#define DTE_GCR3_VAL_B(x) (((x) >> 15) & 0x0ffffULL)
-#define DTE_GCR3_VAL_C(x) (((x) >> 31) & 0xfffffULL)
+#define DTE_GCR3_VAL_C(x) (((x) >> 31) & 0x1fffffULL)
#define DTE_GCR3_INDEX_A 0
#define DTE_GCR3_INDEX_B 1
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index aa3ac2a03807..82508730feb7 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -69,6 +69,9 @@
#define IDR1_SSIDSIZE GENMASK(10, 6)
#define IDR1_SIDSIZE GENMASK(5, 0)
+#define ARM_SMMU_IDR3 0xc
+#define IDR3_RIL (1 << 10)
+
#define ARM_SMMU_IDR5 0x14
#define IDR5_STALL_MAX GENMASK(31, 16)
#define IDR5_GRAN64K (1 << 6)
@@ -346,9 +349,14 @@
#define CMDQ_CFGI_1_LEAF (1UL << 0)
#define CMDQ_CFGI_1_RANGE GENMASK_ULL(4, 0)
+#define CMDQ_TLBI_0_NUM GENMASK_ULL(16, 12)
+#define CMDQ_TLBI_RANGE_NUM_MAX 31
+#define CMDQ_TLBI_0_SCALE GENMASK_ULL(24, 20)
#define CMDQ_TLBI_0_VMID GENMASK_ULL(47, 32)
#define CMDQ_TLBI_0_ASID GENMASK_ULL(63, 48)
#define CMDQ_TLBI_1_LEAF (1UL << 0)
+#define CMDQ_TLBI_1_TTL GENMASK_ULL(9, 8)
+#define CMDQ_TLBI_1_TG GENMASK_ULL(11, 10)
#define CMDQ_TLBI_1_VA_MASK GENMASK_ULL(63, 12)
#define CMDQ_TLBI_1_IPA_MASK GENMASK_ULL(51, 12)
@@ -473,9 +481,13 @@ struct arm_smmu_cmdq_ent {
#define CMDQ_OP_TLBI_S2_IPA 0x2a
#define CMDQ_OP_TLBI_NSNH_ALL 0x30
struct {
+ u8 num;
+ u8 scale;
u16 asid;
u16 vmid;
bool leaf;
+ u8 ttl;
+ u8 tg;
u64 addr;
} tlbi;
@@ -548,6 +560,11 @@ struct arm_smmu_cmdq {
atomic_t lock;
};
+struct arm_smmu_cmdq_batch {
+ u64 cmds[CMDQ_BATCH_ENTRIES * CMDQ_ENT_DWORDS];
+ int num;
+};
+
struct arm_smmu_evtq {
struct arm_smmu_queue q;
u32 max_stalls;
@@ -627,6 +644,7 @@ struct arm_smmu_device {
#define ARM_SMMU_FEAT_HYP (1 << 12)
#define ARM_SMMU_FEAT_STALL_FORCE (1 << 13)
#define ARM_SMMU_FEAT_VAX (1 << 14)
+#define ARM_SMMU_FEAT_RANGE_INV (1 << 15)
u32 features;
#define ARM_SMMU_OPT_SKIP_PREFETCH (1 << 0)
@@ -895,14 +913,22 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
cmd[1] |= FIELD_PREP(CMDQ_CFGI_1_RANGE, 31);
break;
case CMDQ_OP_TLBI_NH_VA:
+ cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_NUM, ent->tlbi.num);
+ cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_SCALE, ent->tlbi.scale);
cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid);
cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid);
cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_LEAF, ent->tlbi.leaf);
+ cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TTL, ent->tlbi.ttl);
+ cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TG, ent->tlbi.tg);
cmd[1] |= ent->tlbi.addr & CMDQ_TLBI_1_VA_MASK;
break;
case CMDQ_OP_TLBI_S2_IPA:
+ cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_NUM, ent->tlbi.num);
+ cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_SCALE, ent->tlbi.scale);
cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid);
cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_LEAF, ent->tlbi.leaf);
+ cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TTL, ent->tlbi.ttl);
+ cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TG, ent->tlbi.tg);
cmd[1] |= ent->tlbi.addr & CMDQ_TLBI_1_IPA_MASK;
break;
case CMDQ_OP_TLBI_NH_ASID:
@@ -1482,6 +1508,24 @@ static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
return arm_smmu_cmdq_issue_cmdlist(smmu, NULL, 0, true);
}
+static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
+ struct arm_smmu_cmdq_batch *cmds,
+ struct arm_smmu_cmdq_ent *cmd)
+{
+ if (cmds->num == CMDQ_BATCH_ENTRIES) {
+ arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, false);
+ cmds->num = 0;
+ }
+ arm_smmu_cmdq_build_cmd(&cmds->cmds[cmds->num * CMDQ_ENT_DWORDS], cmd);
+ cmds->num++;
+}
+
+static int arm_smmu_cmdq_batch_submit(struct arm_smmu_device *smmu,
+ struct arm_smmu_cmdq_batch *cmds)
+{
+ return arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, true);
+}
+
/* Context descriptor manipulation functions */
static void arm_smmu_sync_cd(struct arm_smmu_domain *smmu_domain,
int ssid, bool leaf)
@@ -1489,6 +1533,7 @@ static void arm_smmu_sync_cd(struct arm_smmu_domain *smmu_domain,
size_t i;
unsigned long flags;
struct arm_smmu_master *master;
+ struct arm_smmu_cmdq_batch cmds = {};
struct arm_smmu_device *smmu = smmu_domain->smmu;
struct arm_smmu_cmdq_ent cmd = {
.opcode = CMDQ_OP_CFGI_CD,
@@ -1502,12 +1547,12 @@ static void arm_smmu_sync_cd(struct arm_smmu_domain *smmu_domain,
list_for_each_entry(master, &smmu_domain->devices, domain_head) {
for (i = 0; i < master->num_sids; i++) {
cmd.cfgi.sid = master->sids[i];
- arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+ arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd);
}
}
spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
- arm_smmu_cmdq_issue_sync(smmu);
+ arm_smmu_cmdq_batch_submit(smmu, &cmds);
}
static int arm_smmu_alloc_cd_leaf_table(struct arm_smmu_device *smmu,
@@ -1531,6 +1576,7 @@ static void arm_smmu_write_cd_l1_desc(__le64 *dst,
u64 val = (l1_desc->l2ptr_dma & CTXDESC_L1_DESC_L2PTR_MASK) |
CTXDESC_L1_DESC_V;
+ /* See comment in arm_smmu_write_ctx_desc() */
WRITE_ONCE(*dst, cpu_to_le64(val));
}
@@ -1726,7 +1772,8 @@ arm_smmu_write_strtab_l1_desc(__le64 *dst, struct arm_smmu_strtab_l1_desc *desc)
val |= FIELD_PREP(STRTAB_L1_DESC_SPAN, desc->span);
val |= desc->l2ptr_dma & STRTAB_L1_DESC_L2PTR_MASK;
- *dst = cpu_to_le64(val);
+ /* See comment in arm_smmu_write_ctx_desc() */
+ WRITE_ONCE(*dst, cpu_to_le64(val));
}
static void arm_smmu_sync_ste_for_sid(struct arm_smmu_device *smmu, u32 sid)
@@ -2132,17 +2179,16 @@ arm_smmu_atc_inv_to_cmd(int ssid, unsigned long iova, size_t size,
cmd->atc.size = log2_span;
}
-static int arm_smmu_atc_inv_master(struct arm_smmu_master *master,
- struct arm_smmu_cmdq_ent *cmd)
+static int arm_smmu_atc_inv_master(struct arm_smmu_master *master)
{
int i;
+ struct arm_smmu_cmdq_ent cmd;
- if (!master->ats_enabled)
- return 0;
+ arm_smmu_atc_inv_to_cmd(0, 0, 0, &cmd);
for (i = 0; i < master->num_sids; i++) {
- cmd->atc.sid = master->sids[i];
- arm_smmu_cmdq_issue_cmd(master->smmu, cmd);
+ cmd.atc.sid = master->sids[i];
+ arm_smmu_cmdq_issue_cmd(master->smmu, &cmd);
}
return arm_smmu_cmdq_issue_sync(master->smmu);
@@ -2151,10 +2197,11 @@ static int arm_smmu_atc_inv_master(struct arm_smmu_master *master,
static int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
int ssid, unsigned long iova, size_t size)
{
- int ret = 0;
+ int i;
unsigned long flags;
struct arm_smmu_cmdq_ent cmd;
struct arm_smmu_master *master;
+ struct arm_smmu_cmdq_batch cmds = {};
if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_ATS))
return 0;
@@ -2179,11 +2226,18 @@ static int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
arm_smmu_atc_inv_to_cmd(ssid, iova, size, &cmd);
spin_lock_irqsave(&smmu_domain->devices_lock, flags);
- list_for_each_entry(master, &smmu_domain->devices, domain_head)
- ret |= arm_smmu_atc_inv_master(master, &cmd);
+ list_for_each_entry(master, &smmu_domain->devices, domain_head) {
+ if (!master->ats_enabled)
+ continue;
+
+ for (i = 0; i < master->num_sids; i++) {
+ cmd.atc.sid = master->sids[i];
+ arm_smmu_cmdq_batch_add(smmu_domain->smmu, &cmds, &cmd);
+ }
+ }
spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
- return ret ? -ETIMEDOUT : 0;
+ return arm_smmu_cmdq_batch_submit(smmu_domain->smmu, &cmds);
}
/* IO_PGTABLE API */
@@ -2218,10 +2272,10 @@ static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
size_t granule, bool leaf,
struct arm_smmu_domain *smmu_domain)
{
- u64 cmds[CMDQ_BATCH_ENTRIES * CMDQ_ENT_DWORDS];
struct arm_smmu_device *smmu = smmu_domain->smmu;
- unsigned long start = iova, end = iova + size;
- int i = 0;
+ unsigned long start = iova, end = iova + size, num_pages = 0, tg = 0;
+ size_t inv_range = granule;
+ struct arm_smmu_cmdq_batch cmds = {};
struct arm_smmu_cmdq_ent cmd = {
.tlbi = {
.leaf = leaf,
@@ -2239,19 +2293,50 @@ static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
cmd.tlbi.vmid = smmu_domain->s2_cfg.vmid;
}
+ if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
+ /* Get the leaf page size */
+ tg = __ffs(smmu_domain->domain.pgsize_bitmap);
+
+ /* Convert page size of 12,14,16 (log2) to 1,2,3 */
+ cmd.tlbi.tg = (tg - 10) / 2;
+
+ /* Determine what level the granule is at */
+ cmd.tlbi.ttl = 4 - ((ilog2(granule) - 3) / (tg - 3));
+
+ num_pages = size >> tg;
+ }
+
while (iova < end) {
- if (i == CMDQ_BATCH_ENTRIES) {
- arm_smmu_cmdq_issue_cmdlist(smmu, cmds, i, false);
- i = 0;
+ if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
+ /*
+ * On each iteration of the loop, the range is 5 bits
+ * worth of the aligned size remaining.
+ * The range in pages is:
+ *
+ * range = (num_pages & (0x1f << __ffs(num_pages)))
+ */
+ unsigned long scale, num;
+
+ /* Determine the power of 2 multiple number of pages */
+ scale = __ffs(num_pages);
+ cmd.tlbi.scale = scale;
+
+ /* Determine how many chunks of 2^scale size we have */
+ num = (num_pages >> scale) & CMDQ_TLBI_RANGE_NUM_MAX;
+ cmd.tlbi.num = num - 1;
+
+ /* range is num * 2^scale * pgsize */
+ inv_range = num << (scale + tg);
+
+ /* Clear out the lower order bits for the next iteration */
+ num_pages -= num << scale;
}
cmd.tlbi.addr = iova;
- arm_smmu_cmdq_build_cmd(&cmds[i * CMDQ_ENT_DWORDS], &cmd);
- iova += granule;
- i++;
+ arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd);
+ iova += inv_range;
}
-
- arm_smmu_cmdq_issue_cmdlist(smmu, cmds, i, true);
+ arm_smmu_cmdq_batch_submit(smmu, &cmds);
/*
* Unfortunately, this can't be leaf-only since we may have
@@ -2611,7 +2696,6 @@ static void arm_smmu_enable_ats(struct arm_smmu_master *master)
static void arm_smmu_disable_ats(struct arm_smmu_master *master)
{
- struct arm_smmu_cmdq_ent cmd;
struct arm_smmu_domain *smmu_domain = master->domain;
if (!master->ats_enabled)
@@ -2623,11 +2707,57 @@ static void arm_smmu_disable_ats(struct arm_smmu_master *master)
* ATC invalidation via the SMMU.
*/
wmb();
- arm_smmu_atc_inv_to_cmd(0, 0, 0, &cmd);
- arm_smmu_atc_inv_master(master, &cmd);
+ arm_smmu_atc_inv_master(master);
atomic_dec(&smmu_domain->nr_ats_masters);
}
+static int arm_smmu_enable_pasid(struct arm_smmu_master *master)
+{
+ int ret;
+ int features;
+ int num_pasids;
+ struct pci_dev *pdev;
+
+ if (!dev_is_pci(master->dev))
+ return -ENODEV;
+
+ pdev = to_pci_dev(master->dev);
+
+ features = pci_pasid_features(pdev);
+ if (features < 0)
+ return features;
+
+ num_pasids = pci_max_pasids(pdev);
+ if (num_pasids <= 0)
+ return num_pasids;
+
+ ret = pci_enable_pasid(pdev, features);
+ if (ret) {
+ dev_err(&pdev->dev, "Failed to enable PASID\n");
+ return ret;
+ }
+
+ master->ssid_bits = min_t(u8, ilog2(num_pasids),
+ master->smmu->ssid_bits);
+ return 0;
+}
+
+static void arm_smmu_disable_pasid(struct arm_smmu_master *master)
+{
+ struct pci_dev *pdev;
+
+ if (!dev_is_pci(master->dev))
+ return;
+
+ pdev = to_pci_dev(master->dev);
+
+ if (!pdev->pasid_enabled)
+ return;
+
+ master->ssid_bits = 0;
+ pci_disable_pasid(pdev);
+}
+
static void arm_smmu_detach_dev(struct arm_smmu_master *master)
{
unsigned long flags;
@@ -2659,7 +2789,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
if (!fwspec)
return -ENOENT;
- master = fwspec->iommu_priv;
+ master = dev_iommu_priv_get(dev);
smmu = master->smmu;
arm_smmu_detach_dev(master);
@@ -2795,7 +2925,7 @@ static int arm_smmu_add_device(struct device *dev)
if (!fwspec || fwspec->ops != &arm_smmu_ops)
return -ENODEV;
- if (WARN_ON_ONCE(fwspec->iommu_priv))
+ if (WARN_ON_ONCE(dev_iommu_priv_get(dev)))
return -EBUSY;
smmu = arm_smmu_get_by_fwnode(fwspec->iommu_fwnode);
@@ -2810,7 +2940,7 @@ static int arm_smmu_add_device(struct device *dev)
master->smmu = smmu;
master->sids = fwspec->ids;
master->num_sids = fwspec->num_ids;
- fwspec->iommu_priv = master;
+ dev_iommu_priv_set(dev, master);
/* Check the SIDs are in range of the SMMU and our stream table */
for (i = 0; i < master->num_sids; i++) {
@@ -2831,13 +2961,23 @@ static int arm_smmu_add_device(struct device *dev)
master->ssid_bits = min(smmu->ssid_bits, fwspec->num_pasid_bits);
+ /*
+ * Note that PASID must be enabled before, and disabled after ATS:
+ * PCI Express Base 4.0r1.0 - 10.5.1.3 ATS Control Register
+ *
+ * Behavior is undefined if this bit is Set and the value of the PASID
+ * Enable, Execute Requested Enable, or Privileged Mode Requested bits
+ * are changed.
+ */
+ arm_smmu_enable_pasid(master);
+
if (!(smmu->features & ARM_SMMU_FEAT_2_LVL_CDTAB))
master->ssid_bits = min_t(u8, master->ssid_bits,
CTXDESC_LINEAR_CDMAX);
ret = iommu_device_link(&smmu->iommu, dev);
if (ret)
- goto err_free_master;
+ goto err_disable_pasid;
group = iommu_group_get_for_dev(dev);
if (IS_ERR(group)) {
@@ -2850,9 +2990,11 @@ static int arm_smmu_add_device(struct device *dev)
err_unlink:
iommu_device_unlink(&smmu->iommu, dev);
+err_disable_pasid:
+ arm_smmu_disable_pasid(master);
err_free_master:
kfree(master);
- fwspec->iommu_priv = NULL;
+ dev_iommu_priv_set(dev, NULL);
return ret;
}
@@ -2865,11 +3007,12 @@ static void arm_smmu_remove_device(struct device *dev)
if (!fwspec || fwspec->ops != &arm_smmu_ops)
return;
- master = fwspec->iommu_priv;
+ master = dev_iommu_priv_get(dev);
smmu = master->smmu;
arm_smmu_detach_dev(master);
iommu_group_remove_device(dev);
iommu_device_unlink(&smmu->iommu, dev);
+ arm_smmu_disable_pasid(master);
kfree(master);
iommu_fwspec_free(dev);
}
@@ -3700,6 +3843,11 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
if (smmu->sid_bits <= STRTAB_SPLIT)
smmu->features &= ~ARM_SMMU_FEAT_2_LVL_STRTAB;
+ /* IDR3 */
+ reg = readl_relaxed(smmu->base + ARM_SMMU_IDR3);
+ if (FIELD_GET(IDR3_RIL, reg))
+ smmu->features |= ARM_SMMU_FEAT_RANGE_INV;
+
/* IDR5 */
reg = readl_relaxed(smmu->base + ARM_SMMU_IDR5);
diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index 16c4b87af42b..a6a5796e9c41 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -98,12 +98,10 @@ struct arm_smmu_master_cfg {
s16 smendx[];
};
#define INVALID_SMENDX -1
-#define __fwspec_cfg(fw) ((struct arm_smmu_master_cfg *)fw->iommu_priv)
-#define fwspec_smmu(fw) (__fwspec_cfg(fw)->smmu)
-#define fwspec_smendx(fw, i) \
- (i >= fw->num_ids ? INVALID_SMENDX : __fwspec_cfg(fw)->smendx[i])
-#define for_each_cfg_sme(fw, i, idx) \
- for (i = 0; idx = fwspec_smendx(fw, i), i < fw->num_ids; ++i)
+#define cfg_smendx(cfg, fw, i) \
+ (i >= fw->num_ids ? INVALID_SMENDX : cfg->smendx[i])
+#define for_each_cfg_sme(cfg, fw, i, idx) \
+ for (i = 0; idx = cfg_smendx(cfg, fw, i), i < fw->num_ids; ++i)
static bool using_legacy_binding, using_generic_binding;
@@ -1061,7 +1059,7 @@ static bool arm_smmu_free_sme(struct arm_smmu_device *smmu, int idx)
static int arm_smmu_master_alloc_smes(struct device *dev)
{
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
- struct arm_smmu_master_cfg *cfg = fwspec->iommu_priv;
+ struct arm_smmu_master_cfg *cfg = dev_iommu_priv_get(dev);
struct arm_smmu_device *smmu = cfg->smmu;
struct arm_smmu_smr *smrs = smmu->smrs;
struct iommu_group *group;
@@ -1069,7 +1067,7 @@ static int arm_smmu_master_alloc_smes(struct device *dev)
mutex_lock(&smmu->stream_map_mutex);
/* Figure out a viable stream map entry allocation */
- for_each_cfg_sme(fwspec, i, idx) {
+ for_each_cfg_sme(cfg, fwspec, i, idx) {
u16 sid = FIELD_GET(ARM_SMMU_SMR_ID, fwspec->ids[i]);
u16 mask = FIELD_GET(ARM_SMMU_SMR_MASK, fwspec->ids[i]);
@@ -1100,7 +1098,7 @@ static int arm_smmu_master_alloc_smes(struct device *dev)
iommu_group_put(group);
/* It worked! Now, poke the actual hardware */
- for_each_cfg_sme(fwspec, i, idx) {
+ for_each_cfg_sme(cfg, fwspec, i, idx) {
arm_smmu_write_sme(smmu, idx);
smmu->s2crs[idx].group = group;
}
@@ -1117,14 +1115,14 @@ out_err:
return ret;
}
-static void arm_smmu_master_free_smes(struct iommu_fwspec *fwspec)
+static void arm_smmu_master_free_smes(struct arm_smmu_master_cfg *cfg,
+ struct iommu_fwspec *fwspec)
{
- struct arm_smmu_device *smmu = fwspec_smmu(fwspec);
- struct arm_smmu_master_cfg *cfg = fwspec->iommu_priv;
+ struct arm_smmu_device *smmu = cfg->smmu;
int i, idx;
mutex_lock(&smmu->stream_map_mutex);
- for_each_cfg_sme(fwspec, i, idx) {
+ for_each_cfg_sme(cfg, fwspec, i, idx) {
if (arm_smmu_free_sme(smmu, idx))
arm_smmu_write_sme(smmu, idx);
cfg->smendx[i] = INVALID_SMENDX;
@@ -1133,6 +1131,7 @@ static void arm_smmu_master_free_smes(struct iommu_fwspec *fwspec)
}
static int arm_smmu_domain_add_master(struct arm_smmu_domain *smmu_domain,
+ struct arm_smmu_master_cfg *cfg,
struct iommu_fwspec *fwspec)
{
struct arm_smmu_device *smmu = smmu_domain->smmu;
@@ -1146,7 +1145,7 @@ static int arm_smmu_domain_add_master(struct arm_smmu_domain *smmu_domain,
else
type = S2CR_TYPE_TRANS;
- for_each_cfg_sme(fwspec, i, idx) {
+ for_each_cfg_sme(cfg, fwspec, i, idx) {
if (type == s2cr[idx].type && cbndx == s2cr[idx].cbndx)
continue;
@@ -1160,10 +1159,11 @@ static int arm_smmu_domain_add_master(struct arm_smmu_domain *smmu_domain,
static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
{
- int ret;
+ struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+ struct arm_smmu_master_cfg *cfg;
struct arm_smmu_device *smmu;
- struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+ int ret;
if (!fwspec || fwspec->ops != &arm_smmu_ops) {
dev_err(dev, "cannot attach to SMMU, is it on the same bus?\n");
@@ -1177,10 +1177,11 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
* domains, just say no (but more politely than by dereferencing NULL).
* This should be at least a WARN_ON once that's sorted.
*/
- if (!fwspec->iommu_priv)
+ cfg = dev_iommu_priv_get(dev);
+ if (!cfg)
return -ENODEV;
- smmu = fwspec_smmu(fwspec);
+ smmu = cfg->smmu;
ret = arm_smmu_rpm_get(smmu);
if (ret < 0)
@@ -1204,7 +1205,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
}
/* Looks ok, so add the device to the domain */
- ret = arm_smmu_domain_add_master(smmu_domain, fwspec);
+ ret = arm_smmu_domain_add_master(smmu_domain, cfg, fwspec);
/*
* Setup an autosuspend delay to avoid bouncing runpm state.
@@ -1383,7 +1384,7 @@ struct arm_smmu_device *arm_smmu_get_by_fwnode(struct fwnode_handle *fwnode)
static int arm_smmu_add_device(struct device *dev)
{
- struct arm_smmu_device *smmu;
+ struct arm_smmu_device *smmu = NULL;
struct arm_smmu_master_cfg *cfg;
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
int i, ret;
@@ -1429,7 +1430,7 @@ static int arm_smmu_add_device(struct device *dev)
goto out_free;
cfg->smmu = smmu;
- fwspec->iommu_priv = cfg;
+ dev_iommu_priv_set(dev, cfg);
while (i--)
cfg->smendx[i] = INVALID_SMENDX;
@@ -1467,7 +1468,7 @@ static void arm_smmu_remove_device(struct device *dev)
if (!fwspec || fwspec->ops != &arm_smmu_ops)
return;
- cfg = fwspec->iommu_priv;
+ cfg = dev_iommu_priv_get(dev);
smmu = cfg->smmu;
ret = arm_smmu_rpm_get(smmu);
@@ -1475,23 +1476,25 @@ static void arm_smmu_remove_device(struct device *dev)
return;
iommu_device_unlink(&smmu->iommu, dev);
- arm_smmu_master_free_smes(fwspec);
+ arm_smmu_master_free_smes(cfg, fwspec);
arm_smmu_rpm_put(smmu);
+ dev_iommu_priv_set(dev, NULL);
iommu_group_remove_device(dev);
- kfree(fwspec->iommu_priv);
+ kfree(cfg);
iommu_fwspec_free(dev);
}
static struct iommu_group *arm_smmu_device_group(struct device *dev)
{
+ struct arm_smmu_master_cfg *cfg = dev_iommu_priv_get(dev);
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
- struct arm_smmu_device *smmu = fwspec_smmu(fwspec);
+ struct arm_smmu_device *smmu = cfg->smmu;
struct iommu_group *group = NULL;
int i, idx;
- for_each_cfg_sme(fwspec, i, idx) {
+ for_each_cfg_sme(cfg, fwspec, i, idx) {
if (group && smmu->s2crs[idx].group &&
group != smmu->s2crs[idx].group)
return ERR_PTR(-EINVAL);
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 4be549478691..ef0a5246700e 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -4501,7 +4501,8 @@ static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
struct dmar_atsr_unit *atsru;
struct acpi_dmar_atsr *tmp;
- list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
+ list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
+ dmar_rcu_check()) {
tmp = (struct acpi_dmar_atsr *)atsru->hdr;
if (atsr->segment != tmp->segment)
continue;
diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index d7f2a5358900..2998418f0a38 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -531,7 +531,7 @@ struct page_req_dsc {
u64 priv_data[2];
};
-#define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x10)
+#define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20)
static bool access_error(struct vm_area_struct *vma, struct page_req_dsc *req)
{
@@ -611,14 +611,15 @@ static irqreturn_t prq_event_thread(int irq, void *d)
* any faults on kernel addresses. */
if (!svm->mm)
goto bad_req;
- /* If the mm is already defunct, don't handle faults. */
- if (!mmget_not_zero(svm->mm))
- goto bad_req;
/* If address is not canonical, return invalid response */
if (!is_canonical_address(address))
goto bad_req;
+ /* If the mm is already defunct, don't handle faults. */
+ if (!mmget_not_zero(svm->mm))
+ goto bad_req;
+
down_read(&svm->mm->mmap_sem);
vma = find_extend_vma(svm->mm, address);
if (!vma || address < vma->vm_start)
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 3e3528436e0b..2b471419e26c 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -152,9 +152,9 @@ void iommu_device_unregister(struct iommu_device *iommu)
}
EXPORT_SYMBOL_GPL(iommu_device_unregister);
-static struct iommu_param *iommu_get_dev_param(struct device *dev)
+static struct dev_iommu *dev_iommu_get(struct device *dev)
{
- struct iommu_param *param = dev->iommu_param;
+ struct dev_iommu *param = dev->iommu;
if (param)
return param;
@@ -164,14 +164,14 @@ static struct iommu_param *iommu_get_dev_param(struct device *dev)
return NULL;
mutex_init(&param->lock);
- dev->iommu_param = param;
+ dev->iommu = param;
return param;
}
-static void iommu_free_dev_param(struct device *dev)
+static void dev_iommu_free(struct device *dev)
{
- kfree(dev->iommu_param);
- dev->iommu_param = NULL;
+ kfree(dev->iommu);
+ dev->iommu = NULL;
}
int iommu_probe_device(struct device *dev)
@@ -183,7 +183,7 @@ int iommu_probe_device(struct device *dev)
if (!ops)
return -EINVAL;
- if (!iommu_get_dev_param(dev))
+ if (!dev_iommu_get(dev))
return -ENOMEM;
if (!try_module_get(ops->owner)) {
@@ -200,7 +200,7 @@ int iommu_probe_device(struct device *dev)
err_module_put:
module_put(ops->owner);
err_free_dev_param:
- iommu_free_dev_param(dev);
+ dev_iommu_free(dev);
return ret;
}
@@ -211,9 +211,9 @@ void iommu_release_device(struct device *dev)
if (dev->iommu_group)
ops->remove_device(dev);
- if (dev->iommu_param) {
+ if (dev->iommu) {
module_put(ops->owner);
- iommu_free_dev_param(dev);
+ dev_iommu_free(dev);
}
}
@@ -972,7 +972,7 @@ int iommu_register_device_fault_handler(struct device *dev,
iommu_dev_fault_handler_t handler,
void *data)
{
- struct iommu_param *param = dev->iommu_param;
+ struct dev_iommu *param = dev->iommu;
int ret = 0;
if (!param)
@@ -1015,7 +1015,7 @@ EXPORT_SYMBOL_GPL(iommu_register_device_fault_handler);
*/
int iommu_unregister_device_fault_handler(struct device *dev)
{
- struct iommu_param *param = dev->iommu_param;
+ struct dev_iommu *param = dev->iommu;
int ret = 0;
if (!param)
@@ -1055,7 +1055,7 @@ EXPORT_SYMBOL_GPL(iommu_unregister_device_fault_handler);
*/
int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt)
{
- struct iommu_param *param = dev->iommu_param;
+ struct dev_iommu *param = dev->iommu;
struct iommu_fault_event *evt_pending = NULL;
struct iommu_fault_param *fparam;
int ret = 0;
@@ -1104,7 +1104,7 @@ int iommu_page_response(struct device *dev,
int ret = -EINVAL;
struct iommu_fault_event *evt;
struct iommu_fault_page_request *prm;
- struct iommu_param *param = dev->iommu_param;
+ struct dev_iommu *param = dev->iommu;
struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
if (!domain || !domain->ops->page_response)
@@ -2405,7 +2405,11 @@ int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode,
if (fwspec)
return ops == fwspec->ops ? 0 : -EINVAL;
- fwspec = kzalloc(sizeof(*fwspec), GFP_KERNEL);
+ if (!dev_iommu_get(dev))
+ return -ENOMEM;
+
+ /* Preallocate for the overwhelmingly common case of 1 ID */
+ fwspec = kzalloc(struct_size(fwspec, ids, 1), GFP_KERNEL);
if (!fwspec)
return -ENOMEM;
@@ -2432,15 +2436,15 @@ EXPORT_SYMBOL_GPL(iommu_fwspec_free);
int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids)
{
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
- size_t size;
- int i;
+ int i, new_num;
if (!fwspec)
return -EINVAL;
- size = offsetof(struct iommu_fwspec, ids[fwspec->num_ids + num_ids]);
- if (size > sizeof(*fwspec)) {
- fwspec = krealloc(fwspec, size, GFP_KERNEL);
+ new_num = fwspec->num_ids + num_ids;
+ if (new_num > 1) {
+ fwspec = krealloc(fwspec, struct_size(fwspec, ids, new_num),
+ GFP_KERNEL);
if (!fwspec)
return -ENOMEM;
@@ -2450,7 +2454,7 @@ int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids)
for (i = 0; i < num_ids; i++)
fwspec->ids[fwspec->num_ids + i] = ids[i];
- fwspec->num_ids += num_ids;
+ fwspec->num_ids = new_num;
return 0;
}
EXPORT_SYMBOL_GPL(iommu_fwspec_add_ids);
diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c
index ecb3f9464dd5..310cf09feea3 100644
--- a/drivers/iommu/ipmmu-vmsa.c
+++ b/drivers/iommu/ipmmu-vmsa.c
@@ -89,9 +89,7 @@ static struct ipmmu_vmsa_domain *to_vmsa_domain(struct iommu_domain *dom)
static struct ipmmu_vmsa_device *to_ipmmu(struct device *dev)
{
- struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
-
- return fwspec ? fwspec->iommu_priv : NULL;
+ return dev_iommu_priv_get(dev);
}
#define TLB_LOOP_TIMEOUT 100 /* 100us */
@@ -727,14 +725,13 @@ static phys_addr_t ipmmu_iova_to_phys(struct iommu_domain *io_domain,
static int ipmmu_init_platform_device(struct device *dev,
struct of_phandle_args *args)
{
- struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct platform_device *ipmmu_pdev;
ipmmu_pdev = of_find_device_by_node(args->np);
if (!ipmmu_pdev)
return -ENODEV;
- fwspec->iommu_priv = platform_get_drvdata(ipmmu_pdev);
+ dev_iommu_priv_set(dev, platform_get_drvdata(ipmmu_pdev));
return 0;
}
diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c
index 95945f467c03..5f4d6df59cf6 100644
--- a/drivers/iommu/mtk_iommu.c
+++ b/drivers/iommu/mtk_iommu.c
@@ -358,8 +358,8 @@ static void mtk_iommu_domain_free(struct iommu_domain *domain)
static int mtk_iommu_attach_device(struct iommu_domain *domain,
struct device *dev)
{
+ struct mtk_iommu_data *data = dev_iommu_priv_get(dev);
struct mtk_iommu_domain *dom = to_mtk_domain(domain);
- struct mtk_iommu_data *data = dev_iommu_fwspec_get(dev)->iommu_priv;
if (!data)
return -ENODEV;
@@ -378,7 +378,7 @@ static int mtk_iommu_attach_device(struct iommu_domain *domain,
static void mtk_iommu_detach_device(struct iommu_domain *domain,
struct device *dev)
{
- struct mtk_iommu_data *data = dev_iommu_fwspec_get(dev)->iommu_priv;
+ struct mtk_iommu_data *data = dev_iommu_priv_get(dev);
if (!data)
return;
@@ -450,7 +450,7 @@ static int mtk_iommu_add_device(struct device *dev)
if (!fwspec || fwspec->ops != &mtk_iommu_ops)
return -ENODEV; /* Not a iommu client device */
- data = fwspec->iommu_priv;
+ data = dev_iommu_priv_get(dev);
iommu_device_link(&data->iommu, dev);
group = iommu_group_get_for_dev(dev);
@@ -469,7 +469,7 @@ static void mtk_iommu_remove_device(struct device *dev)
if (!fwspec || fwspec->ops != &mtk_iommu_ops)
return;
- data = fwspec->iommu_priv;
+ data = dev_iommu_priv_get(dev);
iommu_device_unlink(&data->iommu, dev);
iommu_group_remove_device(dev);
@@ -496,7 +496,6 @@ static struct iommu_group *mtk_iommu_device_group(struct device *dev)
static int mtk_iommu_of_xlate(struct device *dev, struct of_phandle_args *args)
{
- struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct platform_device *m4updev;
if (args->args_count != 1) {
@@ -505,13 +504,13 @@ static int mtk_iommu_of_xlate(struct device *dev, struct of_phandle_args *args)
return -EINVAL;
}
- if (!fwspec->iommu_priv) {
+ if (!dev_iommu_priv_get(dev)) {
/* Get the m4u device */
m4updev = of_find_device_by_node(args->np);
if (WARN_ON(!m4updev))
return -EINVAL;
- fwspec->iommu_priv = platform_get_drvdata(m4updev);
+ dev_iommu_priv_set(dev, platform_get_drvdata(m4updev));
}
return iommu_fwspec_add_ids(dev, args->args, 1);
diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c
index e93b94ecac45..a31be05601c9 100644
--- a/drivers/iommu/mtk_iommu_v1.c
+++ b/drivers/iommu/mtk_iommu_v1.c
@@ -263,8 +263,8 @@ static void mtk_iommu_domain_free(struct iommu_domain *domain)
static int mtk_iommu_attach_device(struct iommu_domain *domain,
struct device *dev)
{
+ struct mtk_iommu_data *data = dev_iommu_priv_get(dev);
struct mtk_iommu_domain *dom = to_mtk_domain(domain);
- struct mtk_iommu_data *data = dev_iommu_fwspec_get(dev)->iommu_priv;
int ret;
if (!data)
@@ -286,7 +286,7 @@ static int mtk_iommu_attach_device(struct iommu_domain *domain,
static void mtk_iommu_detach_device(struct iommu_domain *domain,
struct device *dev)
{
- struct mtk_iommu_data *data = dev_iommu_fwspec_get(dev)->iommu_priv;
+ struct mtk_iommu_data *data = dev_iommu_priv_get(dev);
if (!data)
return;
@@ -387,20 +387,20 @@ static int mtk_iommu_create_mapping(struct device *dev,
return -EINVAL;
}
- if (!fwspec->iommu_priv) {
+ if (!dev_iommu_priv_get(dev)) {
/* Get the m4u device */
m4updev = of_find_device_by_node(args->np);
if (WARN_ON(!m4updev))
return -EINVAL;
- fwspec->iommu_priv = platform_get_drvdata(m4updev);
+ dev_iommu_priv_set(dev, platform_get_drvdata(m4updev));
}
ret = iommu_fwspec_add_ids(dev, args->args, 1);
if (ret)
return ret;
- data = fwspec->iommu_priv;
+ data = dev_iommu_priv_get(dev);
m4udev = data->dev;
mtk_mapping = m4udev->archdata.iommu;
if (!mtk_mapping) {
@@ -459,7 +459,7 @@ static int mtk_iommu_add_device(struct device *dev)
if (err)
return err;
- data = fwspec->iommu_priv;
+ data = dev_iommu_priv_get(dev);
mtk_mapping = data->dev->archdata.iommu;
err = arm_iommu_attach_device(dev, mtk_mapping);
if (err) {
@@ -478,7 +478,7 @@ static void mtk_iommu_remove_device(struct device *dev)
if (!fwspec || fwspec->ops != &mtk_iommu_ops)
return;
- data = fwspec->iommu_priv;
+ data = dev_iommu_priv_get(dev);
iommu_device_unlink(&data->iommu, dev);
iommu_group_remove_device(dev);
diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c
index be551cc34be4..887fefcb03b4 100644
--- a/drivers/iommu/omap-iommu.c
+++ b/drivers/iommu/omap-iommu.c
@@ -167,7 +167,7 @@ static int omap2_iommu_enable(struct omap_iommu *obj)
{
u32 l, pa;
- if (!obj->iopgd || !IS_ALIGNED((u32)obj->iopgd, SZ_16K))
+ if (!obj->iopgd || !IS_ALIGNED((unsigned long)obj->iopgd, SZ_16K))
return -EINVAL;
pa = virt_to_phys(obj->iopgd);
@@ -434,7 +434,7 @@ static void flush_iotlb_page(struct omap_iommu *obj, u32 da)
bytes = iopgsz_to_bytes(cr.cam & 3);
if ((start <= da) && (da < start + bytes)) {
- dev_dbg(obj->dev, "%s: %08x<=%08x(%x)\n",
+ dev_dbg(obj->dev, "%s: %08x<=%08x(%zx)\n",
__func__, start, da, bytes);
iotlb_load_cr(obj, &cr);
iommu_write_reg(obj, 1, MMU_FLUSH_ENTRY);
@@ -1352,11 +1352,11 @@ static int omap_iommu_map(struct iommu_domain *domain, unsigned long da,
omap_pgsz = bytes_to_iopgsz(bytes);
if (omap_pgsz < 0) {
- dev_err(dev, "invalid size to map: %d\n", bytes);
+ dev_err(dev, "invalid size to map: %zu\n", bytes);
return -EINVAL;
}
- dev_dbg(dev, "mapping da 0x%lx to pa %pa size 0x%x\n", da, &pa, bytes);
+ dev_dbg(dev, "mapping da 0x%lx to pa %pa size 0x%zx\n", da, &pa, bytes);
iotlb_init_entry(&e, da, pa, omap_pgsz);
@@ -1393,7 +1393,7 @@ static size_t omap_iommu_unmap(struct iommu_domain *domain, unsigned long da,
size_t bytes = 0;
int i;
- dev_dbg(dev, "unmapping da 0x%lx size %u\n", da, size);
+ dev_dbg(dev, "unmapping da 0x%lx size %zu\n", da, size);
iommu = omap_domain->iommus;
for (i = 0; i < omap_domain->num_iommus; i++, iommu++) {
diff --git a/drivers/iommu/omap-iopgtable.h b/drivers/iommu/omap-iopgtable.h
index 1a4adb59a859..51d74002cc30 100644
--- a/drivers/iommu/omap-iopgtable.h
+++ b/drivers/iommu/omap-iopgtable.h
@@ -63,7 +63,8 @@
*
* va to pa translation
*/
-static inline phys_addr_t omap_iommu_translate(u32 d, u32 va, u32 mask)
+static inline phys_addr_t omap_iommu_translate(unsigned long d, dma_addr_t va,
+ dma_addr_t mask)
{
return (d & mask) | (va & (~mask));
}
diff --git a/drivers/iommu/qcom_iommu.c b/drivers/iommu/qcom_iommu.c
index 4328da0b0a9f..0e2a96467767 100644
--- a/drivers/iommu/qcom_iommu.c
+++ b/drivers/iommu/qcom_iommu.c
@@ -48,7 +48,7 @@ struct qcom_iommu_dev {
void __iomem *local_base;
u32 sec_id;
u8 num_ctxs;
- struct qcom_iommu_ctx *ctxs[0]; /* indexed by asid-1 */
+ struct qcom_iommu_ctx *ctxs[]; /* indexed by asid-1 */
};
struct qcom_iommu_ctx {
@@ -74,16 +74,19 @@ static struct qcom_iommu_domain *to_qcom_iommu_domain(struct iommu_domain *dom)
static const struct iommu_ops qcom_iommu_ops;
-static struct qcom_iommu_dev * to_iommu(struct iommu_fwspec *fwspec)
+static struct qcom_iommu_dev * to_iommu(struct device *dev)
{
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+
if (!fwspec || fwspec->ops != &qcom_iommu_ops)
return NULL;
- return fwspec->iommu_priv;
+
+ return dev_iommu_priv_get(dev);
}
-static struct qcom_iommu_ctx * to_ctx(struct iommu_fwspec *fwspec, unsigned asid)
+static struct qcom_iommu_ctx * to_ctx(struct device *dev, unsigned asid)
{
- struct qcom_iommu_dev *qcom_iommu = to_iommu(fwspec);
+ struct qcom_iommu_dev *qcom_iommu = to_iommu(dev);
if (!qcom_iommu)
return NULL;
return qcom_iommu->ctxs[asid - 1];
@@ -115,11 +118,14 @@ iommu_readq(struct qcom_iommu_ctx *ctx, unsigned reg)
static void qcom_iommu_tlb_sync(void *cookie)
{
- struct iommu_fwspec *fwspec = cookie;
+ struct iommu_fwspec *fwspec;
+ struct device *dev = cookie;
unsigned i;
+ fwspec = dev_iommu_fwspec_get(dev);
+
for (i = 0; i < fwspec->num_ids; i++) {
- struct qcom_iommu_ctx *ctx = to_ctx(fwspec, fwspec->ids[i]);
+ struct qcom_iommu_ctx *ctx = to_ctx(dev, fwspec->ids[i]);
unsigned int val, ret;
iommu_writel(ctx, ARM_SMMU_CB_TLBSYNC, 0);
@@ -133,11 +139,14 @@ static void qcom_iommu_tlb_sync(void *cookie)
static void qcom_iommu_tlb_inv_context(void *cookie)
{
- struct iommu_fwspec *fwspec = cookie;
+ struct device *dev = cookie;
+ struct iommu_fwspec *fwspec;
unsigned i;
+ fwspec = dev_iommu_fwspec_get(dev);
+
for (i = 0; i < fwspec->num_ids; i++) {
- struct qcom_iommu_ctx *ctx = to_ctx(fwspec, fwspec->ids[i]);
+ struct qcom_iommu_ctx *ctx = to_ctx(dev, fwspec->ids[i]);
iommu_writel(ctx, ARM_SMMU_CB_S1_TLBIASID, ctx->asid);
}
@@ -147,13 +156,16 @@ static void qcom_iommu_tlb_inv_context(void *cookie)
static void qcom_iommu_tlb_inv_range_nosync(unsigned long iova, size_t size,
size_t granule, bool leaf, void *cookie)
{
- struct iommu_fwspec *fwspec = cookie;
+ struct device *dev = cookie;
+ struct iommu_fwspec *fwspec;
unsigned i, reg;
reg = leaf ? ARM_SMMU_CB_S1_TLBIVAL : ARM_SMMU_CB_S1_TLBIVA;
+ fwspec = dev_iommu_fwspec_get(dev);
+
for (i = 0; i < fwspec->num_ids; i++) {
- struct qcom_iommu_ctx *ctx = to_ctx(fwspec, fwspec->ids[i]);
+ struct qcom_iommu_ctx *ctx = to_ctx(dev, fwspec->ids[i]);
size_t s = size;
iova = (iova >> 12) << 12;
@@ -222,9 +234,10 @@ static irqreturn_t qcom_iommu_fault(int irq, void *dev)
static int qcom_iommu_init_domain(struct iommu_domain *domain,
struct qcom_iommu_dev *qcom_iommu,
- struct iommu_fwspec *fwspec)
+ struct device *dev)
{
struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain);
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct io_pgtable_ops *pgtbl_ops;
struct io_pgtable_cfg pgtbl_cfg;
int i, ret = 0;
@@ -243,7 +256,7 @@ static int qcom_iommu_init_domain(struct iommu_domain *domain,
};
qcom_domain->iommu = qcom_iommu;
- pgtbl_ops = alloc_io_pgtable_ops(ARM_32_LPAE_S1, &pgtbl_cfg, fwspec);
+ pgtbl_ops = alloc_io_pgtable_ops(ARM_32_LPAE_S1, &pgtbl_cfg, dev);
if (!pgtbl_ops) {
dev_err(qcom_iommu->dev, "failed to allocate pagetable ops\n");
ret = -ENOMEM;
@@ -256,7 +269,7 @@ static int qcom_iommu_init_domain(struct iommu_domain *domain,
domain->geometry.force_aperture = true;
for (i = 0; i < fwspec->num_ids; i++) {
- struct qcom_iommu_ctx *ctx = to_ctx(fwspec, fwspec->ids[i]);
+ struct qcom_iommu_ctx *ctx = to_ctx(dev, fwspec->ids[i]);
if (!ctx->secure_init) {
ret = qcom_scm_restore_sec_cfg(qcom_iommu->sec_id, ctx->asid);
@@ -363,8 +376,7 @@ static void qcom_iommu_domain_free(struct iommu_domain *domain)
static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
{
- struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
- struct qcom_iommu_dev *qcom_iommu = to_iommu(fwspec);
+ struct qcom_iommu_dev *qcom_iommu = to_iommu(dev);
struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain);
int ret;
@@ -375,7 +387,7 @@ static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev
/* Ensure that the domain is finalized */
pm_runtime_get_sync(qcom_iommu->dev);
- ret = qcom_iommu_init_domain(domain, qcom_iommu, fwspec);
+ ret = qcom_iommu_init_domain(domain, qcom_iommu, dev);
pm_runtime_put_sync(qcom_iommu->dev);
if (ret < 0)
return ret;
@@ -397,9 +409,9 @@ static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev
static void qcom_iommu_detach_dev(struct iommu_domain *domain, struct device *dev)
{
- struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
- struct qcom_iommu_dev *qcom_iommu = to_iommu(fwspec);
struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain);
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+ struct qcom_iommu_dev *qcom_iommu = to_iommu(dev);
unsigned i;
if (WARN_ON(!qcom_domain->iommu))
@@ -407,7 +419,7 @@ static void qcom_iommu_detach_dev(struct iommu_domain *domain, struct device *de
pm_runtime_get_sync(qcom_iommu->dev);
for (i = 0; i < fwspec->num_ids; i++) {
- struct qcom_iommu_ctx *ctx = to_ctx(fwspec, fwspec->ids[i]);
+ struct qcom_iommu_ctx *ctx = to_ctx(dev, fwspec->ids[i]);
/* Disable the context bank: */
iommu_writel(ctx, ARM_SMMU_CB_SCTLR, 0);
@@ -514,7 +526,7 @@ static bool qcom_iommu_capable(enum iommu_cap cap)
static int qcom_iommu_add_device(struct device *dev)
{
- struct qcom_iommu_dev *qcom_iommu = to_iommu(dev_iommu_fwspec_get(dev));
+ struct qcom_iommu_dev *qcom_iommu = to_iommu(dev);
struct iommu_group *group;
struct device_link *link;
@@ -545,7 +557,7 @@ static int qcom_iommu_add_device(struct device *dev)
static void qcom_iommu_remove_device(struct device *dev)
{
- struct qcom_iommu_dev *qcom_iommu = to_iommu(dev_iommu_fwspec_get(dev));
+ struct qcom_iommu_dev *qcom_iommu = to_iommu(dev);
if (!qcom_iommu)
return;
@@ -557,7 +569,6 @@ static void qcom_iommu_remove_device(struct device *dev)
static int qcom_iommu_of_xlate(struct device *dev, struct of_phandle_args *args)
{
- struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct qcom_iommu_dev *qcom_iommu;
struct platform_device *iommu_pdev;
unsigned asid = args->args[0];
@@ -583,14 +594,14 @@ static int qcom_iommu_of_xlate(struct device *dev, struct of_phandle_args *args)
WARN_ON(asid > qcom_iommu->num_ctxs))
return -EINVAL;
- if (!fwspec->iommu_priv) {
- fwspec->iommu_priv = qcom_iommu;
+ if (!dev_iommu_priv_get(dev)) {
+ dev_iommu_priv_set(dev, qcom_iommu);
} else {
/* make sure devices iommus dt node isn't referring to
* multiple different iommu devices. Multiple context
* banks are ok, but multiple devices are not:
*/
- if (WARN_ON(qcom_iommu != fwspec->iommu_priv))
+ if (WARN_ON(qcom_iommu != dev_iommu_priv_get(dev)))
return -EINVAL;
}
diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c
index 3fb7ba72507d..db6559e8336f 100644
--- a/drivers/iommu/tegra-gart.c
+++ b/drivers/iommu/tegra-gart.c
@@ -247,7 +247,7 @@ static int gart_iommu_add_device(struct device *dev)
{
struct iommu_group *group;
- if (!dev->iommu_fwspec)
+ if (!dev_iommu_fwspec_get(dev))
return -ENODEV;
group = iommu_group_get_for_dev(dev);
diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
index cce329d71fba..d5cac4f46ca5 100644
--- a/drivers/iommu/virtio-iommu.c
+++ b/drivers/iommu/virtio-iommu.c
@@ -466,7 +466,7 @@ static int viommu_probe_endpoint(struct viommu_dev *viommu, struct device *dev)
struct virtio_iommu_req_probe *probe;
struct virtio_iommu_probe_property *prop;
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
- struct viommu_endpoint *vdev = fwspec->iommu_priv;
+ struct viommu_endpoint *vdev = dev_iommu_priv_get(dev);
if (!fwspec->num_ids)
return -EINVAL;
@@ -607,24 +607,36 @@ static struct iommu_domain *viommu_domain_alloc(unsigned type)
return &vdomain->domain;
}
-static int viommu_domain_finalise(struct viommu_dev *viommu,
+static int viommu_domain_finalise(struct viommu_endpoint *vdev,
struct iommu_domain *domain)
{
int ret;
+ unsigned long viommu_page_size;
+ struct viommu_dev *viommu = vdev->viommu;
struct viommu_domain *vdomain = to_viommu_domain(domain);
- vdomain->viommu = viommu;
- vdomain->map_flags = viommu->map_flags;
+ viommu_page_size = 1UL << __ffs(viommu->pgsize_bitmap);
+ if (viommu_page_size > PAGE_SIZE) {
+ dev_err(vdev->dev,
+ "granule 0x%lx larger than system page size 0x%lx\n",
+ viommu_page_size, PAGE_SIZE);
+ return -EINVAL;
+ }
+
+ ret = ida_alloc_range(&viommu->domain_ids, viommu->first_domain,
+ viommu->last_domain, GFP_KERNEL);
+ if (ret < 0)
+ return ret;
+
+ vdomain->id = (unsigned int)ret;
domain->pgsize_bitmap = viommu->pgsize_bitmap;
domain->geometry = viommu->geometry;
- ret = ida_alloc_range(&viommu->domain_ids, viommu->first_domain,
- viommu->last_domain, GFP_KERNEL);
- if (ret >= 0)
- vdomain->id = (unsigned int)ret;
+ vdomain->map_flags = viommu->map_flags;
+ vdomain->viommu = viommu;
- return ret > 0 ? 0 : ret;
+ return 0;
}
static void viommu_domain_free(struct iommu_domain *domain)
@@ -648,7 +660,7 @@ static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev)
int ret = 0;
struct virtio_iommu_req_attach req;
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
- struct viommu_endpoint *vdev = fwspec->iommu_priv;
+ struct viommu_endpoint *vdev = dev_iommu_priv_get(dev);
struct viommu_domain *vdomain = to_viommu_domain(domain);
mutex_lock(&vdomain->mutex);
@@ -657,7 +669,7 @@ static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev)
* Properly initialize the domain now that we know which viommu
* owns it.
*/
- ret = viommu_domain_finalise(vdev->viommu, domain);
+ ret = viommu_domain_finalise(vdev, domain);
} else if (vdomain->viommu != vdev->viommu) {
dev_err(dev, "cannot attach to foreign vIOMMU\n");
ret = -EXDEV;
@@ -807,8 +819,7 @@ static void viommu_iotlb_sync(struct iommu_domain *domain,
static void viommu_get_resv_regions(struct device *dev, struct list_head *head)
{
struct iommu_resv_region *entry, *new_entry, *msi = NULL;
- struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
- struct viommu_endpoint *vdev = fwspec->iommu_priv;
+ struct viommu_endpoint *vdev = dev_iommu_priv_get(dev);
int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO;
list_for_each_entry(entry, &vdev->resv_regions, list) {
@@ -876,7 +887,7 @@ static int viommu_add_device(struct device *dev)
vdev->dev = dev;
vdev->viommu = viommu;
INIT_LIST_HEAD(&vdev->resv_regions);
- fwspec->iommu_priv = vdev;
+ dev_iommu_priv_set(dev, vdev);
if (viommu->probe_size) {
/* Get additional information for this endpoint */
@@ -920,7 +931,7 @@ static void viommu_remove_device(struct device *dev)
if (!fwspec || fwspec->ops != &viommu_ops)
return;
- vdev = fwspec->iommu_priv;
+ vdev = dev_iommu_priv_get(dev);
iommu_group_remove_device(dev);
iommu_device_unlink(&vdev->viommu->iommu, dev);
@@ -1082,7 +1093,6 @@ static int viommu_probe(struct virtio_device *vdev)
#ifdef CONFIG_PCI
if (pci_bus_type.iommu_ops != &viommu_ops) {
- pci_request_acs();
ret = bus_set_iommu(&pci_bus_type, &viommu_ops);
if (ret)
goto err_unregister;
diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig
index d82f1dea3711..c664d84e1667 100644
--- a/drivers/leds/Kconfig
+++ b/drivers/leds/Kconfig
@@ -846,6 +846,17 @@ config LEDS_TPS6105X
It is a single boost converter primarily for white LEDs and
audio amplifiers.
+config LEDS_IP30
+ tristate "LED support for SGI Octane machines"
+ depends on LEDS_CLASS
+ depends on SGI_MFD_IOC3
+ help
+ This option enables support for the Red and White LEDs of
+ SGI Octane machines.
+
+ To compile this driver as a module, choose M here: the module
+ will be called leds-ip30.
+
comment "LED Triggers"
source "drivers/leds/trigger/Kconfig"
diff --git a/drivers/leds/Makefile b/drivers/leds/Makefile
index d7e1107753fb..45235d5fb218 100644
--- a/drivers/leds/Makefile
+++ b/drivers/leds/Makefile
@@ -6,91 +6,92 @@ obj-$(CONFIG_LEDS_CLASS) += led-class.o
obj-$(CONFIG_LEDS_CLASS_FLASH) += led-class-flash.o
obj-$(CONFIG_LEDS_TRIGGERS) += led-triggers.o
-# LED Platform Drivers
+# LED Platform Drivers (keep this sorted, M-| sort)
obj-$(CONFIG_LEDS_88PM860X) += leds-88pm860x.o
obj-$(CONFIG_LEDS_AAT1290) += leds-aat1290.o
+obj-$(CONFIG_LEDS_ADP5520) += leds-adp5520.o
+obj-$(CONFIG_LEDS_AN30259A) += leds-an30259a.o
obj-$(CONFIG_LEDS_APU) += leds-apu.o
obj-$(CONFIG_LEDS_AS3645A) += leds-as3645a.o
-obj-$(CONFIG_LEDS_AN30259A) += leds-an30259a.o
+obj-$(CONFIG_LEDS_ASIC3) += leds-asic3.o
obj-$(CONFIG_LEDS_BCM6328) += leds-bcm6328.o
obj-$(CONFIG_LEDS_BCM6358) += leds-bcm6358.o
obj-$(CONFIG_LEDS_BD2802) += leds-bd2802.o
+obj-$(CONFIG_LEDS_BLINKM) += leds-blinkm.o
+obj-$(CONFIG_LEDS_CLEVO_MAIL) += leds-clevo-mail.o
+obj-$(CONFIG_LEDS_COBALT_QUBE) += leds-cobalt-qube.o
+obj-$(CONFIG_LEDS_COBALT_RAQ) += leds-cobalt-raq.o
obj-$(CONFIG_LEDS_CPCAP) += leds-cpcap.o
-obj-$(CONFIG_LEDS_LOCOMO) += leds-locomo.o
+obj-$(CONFIG_LEDS_DA903X) += leds-da903x.o
+obj-$(CONFIG_LEDS_DA9052) += leds-da9052.o
+obj-$(CONFIG_LEDS_FSG) += leds-fsg.o
+obj-$(CONFIG_LEDS_GPIO) += leds-gpio.o
+obj-$(CONFIG_LEDS_GPIO_REGISTER) += leds-gpio-register.o
+obj-$(CONFIG_LEDS_HP6XX) += leds-hp6xx.o
+obj-$(CONFIG_LEDS_INTEL_SS4200) += leds-ss4200.o
+obj-$(CONFIG_LEDS_IP30) += leds-ip30.o
+obj-$(CONFIG_LEDS_IPAQ_MICRO) += leds-ipaq-micro.o
+obj-$(CONFIG_LEDS_IS31FL319X) += leds-is31fl319x.o
+obj-$(CONFIG_LEDS_IS31FL32XX) += leds-is31fl32xx.o
+obj-$(CONFIG_LEDS_KTD2692) += leds-ktd2692.o
obj-$(CONFIG_LEDS_LM3530) += leds-lm3530.o
obj-$(CONFIG_LEDS_LM3532) += leds-lm3532.o
obj-$(CONFIG_LEDS_LM3533) += leds-lm3533.o
+obj-$(CONFIG_LEDS_LM355x) += leds-lm355x.o
+obj-$(CONFIG_LEDS_LM3601X) += leds-lm3601x.o
+obj-$(CONFIG_LEDS_LM36274) += leds-lm36274.o
obj-$(CONFIG_LEDS_LM3642) += leds-lm3642.o
-obj-$(CONFIG_LEDS_MIKROTIK_RB532) += leds-rb532.o
-obj-$(CONFIG_LEDS_S3C24XX) += leds-s3c24xx.o
-obj-$(CONFIG_LEDS_NET48XX) += leds-net48xx.o
-obj-$(CONFIG_LEDS_WRAP) += leds-wrap.o
-obj-$(CONFIG_LEDS_COBALT_QUBE) += leds-cobalt-qube.o
-obj-$(CONFIG_LEDS_COBALT_RAQ) += leds-cobalt-raq.o
-obj-$(CONFIG_LEDS_SUNFIRE) += leds-sunfire.o
-obj-$(CONFIG_LEDS_PCA9532) += leds-pca9532.o
-obj-$(CONFIG_LEDS_GPIO_REGISTER) += leds-gpio-register.o
-obj-$(CONFIG_LEDS_GPIO) += leds-gpio.o
+obj-$(CONFIG_LEDS_LM3692X) += leds-lm3692x.o
+obj-$(CONFIG_LEDS_LM3697) += leds-lm3697.o
+obj-$(CONFIG_LEDS_LOCOMO) += leds-locomo.o
obj-$(CONFIG_LEDS_LP3944) += leds-lp3944.o
obj-$(CONFIG_LEDS_LP3952) += leds-lp3952.o
-obj-$(CONFIG_LEDS_LP55XX_COMMON) += leds-lp55xx-common.o
obj-$(CONFIG_LEDS_LP5521) += leds-lp5521.o
obj-$(CONFIG_LEDS_LP5523) += leds-lp5523.o
obj-$(CONFIG_LEDS_LP5562) += leds-lp5562.o
+obj-$(CONFIG_LEDS_LP55XX_COMMON) += leds-lp55xx-common.o
obj-$(CONFIG_LEDS_LP8501) += leds-lp8501.o
obj-$(CONFIG_LEDS_LP8788) += leds-lp8788.o
obj-$(CONFIG_LEDS_LP8860) += leds-lp8860.o
-obj-$(CONFIG_LEDS_TCA6507) += leds-tca6507.o
-obj-$(CONFIG_LEDS_TLC591XX) += leds-tlc591xx.o
-obj-$(CONFIG_LEDS_CLEVO_MAIL) += leds-clevo-mail.o
-obj-$(CONFIG_LEDS_IPAQ_MICRO) += leds-ipaq-micro.o
-obj-$(CONFIG_LEDS_HP6XX) += leds-hp6xx.o
-obj-$(CONFIG_LEDS_OT200) += leds-ot200.o
-obj-$(CONFIG_LEDS_FSG) += leds-fsg.o
-obj-$(CONFIG_LEDS_PCA955X) += leds-pca955x.o
-obj-$(CONFIG_LEDS_PCA963X) += leds-pca963x.o
-obj-$(CONFIG_LEDS_DA903X) += leds-da903x.o
-obj-$(CONFIG_LEDS_DA9052) += leds-da9052.o
-obj-$(CONFIG_LEDS_WM831X_STATUS) += leds-wm831x-status.o
-obj-$(CONFIG_LEDS_WM8350) += leds-wm8350.o
-obj-$(CONFIG_LEDS_PWM) += leds-pwm.o
-obj-$(CONFIG_LEDS_REGULATOR) += leds-regulator.o
-obj-$(CONFIG_LEDS_INTEL_SS4200) += leds-ss4200.o
obj-$(CONFIG_LEDS_LT3593) += leds-lt3593.o
-obj-$(CONFIG_LEDS_ADP5520) += leds-adp5520.o
-obj-$(CONFIG_LEDS_MC13783) += leds-mc13783.o
-obj-$(CONFIG_LEDS_NS2) += leds-ns2.o
-obj-$(CONFIG_LEDS_NETXBIG) += leds-netxbig.o
-obj-$(CONFIG_LEDS_ASIC3) += leds-asic3.o
obj-$(CONFIG_LEDS_MAX77650) += leds-max77650.o
obj-$(CONFIG_LEDS_MAX77693) += leds-max77693.o
obj-$(CONFIG_LEDS_MAX8997) += leds-max8997.o
-obj-$(CONFIG_LEDS_LM355x) += leds-lm355x.o
-obj-$(CONFIG_LEDS_BLINKM) += leds-blinkm.o
-obj-$(CONFIG_LEDS_SYSCON) += leds-syscon.o
+obj-$(CONFIG_LEDS_MC13783) += leds-mc13783.o
obj-$(CONFIG_LEDS_MENF21BMC) += leds-menf21bmc.o
-obj-$(CONFIG_LEDS_KTD2692) += leds-ktd2692.o
-obj-$(CONFIG_LEDS_POWERNV) += leds-powernv.o
-obj-$(CONFIG_LEDS_IS31FL319X) += leds-is31fl319x.o
-obj-$(CONFIG_LEDS_IS31FL32XX) += leds-is31fl32xx.o
-obj-$(CONFIG_LEDS_PM8058) += leds-pm8058.o
+obj-$(CONFIG_LEDS_MIKROTIK_RB532) += leds-rb532.o
obj-$(CONFIG_LEDS_MLXCPLD) += leds-mlxcpld.o
obj-$(CONFIG_LEDS_MLXREG) += leds-mlxreg.o
-obj-$(CONFIG_LEDS_NIC78BX) += leds-nic78bx.o
-obj-$(CONFIG_LEDS_SPI_BYTE) += leds-spi-byte.o
obj-$(CONFIG_LEDS_MT6323) += leds-mt6323.o
-obj-$(CONFIG_LEDS_LM3692X) += leds-lm3692x.o
+obj-$(CONFIG_LEDS_NET48XX) += leds-net48xx.o
+obj-$(CONFIG_LEDS_NETXBIG) += leds-netxbig.o
+obj-$(CONFIG_LEDS_NIC78BX) += leds-nic78bx.o
+obj-$(CONFIG_LEDS_NS2) += leds-ns2.o
+obj-$(CONFIG_LEDS_OT200) += leds-ot200.o
+obj-$(CONFIG_LEDS_PCA9532) += leds-pca9532.o
+obj-$(CONFIG_LEDS_PCA955X) += leds-pca955x.o
+obj-$(CONFIG_LEDS_PCA963X) += leds-pca963x.o
+obj-$(CONFIG_LEDS_PM8058) += leds-pm8058.o
+obj-$(CONFIG_LEDS_POWERNV) += leds-powernv.o
+obj-$(CONFIG_LEDS_PWM) += leds-pwm.o
+obj-$(CONFIG_LEDS_REGULATOR) += leds-regulator.o
+obj-$(CONFIG_LEDS_S3C24XX) += leds-s3c24xx.o
obj-$(CONFIG_LEDS_SC27XX_BLTC) += leds-sc27xx-bltc.o
-obj-$(CONFIG_LEDS_LM3601X) += leds-lm3601x.o
+obj-$(CONFIG_LEDS_SUNFIRE) += leds-sunfire.o
+obj-$(CONFIG_LEDS_SYSCON) += leds-syscon.o
+obj-$(CONFIG_LEDS_TCA6507) += leds-tca6507.o
obj-$(CONFIG_LEDS_TI_LMU_COMMON) += leds-ti-lmu-common.o
-obj-$(CONFIG_LEDS_LM3697) += leds-lm3697.o
-obj-$(CONFIG_LEDS_LM36274) += leds-lm36274.o
+obj-$(CONFIG_LEDS_TLC591XX) += leds-tlc591xx.o
obj-$(CONFIG_LEDS_TPS6105X) += leds-tps6105x.o
+obj-$(CONFIG_LEDS_WM831X_STATUS) += leds-wm831x-status.o
+obj-$(CONFIG_LEDS_WM8350) += leds-wm8350.o
+obj-$(CONFIG_LEDS_WRAP) += leds-wrap.o
# LED SPI Drivers
obj-$(CONFIG_LEDS_CR0014114) += leds-cr0014114.o
obj-$(CONFIG_LEDS_DAC124S085) += leds-dac124s085.o
obj-$(CONFIG_LEDS_EL15203000) += leds-el15203000.o
+obj-$(CONFIG_LEDS_SPI_BYTE) += leds-spi-byte.o
# LED Userspace Drivers
obj-$(CONFIG_LEDS_USER) += uleds.o
diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c
index 1fc40e8af75e..3363a6551a70 100644
--- a/drivers/leds/led-class.c
+++ b/drivers/leds/led-class.c
@@ -376,7 +376,7 @@ int led_classdev_register_ext(struct device *parent,
if (ret)
dev_warn(parent, "Led %s renamed to %s due to name collision",
- led_cdev->name, dev_name(led_cdev->dev));
+ proposed_name, dev_name(led_cdev->dev));
if (led_cdev->flags & LED_BRIGHT_HW_CHANGED) {
ret = led_add_brightness_hw_changed(led_cdev);
diff --git a/drivers/leds/leds-bd2802.c b/drivers/leds/leds-bd2802.c
index bd61a823d0ca..8bbaef5a2986 100644
--- a/drivers/leds/leds-bd2802.c
+++ b/drivers/leds/leds-bd2802.c
@@ -660,7 +660,6 @@ static int bd2802_probe(struct i2c_client *client,
const struct i2c_device_id *id)
{
struct bd2802_led *led;
- struct bd2802_led_platform_data *pdata;
int ret, i;
led = devm_kzalloc(&client->dev, sizeof(struct bd2802_led), GFP_KERNEL);
@@ -668,7 +667,6 @@ static int bd2802_probe(struct i2c_client *client,
return -ENOMEM;
led->client = client;
- pdata = led->pdata = dev_get_platdata(&client->dev);
i2c_set_clientdata(client, led);
/*
diff --git a/drivers/leds/leds-ip30.c b/drivers/leds/leds-ip30.c
new file mode 100644
index 000000000000..d4ec7361c616
--- /dev/null
+++ b/drivers/leds/leds-ip30.c
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * LED Driver for SGI Octane machines
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/platform_device.h>
+#include <linux/leds.h>
+
+#define IP30_LED_SYSTEM 0
+#define IP30_LED_FAULT 1
+
+struct ip30_led {
+ struct led_classdev cdev;
+ u32 __iomem *reg;
+};
+
+static void ip30led_set(struct led_classdev *led_cdev,
+ enum led_brightness value)
+{
+ struct ip30_led *led = container_of(led_cdev, struct ip30_led, cdev);
+
+ writel(value, led->reg);
+}
+
+static int ip30led_create(struct platform_device *pdev, int num)
+{
+ struct resource *res;
+ struct ip30_led *data;
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, num);
+ if (!res)
+ return -EBUSY;
+
+ data = devm_kzalloc(&pdev->dev, sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ data->reg = devm_ioremap_resource(&pdev->dev, res);
+ if (IS_ERR(data->reg))
+ return PTR_ERR(data->reg);
+
+
+ switch (num) {
+ case IP30_LED_SYSTEM:
+ data->cdev.name = "white:power";
+ break;
+ case IP30_LED_FAULT:
+ data->cdev.name = "red:fault";
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ data->cdev.brightness = readl(data->reg);
+ data->cdev.max_brightness = 1;
+ data->cdev.brightness_set = ip30led_set;
+
+ return devm_led_classdev_register(&pdev->dev, &data->cdev);
+}
+
+static int ip30led_probe(struct platform_device *pdev)
+{
+ int ret;
+
+ ret = ip30led_create(pdev, IP30_LED_SYSTEM);
+ if (ret < 0)
+ return ret;
+
+ return ip30led_create(pdev, IP30_LED_FAULT);
+}
+
+static struct platform_driver ip30led_driver = {
+ .probe = ip30led_probe,
+ .driver = {
+ .name = "ip30-leds",
+ },
+};
+
+module_platform_driver(ip30led_driver);
+
+MODULE_AUTHOR("Thomas Bogendoerfer <tbogendoerfer@suse.de>");
+MODULE_DESCRIPTION("SGI Octane LED driver");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:ip30-leds");
diff --git a/drivers/leds/leds-is31fl32xx.c b/drivers/leds/leds-is31fl32xx.c
index 6f29b8943913..cd768f991da1 100644
--- a/drivers/leds/leds-is31fl32xx.c
+++ b/drivers/leds/leds-is31fl32xx.c
@@ -44,7 +44,7 @@ struct is31fl32xx_priv {
const struct is31fl32xx_chipdef *cdef;
struct i2c_client *client;
unsigned int num_leds;
- struct is31fl32xx_led_data leds[0];
+ struct is31fl32xx_led_data leds[];
};
/**
diff --git a/drivers/leds/leds-lm3532.c b/drivers/leds/leds-lm3532.c
index 188a57da981a..aa9bf8cda673 100644
--- a/drivers/leds/leds-lm3532.c
+++ b/drivers/leds/leds-lm3532.c
@@ -140,7 +140,7 @@ struct lm3532_led {
int ctrl_brt_pointer;
int num_leds;
int full_scale_current;
- int enabled:1;
+ unsigned int enabled:1;
u32 led_strings[LM3532_MAX_CONTROL_BANKS];
char label[LED_MAX_NAME_SIZE];
};
diff --git a/drivers/leds/leds-lm3697.c b/drivers/leds/leds-lm3697.c
index b71711aff8a3..872d26f9706a 100644
--- a/drivers/leds/leds-lm3697.c
+++ b/drivers/leds/leds-lm3697.c
@@ -246,7 +246,7 @@ static int lm3697_probe_dt(struct lm3697 *priv)
led->num_leds = fwnode_property_count_u32(child, "led-sources");
if (led->num_leds > LM3697_MAX_LED_STRINGS) {
- dev_err(&priv->client->dev, "To many LED strings defined\n");
+ dev_err(&priv->client->dev, "Too many LED strings defined\n");
continue;
}
diff --git a/drivers/leds/leds-ns2.c b/drivers/leds/leds-ns2.c
index 7c500dfdcfa3..538ca5755602 100644
--- a/drivers/leds/leds-ns2.c
+++ b/drivers/leds/leds-ns2.c
@@ -12,14 +12,38 @@
#include <linux/kernel.h>
#include <linux/platform_device.h>
#include <linux/slab.h>
-#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
#include <linux/leds.h>
#include <linux/module.h>
-#include <linux/platform_data/leds-kirkwood-ns2.h>
#include <linux/of.h>
-#include <linux/of_gpio.h>
#include "leds.h"
+enum ns2_led_modes {
+ NS_V2_LED_OFF,
+ NS_V2_LED_ON,
+ NS_V2_LED_SATA,
+};
+
+struct ns2_led_modval {
+ enum ns2_led_modes mode;
+ int cmd_level;
+ int slow_level;
+};
+
+struct ns2_led {
+ const char *name;
+ const char *default_trigger;
+ struct gpio_desc *cmd;
+ struct gpio_desc *slow;
+ int num_modes;
+ struct ns2_led_modval *modval;
+};
+
+struct ns2_led_platform_data {
+ int num_leds;
+ struct ns2_led *leds;
+};
+
/*
* The Network Space v2 dual-GPIO LED is wired to a CPLD. Three different LED
* modes are available: off, on and SATA activity blinking. The LED modes are
@@ -29,8 +53,8 @@
struct ns2_led_data {
struct led_classdev cdev;
- unsigned int cmd;
- unsigned int slow;
+ struct gpio_desc *cmd;
+ struct gpio_desc *slow;
bool can_sleep;
unsigned char sata; /* True when SATA mode active. */
rwlock_t rw_lock; /* Lock GPIOs. */
@@ -46,8 +70,8 @@ static int ns2_led_get_mode(struct ns2_led_data *led_dat,
int cmd_level;
int slow_level;
- cmd_level = gpio_get_value_cansleep(led_dat->cmd);
- slow_level = gpio_get_value_cansleep(led_dat->slow);
+ cmd_level = gpiod_get_value_cansleep(led_dat->cmd);
+ slow_level = gpiod_get_value_cansleep(led_dat->slow);
for (i = 0; i < led_dat->num_modes; i++) {
if (cmd_level == led_dat->modval[i].cmd_level &&
@@ -80,15 +104,15 @@ static void ns2_led_set_mode(struct ns2_led_data *led_dat,
write_lock_irqsave(&led_dat->rw_lock, flags);
if (!led_dat->can_sleep) {
- gpio_set_value(led_dat->cmd,
- led_dat->modval[i].cmd_level);
- gpio_set_value(led_dat->slow,
- led_dat->modval[i].slow_level);
+ gpiod_set_value(led_dat->cmd,
+ led_dat->modval[i].cmd_level);
+ gpiod_set_value(led_dat->slow,
+ led_dat->modval[i].slow_level);
goto exit_unlock;
}
- gpio_set_value_cansleep(led_dat->cmd, led_dat->modval[i].cmd_level);
- gpio_set_value_cansleep(led_dat->slow, led_dat->modval[i].slow_level);
+ gpiod_set_value_cansleep(led_dat->cmd, led_dat->modval[i].cmd_level);
+ gpiod_set_value_cansleep(led_dat->slow, led_dat->modval[i].slow_level);
exit_unlock:
write_unlock_irqrestore(&led_dat->rw_lock, flags);
@@ -176,26 +200,6 @@ create_ns2_led(struct platform_device *pdev, struct ns2_led_data *led_dat,
int ret;
enum ns2_led_modes mode;
- ret = devm_gpio_request_one(&pdev->dev, template->cmd,
- gpio_get_value_cansleep(template->cmd) ?
- GPIOF_OUT_INIT_HIGH : GPIOF_OUT_INIT_LOW,
- template->name);
- if (ret) {
- dev_err(&pdev->dev, "%s: failed to setup command GPIO\n",
- template->name);
- return ret;
- }
-
- ret = devm_gpio_request_one(&pdev->dev, template->slow,
- gpio_get_value_cansleep(template->slow) ?
- GPIOF_OUT_INIT_HIGH : GPIOF_OUT_INIT_LOW,
- template->name);
- if (ret) {
- dev_err(&pdev->dev, "%s: failed to setup slow GPIO\n",
- template->name);
- return ret;
- }
-
rwlock_init(&led_dat->rw_lock);
led_dat->cdev.name = template->name;
@@ -205,8 +209,8 @@ create_ns2_led(struct platform_device *pdev, struct ns2_led_data *led_dat,
led_dat->cdev.groups = ns2_led_groups;
led_dat->cmd = template->cmd;
led_dat->slow = template->slow;
- led_dat->can_sleep = gpio_cansleep(led_dat->cmd) |
- gpio_cansleep(led_dat->slow);
+ led_dat->can_sleep = gpiod_cansleep(led_dat->cmd) |
+ gpiod_cansleep(led_dat->slow);
if (led_dat->can_sleep)
led_dat->cdev.brightness_set_blocking = ns2_led_set_blocking;
else
@@ -261,17 +265,26 @@ ns2_leds_get_of_pdata(struct device *dev, struct ns2_led_platform_data *pdata)
const char *string;
int i, num_modes;
struct ns2_led_modval *modval;
+ struct gpio_desc *gd;
- ret = of_get_named_gpio(child, "cmd-gpio", 0);
- if (ret < 0)
- goto err_node_put;
- led->cmd = ret;
- ret = of_get_named_gpio(child, "slow-gpio", 0);
- if (ret < 0)
- goto err_node_put;
- led->slow = ret;
ret = of_property_read_string(child, "label", &string);
led->name = (ret == 0) ? string : child->name;
+
+ gd = gpiod_get_from_of_node(child, "cmd-gpio", 0,
+ GPIOD_ASIS, led->name);
+ if (IS_ERR(gd)) {
+ ret = PTR_ERR(gd);
+ goto err_node_put;
+ }
+ led->cmd = gd;
+ gd = gpiod_get_from_of_node(child, "slow-gpio", 0,
+ GPIOD_ASIS, led->name);
+ if (IS_ERR(gd)) {
+ ret = PTR_ERR(gd);
+ goto err_node_put;
+ }
+ led->slow = gd;
+
ret = of_property_read_string(child, "linux,default-trigger",
&string);
if (ret == 0)
diff --git a/drivers/leds/leds-pwm.c b/drivers/leds/leds-pwm.c
index 8b6965a563e9..6c8a724aac51 100644
--- a/drivers/leds/leds-pwm.c
+++ b/drivers/leds/leds-pwm.c
@@ -16,60 +16,55 @@
#include <linux/leds.h>
#include <linux/err.h>
#include <linux/pwm.h>
-#include <linux/leds_pwm.h>
#include <linux/slab.h>
+struct led_pwm {
+ const char *name;
+ const char *default_trigger;
+ u8 active_low;
+ unsigned int max_brightness;
+};
+
+struct led_pwm_platform_data {
+ int num_leds;
+ struct led_pwm *leds;
+};
+
struct led_pwm_data {
struct led_classdev cdev;
struct pwm_device *pwm;
+ struct pwm_state pwmstate;
unsigned int active_low;
- unsigned int period;
- int duty;
};
struct led_pwm_priv {
int num_leds;
- struct led_pwm_data leds[0];
+ struct led_pwm_data leds[];
};
-static void __led_pwm_set(struct led_pwm_data *led_dat)
-{
- int new_duty = led_dat->duty;
-
- pwm_config(led_dat->pwm, new_duty, led_dat->period);
-
- if (new_duty == 0)
- pwm_disable(led_dat->pwm);
- else
- pwm_enable(led_dat->pwm);
-}
-
static int led_pwm_set(struct led_classdev *led_cdev,
enum led_brightness brightness)
{
struct led_pwm_data *led_dat =
container_of(led_cdev, struct led_pwm_data, cdev);
unsigned int max = led_dat->cdev.max_brightness;
- unsigned long long duty = led_dat->period;
+ unsigned long long duty = led_dat->pwmstate.period;
duty *= brightness;
do_div(duty, max);
if (led_dat->active_low)
- duty = led_dat->period - duty;
-
- led_dat->duty = duty;
-
- __led_pwm_set(led_dat);
+ duty = led_dat->pwmstate.period - duty;
- return 0;
+ led_dat->pwmstate.duty_cycle = duty;
+ led_dat->pwmstate.enabled = duty > 0;
+ return pwm_apply_state(led_dat->pwm, &led_dat->pwmstate);
}
static int led_pwm_add(struct device *dev, struct led_pwm_priv *priv,
struct led_pwm *led, struct fwnode_handle *fwnode)
{
struct led_pwm_data *led_data = &priv->leds[priv->num_leds];
- struct pwm_args pargs;
int ret;
led_data->active_low = led->active_low;
@@ -93,17 +88,7 @@ static int led_pwm_add(struct device *dev, struct led_pwm_priv *priv,
led_data->cdev.brightness_set_blocking = led_pwm_set;
- /*
- * FIXME: pwm_apply_args() should be removed when switching to the
- * atomic PWM API.
- */
- pwm_apply_args(led_data->pwm);
-
- pwm_get_args(led_data->pwm, &pargs);
-
- led_data->period = pargs.period;
- if (!led_data->period && (led->pwm_period_ns > 0))
- led_data->period = led->pwm_period_ns;
+ pwm_init_state(led_data->pwm, &led_data->pwmstate);
ret = devm_led_classdev_register(dev, &led_data->cdev);
if (ret == 0) {
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 8d07fdf63a47..e1db43446327 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -201,10 +201,27 @@ static size_t linear_dax_copy_to_iter(struct dm_target *ti, pgoff_t pgoff,
return dax_copy_to_iter(dax_dev, pgoff, addr, bytes, i);
}
+static int linear_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
+ size_t nr_pages)
+{
+ int ret;
+ struct linear_c *lc = ti->private;
+ struct block_device *bdev = lc->dev->bdev;
+ struct dax_device *dax_dev = lc->dev->dax_dev;
+ sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
+
+ dev_sector = linear_map_sector(ti, sector);
+ ret = bdev_dax_pgoff(bdev, dev_sector, nr_pages << PAGE_SHIFT, &pgoff);
+ if (ret)
+ return ret;
+ return dax_zero_page_range(dax_dev, pgoff, nr_pages);
+}
+
#else
#define linear_dax_direct_access NULL
#define linear_dax_copy_from_iter NULL
#define linear_dax_copy_to_iter NULL
+#define linear_dax_zero_page_range NULL
#endif
static struct target_type linear_target = {
@@ -226,6 +243,7 @@ static struct target_type linear_target = {
.direct_access = linear_dax_direct_access,
.dax_copy_from_iter = linear_dax_copy_from_iter,
.dax_copy_to_iter = linear_dax_copy_to_iter,
+ .dax_zero_page_range = linear_dax_zero_page_range,
};
int __init dm_linear_init(void)
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index 99721c76225d..8ea20b56b4d6 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -994,10 +994,26 @@ static size_t log_writes_dax_copy_to_iter(struct dm_target *ti,
return dax_copy_to_iter(lc->dev->dax_dev, pgoff, addr, bytes, i);
}
+static int log_writes_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
+ size_t nr_pages)
+{
+ int ret;
+ struct log_writes_c *lc = ti->private;
+ sector_t sector = pgoff * PAGE_SECTORS;
+
+ ret = bdev_dax_pgoff(lc->dev->bdev, sector, nr_pages << PAGE_SHIFT,
+ &pgoff);
+ if (ret)
+ return ret;
+ return dax_zero_page_range(lc->dev->dax_dev, pgoff,
+ nr_pages << PAGE_SHIFT);
+}
+
#else
#define log_writes_dax_direct_access NULL
#define log_writes_dax_copy_from_iter NULL
#define log_writes_dax_copy_to_iter NULL
+#define log_writes_dax_zero_page_range NULL
#endif
static struct target_type log_writes_target = {
@@ -1016,6 +1032,7 @@ static struct target_type log_writes_target = {
.direct_access = log_writes_dax_direct_access,
.dax_copy_from_iter = log_writes_dax_copy_from_iter,
.dax_copy_to_iter = log_writes_dax_copy_to_iter,
+ .dax_zero_page_range = log_writes_dax_zero_page_range,
};
static int __init dm_log_writes_init(void)
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 63bbcc20f49a..fa813c0f993d 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -360,10 +360,32 @@ static size_t stripe_dax_copy_to_iter(struct dm_target *ti, pgoff_t pgoff,
return dax_copy_to_iter(dax_dev, pgoff, addr, bytes, i);
}
+static int stripe_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
+ size_t nr_pages)
+{
+ int ret;
+ sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
+ struct stripe_c *sc = ti->private;
+ struct dax_device *dax_dev;
+ struct block_device *bdev;
+ uint32_t stripe;
+
+ stripe_map_sector(sc, sector, &stripe, &dev_sector);
+ dev_sector += sc->stripe[stripe].physical_start;
+ dax_dev = sc->stripe[stripe].dev->dax_dev;
+ bdev = sc->stripe[stripe].dev->bdev;
+
+ ret = bdev_dax_pgoff(bdev, dev_sector, nr_pages << PAGE_SHIFT, &pgoff);
+ if (ret)
+ return ret;
+ return dax_zero_page_range(dax_dev, pgoff, nr_pages);
+}
+
#else
#define stripe_dax_direct_access NULL
#define stripe_dax_copy_from_iter NULL
#define stripe_dax_copy_to_iter NULL
+#define stripe_dax_zero_page_range NULL
#endif
/*
@@ -486,6 +508,7 @@ static struct target_type stripe_target = {
.direct_access = stripe_dax_direct_access,
.dax_copy_from_iter = stripe_dax_copy_from_iter,
.dax_copy_to_iter = stripe_dax_copy_to_iter,
+ .dax_zero_page_range = stripe_dax_zero_page_range,
};
int __init dm_stripe_init(void)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 21c0207e3207..db9e46114653 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1199,6 +1199,35 @@ static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
return ret;
}
+static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
+ size_t nr_pages)
+{
+ struct mapped_device *md = dax_get_private(dax_dev);
+ sector_t sector = pgoff * PAGE_SECTORS;
+ struct dm_target *ti;
+ int ret = -EIO;
+ int srcu_idx;
+
+ ti = dm_dax_get_live_target(md, sector, &srcu_idx);
+
+ if (!ti)
+ goto out;
+ if (WARN_ON(!ti->type->dax_zero_page_range)) {
+ /*
+ * ->zero_page_range() is mandatory dax operation. If we are
+ * here, something is wrong.
+ */
+ dm_put_live_table(md, srcu_idx);
+ goto out;
+ }
+ ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages);
+
+ out:
+ dm_put_live_table(md, srcu_idx);
+
+ return ret;
+}
+
/*
* A target may call dm_accept_partial_bio only from the map routine. It is
* allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_RESET,
@@ -1969,7 +1998,7 @@ static struct mapped_device *alloc_dev(int minor)
if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
md->dax_dev = alloc_dax(md, md->disk->disk_name,
&dm_dax_ops, 0);
- if (!md->dax_dev)
+ if (IS_ERR(md->dax_dev))
goto bad;
}
@@ -3200,6 +3229,7 @@ static const struct dax_operations dm_dax_ops = {
.dax_supported = dm_dax_supported,
.copy_from_iter = dm_dax_copy_from_iter,
.copy_to_iter = dm_dax_copy_to_iter,
+ .zero_page_range = dm_dax_zero_page_range,
};
/*
diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 2b203290e7b9..0a59249198d3 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -642,6 +642,19 @@ config MFD_IPAQ_MICRO
AT90LS8535 microcontroller flashed with a special iPAQ
firmware using the custom protocol implemented in this driver.
+config MFD_IQS62X
+ tristate "Azoteq IQS620A/621/622/624/625 core support"
+ depends on I2C
+ select MFD_CORE
+ select REGMAP_I2C
+ help
+ Say Y here if you want to build core support for the Azoteq IQS620A,
+ IQS621, IQS622, IQS624 and IQS625 multi-function sensors. Additional
+ options must be selected to enable device-specific functions.
+
+ To compile this driver as a module, choose M here: the module will
+ be called iqs62x.
+
config MFD_JANZ_CMODIO
tristate "Janz CMOD-IO PCI MODULbus Carrier Board"
select MFD_CORE
@@ -893,6 +906,7 @@ config MFD_CPCAP
tristate "Support for Motorola CPCAP"
depends on SPI
depends on OF || COMPILE_TEST
+ select MFD_CORE
select REGMAP_SPI
select REGMAP_IRQ
help
@@ -1058,6 +1072,7 @@ config MFD_RN5T618
depends on OF
select MFD_CORE
select REGMAP_I2C
+ select REGMAP_IRQ
help
Say yes here to add support for the Ricoh RN5T567,
RN5T618, RC5T619 PMIC.
@@ -1201,7 +1216,7 @@ config AB8500_CORE
chip. This connects to U8500 either on the SSP/SPI bus (deprecated
since hardware version v1.0) or the I2C bus via PRCMU. It also adds
the irq_chip parts for handling the Mixed Signal chip events.
- This chip embeds various other multimedia funtionalities as well.
+ This chip embeds various other multimedia functionalities as well.
config AB8500_DEBUG
bool "Enable debug info via debugfs"
@@ -1851,7 +1866,7 @@ config MFD_WM8994
has on board GPIO and regulator functionality which is
supported via the relevant subsystems. This driver provides
core support for the WM8994, in order to use the actual
- functionaltiy of the device other drivers must be enabled.
+ functionality of the device other drivers must be enabled.
config MFD_WM97xx
tristate "Wolfson Microelectronics WM97xx"
@@ -1864,7 +1879,7 @@ config MFD_WM97xx
designed for smartphone applications. As well as audio functionality
it has on board GPIO and a touchscreen functionality which is
supported via the relevant subsystems. This driver provides core
- support for the WM97xx, in order to use the actual functionaltiy of
+ support for the WM97xx, in order to use the actual functionality of
the device other drivers must be enabled.
config MFD_STW481X
@@ -1957,7 +1972,7 @@ config MFD_STPMIC1
Support for ST Microelectronics STPMIC1 PMIC. STPMIC1 has power on
key, watchdog and regulator functionalities which are supported via
the relevant subsystems. This driver provides core support for the
- STPMIC1. In order to use the actual functionaltiy of the device other
+ STPMIC1. In order to use the actual functionality of the device other
drivers must be enabled.
To compile this driver as a module, choose M here: the
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index b83f172545e1..f935d10cbf0f 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -226,6 +226,7 @@ obj-$(CONFIG_MFD_AS3711) += as3711.o
obj-$(CONFIG_MFD_AS3722) += as3722.o
obj-$(CONFIG_MFD_STW481X) += stw481x.o
obj-$(CONFIG_MFD_IPAQ_MICRO) += ipaq-micro.o
+obj-$(CONFIG_MFD_IQS62X) += iqs62x.o
obj-$(CONFIG_MFD_MENF21BMC) += menf21bmc.o
obj-$(CONFIG_MFD_HI6421_PMIC) += hi6421-pmic-core.o
obj-$(CONFIG_MFD_HI655X_PMIC) += hi655x-pmic.o
diff --git a/drivers/mfd/aat2870-core.c b/drivers/mfd/aat2870-core.c
index 78ee4b28fca2..a17cf759739d 100644
--- a/drivers/mfd/aat2870-core.c
+++ b/drivers/mfd/aat2870-core.c
@@ -221,7 +221,7 @@ static ssize_t aat2870_dump_reg(struct aat2870_data *aat2870, char *buf)
count += sprintf(buf, "aat2870 registers\n");
for (addr = 0; addr < AAT2870_REG_NUM; addr++) {
- count += sprintf(buf + count, "0x%02x: ", addr);
+ count += snprintf(buf + count, PAGE_SIZE - count, "0x%02x: ", addr);
if (count >= PAGE_SIZE - 1)
break;
diff --git a/drivers/mfd/cros_ec_dev.c b/drivers/mfd/cros_ec_dev.c
index 39e611695053..32c2b912b58b 100644
--- a/drivers/mfd/cros_ec_dev.c
+++ b/drivers/mfd/cros_ec_dev.c
@@ -211,7 +211,7 @@ static int ec_device_probe(struct platform_device *pdev)
* explicitly added on platforms that don't have the PD notifier ACPI
* device entry defined.
*/
- if (IS_ENABLED(CONFIG_OF)) {
+ if (IS_ENABLED(CONFIG_OF) && ec->ec_dev->dev->of_node) {
if (cros_ec_check_features(ec, EC_FEATURE_USB_PD)) {
retval = mfd_add_hotplug_devices(ec->dev,
cros_usbpd_notify_cells,
diff --git a/drivers/mfd/da9062-core.c b/drivers/mfd/da9062-core.c
index 419c73533401..fc30726e2e27 100644
--- a/drivers/mfd/da9062-core.c
+++ b/drivers/mfd/da9062-core.c
@@ -21,6 +21,9 @@
#define DA9062_REG_EVENT_B_OFFSET 1
#define DA9062_REG_EVENT_C_OFFSET 2
+#define DA9062_IRQ_LOW 0
+#define DA9062_IRQ_HIGH 1
+
static struct regmap_irq da9061_irqs[] = {
/* EVENT A */
[DA9061_IRQ_ONKEY] = {
@@ -369,6 +372,33 @@ static int da9062_get_device_type(struct da9062 *chip)
return ret;
}
+static u32 da9062_configure_irq_type(struct da9062 *chip, int irq, u32 *trigger)
+{
+ u32 irq_type = 0;
+ struct irq_data *irq_data = irq_get_irq_data(irq);
+
+ if (!irq_data) {
+ dev_err(chip->dev, "Invalid IRQ: %d\n", irq);
+ return -EINVAL;
+ }
+ *trigger = irqd_get_trigger_type(irq_data);
+
+ switch (*trigger) {
+ case IRQ_TYPE_LEVEL_HIGH:
+ irq_type = DA9062_IRQ_HIGH;
+ break;
+ case IRQ_TYPE_LEVEL_LOW:
+ irq_type = DA9062_IRQ_LOW;
+ break;
+ default:
+ dev_warn(chip->dev, "Unsupported IRQ type: %d\n", *trigger);
+ return -EINVAL;
+ }
+ return regmap_update_bits(chip->regmap, DA9062AA_CONFIG_A,
+ DA9062AA_IRQ_TYPE_MASK,
+ irq_type << DA9062AA_IRQ_TYPE_SHIFT);
+}
+
static const struct regmap_range da9061_aa_readable_ranges[] = {
regmap_reg_range(DA9062AA_PAGE_CON, DA9062AA_STATUS_B),
regmap_reg_range(DA9062AA_STATUS_D, DA9062AA_EVENT_C),
@@ -388,6 +418,7 @@ static const struct regmap_range da9061_aa_readable_ranges[] = {
regmap_reg_range(DA9062AA_VBUCK1_A, DA9062AA_VBUCK4_A),
regmap_reg_range(DA9062AA_VBUCK3_A, DA9062AA_VBUCK3_A),
regmap_reg_range(DA9062AA_VLDO1_A, DA9062AA_VLDO4_A),
+ regmap_reg_range(DA9062AA_CONFIG_A, DA9062AA_CONFIG_A),
regmap_reg_range(DA9062AA_VBUCK1_B, DA9062AA_VBUCK4_B),
regmap_reg_range(DA9062AA_VBUCK3_B, DA9062AA_VBUCK3_B),
regmap_reg_range(DA9062AA_VLDO1_B, DA9062AA_VLDO4_B),
@@ -417,6 +448,7 @@ static const struct regmap_range da9061_aa_writeable_ranges[] = {
regmap_reg_range(DA9062AA_VBUCK1_A, DA9062AA_VBUCK4_A),
regmap_reg_range(DA9062AA_VBUCK3_A, DA9062AA_VBUCK3_A),
regmap_reg_range(DA9062AA_VLDO1_A, DA9062AA_VLDO4_A),
+ regmap_reg_range(DA9062AA_CONFIG_A, DA9062AA_CONFIG_A),
regmap_reg_range(DA9062AA_VBUCK1_B, DA9062AA_VBUCK4_B),
regmap_reg_range(DA9062AA_VBUCK3_B, DA9062AA_VBUCK3_B),
regmap_reg_range(DA9062AA_VLDO1_B, DA9062AA_VLDO4_B),
@@ -596,6 +628,7 @@ static int da9062_i2c_probe(struct i2c_client *i2c,
const struct regmap_irq_chip *irq_chip;
const struct regmap_config *config;
int cell_num;
+ u32 trigger_type = 0;
int ret;
chip = devm_kzalloc(&i2c->dev, sizeof(*chip), GFP_KERNEL);
@@ -654,10 +687,15 @@ static int da9062_i2c_probe(struct i2c_client *i2c,
if (ret)
return ret;
+ ret = da9062_configure_irq_type(chip, i2c->irq, &trigger_type);
+ if (ret < 0) {
+ dev_err(chip->dev, "Failed to configure IRQ type\n");
+ return ret;
+ }
+
ret = regmap_add_irq_chip(chip->regmap, i2c->irq,
- IRQF_TRIGGER_LOW | IRQF_ONESHOT | IRQF_SHARED,
- -1, irq_chip,
- &chip->regmap_irq);
+ trigger_type | IRQF_SHARED | IRQF_ONESHOT,
+ -1, irq_chip, &chip->regmap_irq);
if (ret) {
dev_err(chip->dev, "Failed to request IRQ %d: %d\n",
i2c->irq, ret);
diff --git a/drivers/mfd/dln2.c b/drivers/mfd/dln2.c
index 7841c11411d0..39276fa626d2 100644
--- a/drivers/mfd/dln2.c
+++ b/drivers/mfd/dln2.c
@@ -90,6 +90,11 @@ struct dln2_mod_rx_slots {
spinlock_t lock;
};
+enum dln2_endpoint {
+ DLN2_EP_OUT = 0,
+ DLN2_EP_IN = 1,
+};
+
struct dln2_dev {
struct usb_device *usb_dev;
struct usb_interface *interface;
@@ -640,35 +645,56 @@ static int dln2_start_rx_urbs(struct dln2_dev *dln2, gfp_t gfp)
return 0;
}
+enum {
+ DLN2_ACPI_MATCH_GPIO = 0,
+ DLN2_ACPI_MATCH_I2C = 1,
+ DLN2_ACPI_MATCH_SPI = 2,
+};
+
static struct dln2_platform_data dln2_pdata_gpio = {
.handle = DLN2_HANDLE_GPIO,
};
+static struct mfd_cell_acpi_match dln2_acpi_match_gpio = {
+ .adr = DLN2_ACPI_MATCH_GPIO,
+};
+
/* Only one I2C port seems to be supported on current hardware */
static struct dln2_platform_data dln2_pdata_i2c = {
.handle = DLN2_HANDLE_I2C,
.port = 0,
};
+static struct mfd_cell_acpi_match dln2_acpi_match_i2c = {
+ .adr = DLN2_ACPI_MATCH_I2C,
+};
+
/* Only one SPI port supported */
static struct dln2_platform_data dln2_pdata_spi = {
.handle = DLN2_HANDLE_SPI,
.port = 0,
};
+static struct mfd_cell_acpi_match dln2_acpi_match_spi = {
+ .adr = DLN2_ACPI_MATCH_SPI,
+};
+
static const struct mfd_cell dln2_devs[] = {
{
.name = "dln2-gpio",
+ .acpi_match = &dln2_acpi_match_gpio,
.platform_data = &dln2_pdata_gpio,
.pdata_size = sizeof(struct dln2_platform_data),
},
{
.name = "dln2-i2c",
+ .acpi_match = &dln2_acpi_match_i2c,
.platform_data = &dln2_pdata_i2c,
.pdata_size = sizeof(struct dln2_platform_data),
},
{
.name = "dln2-spi",
+ .acpi_match = &dln2_acpi_match_spi,
.platform_data = &dln2_pdata_spi,
.pdata_size = sizeof(struct dln2_platform_data),
},
@@ -733,10 +759,10 @@ static int dln2_probe(struct usb_interface *interface,
hostif->desc.bNumEndpoints < 2)
return -ENODEV;
- epin = &hostif->endpoint[0].desc;
- epout = &hostif->endpoint[1].desc;
+ epout = &hostif->endpoint[DLN2_EP_OUT].desc;
if (!usb_endpoint_is_bulk_out(epout))
return -ENODEV;
+ epin = &hostif->endpoint[DLN2_EP_IN].desc;
if (!usb_endpoint_is_bulk_in(epin))
return -ENODEV;
diff --git a/drivers/mfd/intel-lpss-pci.c b/drivers/mfd/intel-lpss-pci.c
index c40a6c7d0cf8..7fc0c5d4edff 100644
--- a/drivers/mfd/intel-lpss-pci.c
+++ b/drivers/mfd/intel-lpss-pci.c
@@ -139,6 +139,11 @@ static const struct intel_lpss_platform_info cnl_i2c_info = {
.properties = spt_i2c_properties,
};
+static const struct intel_lpss_platform_info ehl_i2c_info = {
+ .clk_rate = 100000000,
+ .properties = bxt_i2c_properties,
+};
+
static const struct pci_device_id intel_lpss_pci_ids[] = {
/* CML-LP */
{ PCI_VDEVICE(INTEL, 0x02a8), (kernel_ulong_t)&spt_uart_info },
@@ -231,15 +236,15 @@ static const struct pci_device_id intel_lpss_pci_ids[] = {
{ PCI_VDEVICE(INTEL, 0x4b2a), (kernel_ulong_t)&bxt_info },
{ PCI_VDEVICE(INTEL, 0x4b2b), (kernel_ulong_t)&bxt_info },
{ PCI_VDEVICE(INTEL, 0x4b37), (kernel_ulong_t)&bxt_info },
- { PCI_VDEVICE(INTEL, 0x4b44), (kernel_ulong_t)&bxt_i2c_info },
- { PCI_VDEVICE(INTEL, 0x4b45), (kernel_ulong_t)&bxt_i2c_info },
- { PCI_VDEVICE(INTEL, 0x4b4b), (kernel_ulong_t)&bxt_i2c_info },
- { PCI_VDEVICE(INTEL, 0x4b4c), (kernel_ulong_t)&bxt_i2c_info },
+ { PCI_VDEVICE(INTEL, 0x4b44), (kernel_ulong_t)&ehl_i2c_info },
+ { PCI_VDEVICE(INTEL, 0x4b45), (kernel_ulong_t)&ehl_i2c_info },
+ { PCI_VDEVICE(INTEL, 0x4b4b), (kernel_ulong_t)&ehl_i2c_info },
+ { PCI_VDEVICE(INTEL, 0x4b4c), (kernel_ulong_t)&ehl_i2c_info },
{ PCI_VDEVICE(INTEL, 0x4b4d), (kernel_ulong_t)&bxt_uart_info },
- { PCI_VDEVICE(INTEL, 0x4b78), (kernel_ulong_t)&bxt_i2c_info },
- { PCI_VDEVICE(INTEL, 0x4b79), (kernel_ulong_t)&bxt_i2c_info },
- { PCI_VDEVICE(INTEL, 0x4b7a), (kernel_ulong_t)&bxt_i2c_info },
- { PCI_VDEVICE(INTEL, 0x4b7b), (kernel_ulong_t)&bxt_i2c_info },
+ { PCI_VDEVICE(INTEL, 0x4b78), (kernel_ulong_t)&ehl_i2c_info },
+ { PCI_VDEVICE(INTEL, 0x4b79), (kernel_ulong_t)&ehl_i2c_info },
+ { PCI_VDEVICE(INTEL, 0x4b7a), (kernel_ulong_t)&ehl_i2c_info },
+ { PCI_VDEVICE(INTEL, 0x4b7b), (kernel_ulong_t)&ehl_i2c_info },
/* JSL */
{ PCI_VDEVICE(INTEL, 0x4da8), (kernel_ulong_t)&spt_uart_info },
{ PCI_VDEVICE(INTEL, 0x4da9), (kernel_ulong_t)&spt_uart_info },
@@ -347,6 +352,16 @@ static const struct pci_device_id intel_lpss_pci_ids[] = {
{ PCI_VDEVICE(INTEL, 0xa36a), (kernel_ulong_t)&cnl_i2c_info },
{ PCI_VDEVICE(INTEL, 0xa36b), (kernel_ulong_t)&cnl_i2c_info },
{ PCI_VDEVICE(INTEL, 0xa37b), (kernel_ulong_t)&spt_info },
+ /* CML-V */
+ { PCI_VDEVICE(INTEL, 0xa3a7), (kernel_ulong_t)&spt_uart_info },
+ { PCI_VDEVICE(INTEL, 0xa3a8), (kernel_ulong_t)&spt_uart_info },
+ { PCI_VDEVICE(INTEL, 0xa3a9), (kernel_ulong_t)&spt_info },
+ { PCI_VDEVICE(INTEL, 0xa3aa), (kernel_ulong_t)&spt_info },
+ { PCI_VDEVICE(INTEL, 0xa3e0), (kernel_ulong_t)&spt_i2c_info },
+ { PCI_VDEVICE(INTEL, 0xa3e1), (kernel_ulong_t)&spt_i2c_info },
+ { PCI_VDEVICE(INTEL, 0xa3e2), (kernel_ulong_t)&spt_i2c_info },
+ { PCI_VDEVICE(INTEL, 0xa3e3), (kernel_ulong_t)&spt_i2c_info },
+ { PCI_VDEVICE(INTEL, 0xa3e6), (kernel_ulong_t)&spt_uart_info },
{ }
};
MODULE_DEVICE_TABLE(pci, intel_lpss_pci_ids);
diff --git a/drivers/mfd/iqs62x.c b/drivers/mfd/iqs62x.c
new file mode 100644
index 000000000000..af764bc87d7c
--- /dev/null
+++ b/drivers/mfd/iqs62x.c
@@ -0,0 +1,1063 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Azoteq IQS620A/621/622/624/625 Multi-Function Sensors
+ *
+ * Copyright (C) 2019 Jeff LaBundy <jeff@labundy.com>
+ *
+ * These devices rely on application-specific register settings and calibration
+ * data developed in and exported from a suite of GUIs offered by the vendor. A
+ * separate tool converts the GUIs' ASCII-based output into a standard firmware
+ * file parsed by the driver.
+ *
+ * Link to datasheets and GUIs: https://www.azoteq.com/
+ *
+ * Link to conversion tool: https://github.com/jlabundy/iqs62x-h2bin.git
+ */
+
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/firmware.h>
+#include <linux/i2c.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/mfd/core.h>
+#include <linux/mfd/iqs62x.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/of_device.h>
+#include <linux/property.h>
+#include <linux/regmap.h>
+#include <linux/slab.h>
+#include <asm/unaligned.h>
+
+#define IQS62X_PROD_NUM 0x00
+
+#define IQS62X_SYS_FLAGS 0x10
+#define IQS62X_SYS_FLAGS_IN_ATI BIT(2)
+
+#define IQS620_HALL_FLAGS 0x16
+#define IQS621_HALL_FLAGS 0x19
+#define IQS622_HALL_FLAGS IQS621_HALL_FLAGS
+
+#define IQS624_INTERVAL_NUM 0x18
+#define IQS625_INTERVAL_NUM 0x12
+
+#define IQS622_PROX_SETTINGS_4 0x48
+#define IQS620_PROX_SETTINGS_4 0x50
+#define IQS620_PROX_SETTINGS_4_SAR_EN BIT(7)
+
+#define IQS621_ALS_CAL_DIV_LUX 0x82
+#define IQS621_ALS_CAL_DIV_IR 0x83
+
+#define IQS620_TEMP_CAL_MULT 0xC2
+#define IQS620_TEMP_CAL_DIV 0xC3
+#define IQS620_TEMP_CAL_OFFS 0xC4
+
+#define IQS62X_SYS_SETTINGS 0xD0
+#define IQS62X_SYS_SETTINGS_SOFT_RESET BIT(7)
+#define IQS62X_SYS_SETTINGS_ACK_RESET BIT(6)
+#define IQS62X_SYS_SETTINGS_EVENT_MODE BIT(5)
+#define IQS62X_SYS_SETTINGS_CLK_DIV BIT(4)
+#define IQS62X_SYS_SETTINGS_REDO_ATI BIT(1)
+
+#define IQS62X_PWR_SETTINGS 0xD2
+#define IQS62X_PWR_SETTINGS_DIS_AUTO BIT(5)
+#define IQS62X_PWR_SETTINGS_PWR_MODE_MASK (BIT(4) | BIT(3))
+#define IQS62X_PWR_SETTINGS_PWR_MODE_HALT (BIT(4) | BIT(3))
+#define IQS62X_PWR_SETTINGS_PWR_MODE_NORM 0
+
+#define IQS62X_OTP_CMD 0xF0
+#define IQS62X_OTP_CMD_FG3 0x13
+#define IQS62X_OTP_DATA 0xF1
+#define IQS62X_MAX_REG 0xFF
+
+#define IQS62X_HALL_CAL_MASK GENMASK(3, 0)
+
+#define IQS62X_FW_REC_TYPE_INFO 0
+#define IQS62X_FW_REC_TYPE_PROD 1
+#define IQS62X_FW_REC_TYPE_HALL 2
+#define IQS62X_FW_REC_TYPE_MASK 3
+#define IQS62X_FW_REC_TYPE_DATA 4
+
+#define IQS62X_ATI_POLL_SLEEP_US 10000
+#define IQS62X_ATI_POLL_TIMEOUT_US 500000
+#define IQS62X_ATI_STABLE_DELAY_MS 150
+
+struct iqs62x_fw_rec {
+ u8 type;
+ u8 addr;
+ u8 len;
+ u8 data;
+} __packed;
+
+struct iqs62x_fw_blk {
+ struct list_head list;
+ u8 addr;
+ u8 mask;
+ u8 len;
+ u8 data[];
+};
+
+struct iqs62x_info {
+ u8 prod_num;
+ u8 sw_num;
+ u8 hw_num;
+} __packed;
+
+static int iqs62x_dev_init(struct iqs62x_core *iqs62x)
+{
+ struct iqs62x_fw_blk *fw_blk;
+ unsigned int val;
+ int ret;
+ u8 clk_div = 1;
+
+ list_for_each_entry(fw_blk, &iqs62x->fw_blk_head, list) {
+ if (fw_blk->mask)
+ ret = regmap_update_bits(iqs62x->regmap, fw_blk->addr,
+ fw_blk->mask, *fw_blk->data);
+ else
+ ret = regmap_raw_write(iqs62x->regmap, fw_blk->addr,
+ fw_blk->data, fw_blk->len);
+ if (ret)
+ return ret;
+ }
+
+ switch (iqs62x->dev_desc->prod_num) {
+ case IQS620_PROD_NUM:
+ case IQS622_PROD_NUM:
+ ret = regmap_read(iqs62x->regmap,
+ iqs62x->dev_desc->prox_settings, &val);
+ if (ret)
+ return ret;
+
+ if (val & IQS620_PROX_SETTINGS_4_SAR_EN)
+ iqs62x->ui_sel = IQS62X_UI_SAR1;
+
+ /* fall through */
+
+ case IQS621_PROD_NUM:
+ ret = regmap_write(iqs62x->regmap, IQS620_GLBL_EVENT_MASK,
+ IQS620_GLBL_EVENT_MASK_PMU |
+ iqs62x->dev_desc->prox_mask |
+ iqs62x->dev_desc->sar_mask |
+ iqs62x->dev_desc->hall_mask |
+ iqs62x->dev_desc->hyst_mask |
+ iqs62x->dev_desc->temp_mask |
+ iqs62x->dev_desc->als_mask |
+ iqs62x->dev_desc->ir_mask);
+ if (ret)
+ return ret;
+ break;
+
+ default:
+ ret = regmap_write(iqs62x->regmap, IQS624_HALL_UI,
+ IQS624_HALL_UI_WHL_EVENT |
+ IQS624_HALL_UI_INT_EVENT |
+ IQS624_HALL_UI_AUTO_CAL);
+ if (ret)
+ return ret;
+
+ /*
+ * The IQS625 default interval divider is below the minimum
+ * permissible value, and the datasheet mandates that it is
+ * corrected during initialization (unless an updated value
+ * has already been provided by firmware).
+ *
+ * To protect against an unacceptably low user-entered value
+ * stored in the firmware, the same check is extended to the
+ * IQS624 as well.
+ */
+ ret = regmap_read(iqs62x->regmap, IQS624_INTERVAL_DIV, &val);
+ if (ret)
+ return ret;
+
+ if (val >= iqs62x->dev_desc->interval_div)
+ break;
+
+ ret = regmap_write(iqs62x->regmap, IQS624_INTERVAL_DIV,
+ iqs62x->dev_desc->interval_div);
+ if (ret)
+ return ret;
+ }
+
+ ret = regmap_read(iqs62x->regmap, IQS62X_SYS_SETTINGS, &val);
+ if (ret)
+ return ret;
+
+ if (val & IQS62X_SYS_SETTINGS_CLK_DIV)
+ clk_div = iqs62x->dev_desc->clk_div;
+
+ ret = regmap_write(iqs62x->regmap, IQS62X_SYS_SETTINGS, val |
+ IQS62X_SYS_SETTINGS_ACK_RESET |
+ IQS62X_SYS_SETTINGS_EVENT_MODE |
+ IQS62X_SYS_SETTINGS_REDO_ATI);
+ if (ret)
+ return ret;
+
+ ret = regmap_read_poll_timeout(iqs62x->regmap, IQS62X_SYS_FLAGS, val,
+ !(val & IQS62X_SYS_FLAGS_IN_ATI),
+ IQS62X_ATI_POLL_SLEEP_US,
+ IQS62X_ATI_POLL_TIMEOUT_US * clk_div);
+ if (ret)
+ return ret;
+
+ msleep(IQS62X_ATI_STABLE_DELAY_MS * clk_div);
+
+ return 0;
+}
+
+static int iqs62x_firmware_parse(struct iqs62x_core *iqs62x,
+ const struct firmware *fw)
+{
+ struct i2c_client *client = iqs62x->client;
+ struct iqs62x_fw_rec *fw_rec;
+ struct iqs62x_fw_blk *fw_blk;
+ unsigned int val;
+ size_t pos = 0;
+ int ret = 0;
+ u8 mask, len, *data;
+ u8 hall_cal_index = 0;
+
+ while (pos < fw->size) {
+ if (pos + sizeof(*fw_rec) > fw->size) {
+ ret = -EINVAL;
+ break;
+ }
+ fw_rec = (struct iqs62x_fw_rec *)(fw->data + pos);
+ pos += sizeof(*fw_rec);
+
+ if (pos + fw_rec->len - 1 > fw->size) {
+ ret = -EINVAL;
+ break;
+ }
+ pos += fw_rec->len - 1;
+
+ switch (fw_rec->type) {
+ case IQS62X_FW_REC_TYPE_INFO:
+ continue;
+
+ case IQS62X_FW_REC_TYPE_PROD:
+ if (fw_rec->data == iqs62x->dev_desc->prod_num)
+ continue;
+
+ dev_err(&client->dev,
+ "Incompatible product number: 0x%02X\n",
+ fw_rec->data);
+ ret = -EINVAL;
+ break;
+
+ case IQS62X_FW_REC_TYPE_HALL:
+ if (!hall_cal_index) {
+ ret = regmap_write(iqs62x->regmap,
+ IQS62X_OTP_CMD,
+ IQS62X_OTP_CMD_FG3);
+ if (ret)
+ break;
+
+ ret = regmap_read(iqs62x->regmap,
+ IQS62X_OTP_DATA, &val);
+ if (ret)
+ break;
+
+ hall_cal_index = val & IQS62X_HALL_CAL_MASK;
+ if (!hall_cal_index) {
+ dev_err(&client->dev,
+ "Uncalibrated device\n");
+ ret = -ENODATA;
+ break;
+ }
+ }
+
+ if (hall_cal_index > fw_rec->len) {
+ ret = -EINVAL;
+ break;
+ }
+
+ mask = 0;
+ data = &fw_rec->data + hall_cal_index - 1;
+ len = sizeof(*data);
+ break;
+
+ case IQS62X_FW_REC_TYPE_MASK:
+ if (fw_rec->len < (sizeof(mask) + sizeof(*data))) {
+ ret = -EINVAL;
+ break;
+ }
+
+ mask = fw_rec->data;
+ data = &fw_rec->data + sizeof(mask);
+ len = sizeof(*data);
+ break;
+
+ case IQS62X_FW_REC_TYPE_DATA:
+ mask = 0;
+ data = &fw_rec->data;
+ len = fw_rec->len;
+ break;
+
+ default:
+ dev_err(&client->dev,
+ "Unrecognized record type: 0x%02X\n",
+ fw_rec->type);
+ ret = -EINVAL;
+ }
+
+ if (ret)
+ break;
+
+ fw_blk = devm_kzalloc(&client->dev,
+ struct_size(fw_blk, data, len),
+ GFP_KERNEL);
+ if (!fw_blk) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ fw_blk->addr = fw_rec->addr;
+ fw_blk->mask = mask;
+ fw_blk->len = len;
+ memcpy(fw_blk->data, data, len);
+
+ list_add(&fw_blk->list, &iqs62x->fw_blk_head);
+ }
+
+ release_firmware(fw);
+
+ return ret;
+}
+
+const struct iqs62x_event_desc iqs62x_events[IQS62X_NUM_EVENTS] = {
+ [IQS62X_EVENT_PROX_CH0_T] = {
+ .reg = IQS62X_EVENT_PROX,
+ .mask = BIT(4),
+ .val = BIT(4),
+ },
+ [IQS62X_EVENT_PROX_CH0_P] = {
+ .reg = IQS62X_EVENT_PROX,
+ .mask = BIT(0),
+ .val = BIT(0),
+ },
+ [IQS62X_EVENT_PROX_CH1_T] = {
+ .reg = IQS62X_EVENT_PROX,
+ .mask = BIT(5),
+ .val = BIT(5),
+ },
+ [IQS62X_EVENT_PROX_CH1_P] = {
+ .reg = IQS62X_EVENT_PROX,
+ .mask = BIT(1),
+ .val = BIT(1),
+ },
+ [IQS62X_EVENT_PROX_CH2_T] = {
+ .reg = IQS62X_EVENT_PROX,
+ .mask = BIT(6),
+ .val = BIT(6),
+ },
+ [IQS62X_EVENT_PROX_CH2_P] = {
+ .reg = IQS62X_EVENT_PROX,
+ .mask = BIT(2),
+ .val = BIT(2),
+ },
+ [IQS62X_EVENT_HYST_POS_T] = {
+ .reg = IQS62X_EVENT_HYST,
+ .mask = BIT(6) | BIT(7),
+ .val = BIT(6),
+ },
+ [IQS62X_EVENT_HYST_POS_P] = {
+ .reg = IQS62X_EVENT_HYST,
+ .mask = BIT(5) | BIT(7),
+ .val = BIT(5),
+ },
+ [IQS62X_EVENT_HYST_NEG_T] = {
+ .reg = IQS62X_EVENT_HYST,
+ .mask = BIT(6) | BIT(7),
+ .val = BIT(6) | BIT(7),
+ },
+ [IQS62X_EVENT_HYST_NEG_P] = {
+ .reg = IQS62X_EVENT_HYST,
+ .mask = BIT(5) | BIT(7),
+ .val = BIT(5) | BIT(7),
+ },
+ [IQS62X_EVENT_SAR1_ACT] = {
+ .reg = IQS62X_EVENT_HYST,
+ .mask = BIT(4),
+ .val = BIT(4),
+ },
+ [IQS62X_EVENT_SAR1_QRD] = {
+ .reg = IQS62X_EVENT_HYST,
+ .mask = BIT(2),
+ .val = BIT(2),
+ },
+ [IQS62X_EVENT_SAR1_MOVE] = {
+ .reg = IQS62X_EVENT_HYST,
+ .mask = BIT(1),
+ .val = BIT(1),
+ },
+ [IQS62X_EVENT_SAR1_HALT] = {
+ .reg = IQS62X_EVENT_HYST,
+ .mask = BIT(0),
+ .val = BIT(0),
+ },
+ [IQS62X_EVENT_WHEEL_UP] = {
+ .reg = IQS62X_EVENT_WHEEL,
+ .mask = BIT(7) | BIT(6),
+ .val = BIT(7),
+ },
+ [IQS62X_EVENT_WHEEL_DN] = {
+ .reg = IQS62X_EVENT_WHEEL,
+ .mask = BIT(7) | BIT(6),
+ .val = BIT(7) | BIT(6),
+ },
+ [IQS62X_EVENT_HALL_N_T] = {
+ .reg = IQS62X_EVENT_HALL,
+ .mask = BIT(2) | BIT(0),
+ .val = BIT(2),
+ },
+ [IQS62X_EVENT_HALL_N_P] = {
+ .reg = IQS62X_EVENT_HALL,
+ .mask = BIT(1) | BIT(0),
+ .val = BIT(1),
+ },
+ [IQS62X_EVENT_HALL_S_T] = {
+ .reg = IQS62X_EVENT_HALL,
+ .mask = BIT(2) | BIT(0),
+ .val = BIT(2) | BIT(0),
+ },
+ [IQS62X_EVENT_HALL_S_P] = {
+ .reg = IQS62X_EVENT_HALL,
+ .mask = BIT(1) | BIT(0),
+ .val = BIT(1) | BIT(0),
+ },
+ [IQS62X_EVENT_SYS_RESET] = {
+ .reg = IQS62X_EVENT_SYS,
+ .mask = BIT(7),
+ .val = BIT(7),
+ },
+};
+EXPORT_SYMBOL_GPL(iqs62x_events);
+
+static irqreturn_t iqs62x_irq(int irq, void *context)
+{
+ struct iqs62x_core *iqs62x = context;
+ struct i2c_client *client = iqs62x->client;
+ struct iqs62x_event_data event_data;
+ struct iqs62x_event_desc event_desc;
+ enum iqs62x_event_reg event_reg;
+ unsigned long event_flags = 0;
+ int ret, i, j;
+ u8 event_map[IQS62X_EVENT_SIZE];
+
+ /*
+ * The device asserts the RDY output to signal the beginning of a
+ * communication window, which is closed by an I2C stop condition.
+ * As such, all interrupt status is captured in a single read and
+ * broadcast to any interested sub-device drivers.
+ */
+ ret = regmap_raw_read(iqs62x->regmap, IQS62X_SYS_FLAGS, event_map,
+ sizeof(event_map));
+ if (ret) {
+ dev_err(&client->dev, "Failed to read device status: %d\n",
+ ret);
+ return IRQ_NONE;
+ }
+
+ for (i = 0; i < sizeof(event_map); i++) {
+ event_reg = iqs62x->dev_desc->event_regs[iqs62x->ui_sel][i];
+
+ switch (event_reg) {
+ case IQS62X_EVENT_UI_LO:
+ event_data.ui_data = get_unaligned_le16(&event_map[i]);
+
+ /* fall through */
+
+ case IQS62X_EVENT_UI_HI:
+ case IQS62X_EVENT_NONE:
+ continue;
+
+ case IQS62X_EVENT_ALS:
+ event_data.als_flags = event_map[i];
+ continue;
+
+ case IQS62X_EVENT_IR:
+ event_data.ir_flags = event_map[i];
+ continue;
+
+ case IQS62X_EVENT_INTER:
+ event_data.interval = event_map[i];
+ continue;
+
+ case IQS62X_EVENT_HYST:
+ event_map[i] <<= iqs62x->dev_desc->hyst_shift;
+
+ /* fall through */
+
+ case IQS62X_EVENT_WHEEL:
+ case IQS62X_EVENT_HALL:
+ case IQS62X_EVENT_PROX:
+ case IQS62X_EVENT_SYS:
+ break;
+ }
+
+ for (j = 0; j < IQS62X_NUM_EVENTS; j++) {
+ event_desc = iqs62x_events[j];
+
+ if (event_desc.reg != event_reg)
+ continue;
+
+ if ((event_map[i] & event_desc.mask) == event_desc.val)
+ event_flags |= BIT(j);
+ }
+ }
+
+ /*
+ * The device resets itself in response to the I2C master stalling
+ * communication past a fixed timeout. In this case, all registers
+ * are restored and any interested sub-device drivers are notified.
+ */
+ if (event_flags & BIT(IQS62X_EVENT_SYS_RESET)) {
+ dev_err(&client->dev, "Unexpected device reset\n");
+
+ ret = iqs62x_dev_init(iqs62x);
+ if (ret) {
+ dev_err(&client->dev,
+ "Failed to re-initialize device: %d\n", ret);
+ return IRQ_NONE;
+ }
+ }
+
+ ret = blocking_notifier_call_chain(&iqs62x->nh, event_flags,
+ &event_data);
+ if (ret & NOTIFY_STOP_MASK)
+ return IRQ_NONE;
+
+ /*
+ * Once the communication window is closed, a small delay is added to
+ * ensure the device's RDY output has been deasserted by the time the
+ * interrupt handler returns.
+ */
+ usleep_range(50, 100);
+
+ return IRQ_HANDLED;
+}
+
+static void iqs62x_firmware_load(const struct firmware *fw, void *context)
+{
+ struct iqs62x_core *iqs62x = context;
+ struct i2c_client *client = iqs62x->client;
+ int ret;
+
+ if (fw) {
+ ret = iqs62x_firmware_parse(iqs62x, fw);
+ if (ret) {
+ dev_err(&client->dev, "Failed to parse firmware: %d\n",
+ ret);
+ goto err_out;
+ }
+ }
+
+ ret = iqs62x_dev_init(iqs62x);
+ if (ret) {
+ dev_err(&client->dev, "Failed to initialize device: %d\n", ret);
+ goto err_out;
+ }
+
+ ret = devm_request_threaded_irq(&client->dev, client->irq,
+ NULL, iqs62x_irq, IRQF_ONESHOT,
+ client->name, iqs62x);
+ if (ret) {
+ dev_err(&client->dev, "Failed to request IRQ: %d\n", ret);
+ goto err_out;
+ }
+
+ ret = devm_mfd_add_devices(&client->dev, PLATFORM_DEVID_NONE,
+ iqs62x->dev_desc->sub_devs,
+ iqs62x->dev_desc->num_sub_devs,
+ NULL, 0, NULL);
+ if (ret)
+ dev_err(&client->dev, "Failed to add sub-devices: %d\n", ret);
+
+err_out:
+ complete_all(&iqs62x->fw_done);
+}
+
+static const struct mfd_cell iqs620at_sub_devs[] = {
+ {
+ .name = "iqs62x-keys",
+ .of_compatible = "azoteq,iqs620a-keys",
+ },
+ {
+ .name = "iqs620a-pwm",
+ .of_compatible = "azoteq,iqs620a-pwm",
+ },
+ { .name = "iqs620at-temp", },
+};
+
+static const struct mfd_cell iqs620a_sub_devs[] = {
+ {
+ .name = "iqs62x-keys",
+ .of_compatible = "azoteq,iqs620a-keys",
+ },
+ {
+ .name = "iqs620a-pwm",
+ .of_compatible = "azoteq,iqs620a-pwm",
+ },
+};
+
+static const struct mfd_cell iqs621_sub_devs[] = {
+ {
+ .name = "iqs62x-keys",
+ .of_compatible = "azoteq,iqs621-keys",
+ },
+ { .name = "iqs621-als", },
+};
+
+static const struct mfd_cell iqs622_sub_devs[] = {
+ {
+ .name = "iqs62x-keys",
+ .of_compatible = "azoteq,iqs622-keys",
+ },
+ { .name = "iqs621-als", },
+};
+
+static const struct mfd_cell iqs624_sub_devs[] = {
+ {
+ .name = "iqs62x-keys",
+ .of_compatible = "azoteq,iqs624-keys",
+ },
+ { .name = "iqs624-pos", },
+};
+
+static const struct mfd_cell iqs625_sub_devs[] = {
+ {
+ .name = "iqs62x-keys",
+ .of_compatible = "azoteq,iqs625-keys",
+ },
+ { .name = "iqs624-pos", },
+};
+
+static const u8 iqs620at_cal_regs[] = {
+ IQS620_TEMP_CAL_MULT,
+ IQS620_TEMP_CAL_DIV,
+ IQS620_TEMP_CAL_OFFS,
+};
+
+static const u8 iqs621_cal_regs[] = {
+ IQS621_ALS_CAL_DIV_LUX,
+ IQS621_ALS_CAL_DIV_IR,
+};
+
+static const enum iqs62x_event_reg iqs620a_event_regs[][IQS62X_EVENT_SIZE] = {
+ [IQS62X_UI_PROX] = {
+ IQS62X_EVENT_SYS, /* 0x10 */
+ IQS62X_EVENT_NONE,
+ IQS62X_EVENT_PROX, /* 0x12 */
+ IQS62X_EVENT_HYST, /* 0x13 */
+ IQS62X_EVENT_NONE,
+ IQS62X_EVENT_NONE,
+ IQS62X_EVENT_HALL, /* 0x16 */
+ IQS62X_EVENT_NONE,
+ IQS62X_EVENT_NONE,
+ IQS62X_EVENT_NONE,
+ },
+ [IQS62X_UI_SAR1] = {
+ IQS62X_EVENT_SYS, /* 0x10 */
+ IQS62X_EVENT_NONE,
+ IQS62X_EVENT_NONE,
+ IQS62X_EVENT_HYST, /* 0x13 */
+ IQS62X_EVENT_NONE,
+ IQS62X_EVENT_NONE,
+ IQS62X_EVENT_HALL, /* 0x16 */
+ IQS62X_EVENT_NONE,
+ IQS62X_EVENT_NONE,
+ IQS62X_EVENT_NONE,
+ },
+};
+
+static const enum iqs62x_event_reg iqs621_event_regs[][IQS62X_EVENT_SIZE] = {
+ [IQS62X_UI_PROX] = {
+ IQS62X_EVENT_SYS, /* 0x10 */
+ IQS62X_EVENT_NONE,
+ IQS62X_EVENT_PROX, /* 0x12 */
+ IQS62X_EVENT_HYST, /* 0x13 */
+ IQS62X_EVENT_NONE,
+ IQS62X_EVENT_NONE,
+ IQS62X_EVENT_ALS, /* 0x16 */
+ IQS62X_EVENT_UI_LO, /* 0x17 */
+ IQS62X_EVENT_UI_HI, /* 0x18 */
+ IQS62X_EVENT_HALL, /* 0x19 */
+ },
+};
+
+static const enum iqs62x_event_reg iqs622_event_regs[][IQS62X_EVENT_SIZE] = {
+ [IQS62X_UI_PROX] = {
+ IQS62X_EVENT_SYS, /* 0x10 */
+ IQS62X_EVENT_NONE,
+ IQS62X_EVENT_PROX, /* 0x12 */
+ IQS62X_EVENT_NONE,
+ IQS62X_EVENT_ALS, /* 0x14 */
+ IQS62X_EVENT_NONE,
+ IQS62X_EVENT_IR, /* 0x16 */
+ IQS62X_EVENT_UI_LO, /* 0x17 */
+ IQS62X_EVENT_UI_HI, /* 0x18 */
+ IQS62X_EVENT_HALL, /* 0x19 */
+ },
+ [IQS62X_UI_SAR1] = {
+ IQS62X_EVENT_SYS, /* 0x10 */
+ IQS62X_EVENT_NONE,
+ IQS62X_EVENT_NONE,
+ IQS62X_EVENT_HYST, /* 0x13 */
+ IQS62X_EVENT_ALS, /* 0x14 */
+ IQS62X_EVENT_NONE,
+ IQS62X_EVENT_IR, /* 0x16 */
+ IQS62X_EVENT_UI_LO, /* 0x17 */
+ IQS62X_EVENT_UI_HI, /* 0x18 */
+ IQS62X_EVENT_HALL, /* 0x19 */
+ },
+};
+
+static const enum iqs62x_event_reg iqs624_event_regs[][IQS62X_EVENT_SIZE] = {
+ [IQS62X_UI_PROX] = {
+ IQS62X_EVENT_SYS, /* 0x10 */
+ IQS62X_EVENT_NONE,
+ IQS62X_EVENT_PROX, /* 0x12 */
+ IQS62X_EVENT_NONE,
+ IQS62X_EVENT_WHEEL, /* 0x14 */
+ IQS62X_EVENT_NONE,
+ IQS62X_EVENT_UI_LO, /* 0x16 */
+ IQS62X_EVENT_UI_HI, /* 0x17 */
+ IQS62X_EVENT_INTER, /* 0x18 */
+ IQS62X_EVENT_NONE,
+ },
+};
+
+static const enum iqs62x_event_reg iqs625_event_regs[][IQS62X_EVENT_SIZE] = {
+ [IQS62X_UI_PROX] = {
+ IQS62X_EVENT_SYS, /* 0x10 */
+ IQS62X_EVENT_PROX, /* 0x11 */
+ IQS62X_EVENT_INTER, /* 0x12 */
+ IQS62X_EVENT_NONE,
+ IQS62X_EVENT_NONE,
+ IQS62X_EVENT_NONE,
+ IQS62X_EVENT_NONE,
+ IQS62X_EVENT_NONE,
+ IQS62X_EVENT_NONE,
+ IQS62X_EVENT_NONE,
+ },
+};
+
+static const struct iqs62x_dev_desc iqs62x_devs[] = {
+ {
+ .dev_name = "iqs620at",
+ .sub_devs = iqs620at_sub_devs,
+ .num_sub_devs = ARRAY_SIZE(iqs620at_sub_devs),
+
+ .prod_num = IQS620_PROD_NUM,
+ .sw_num = 0x08,
+ .cal_regs = iqs620at_cal_regs,
+ .num_cal_regs = ARRAY_SIZE(iqs620at_cal_regs),
+
+ .prox_mask = BIT(0),
+ .sar_mask = BIT(1) | BIT(7),
+ .hall_mask = BIT(2),
+ .hyst_mask = BIT(3),
+ .temp_mask = BIT(4),
+
+ .prox_settings = IQS620_PROX_SETTINGS_4,
+ .hall_flags = IQS620_HALL_FLAGS,
+
+ .clk_div = 4,
+ .fw_name = "iqs620a.bin",
+ .event_regs = &iqs620a_event_regs[IQS62X_UI_PROX],
+ },
+ {
+ .dev_name = "iqs620a",
+ .sub_devs = iqs620a_sub_devs,
+ .num_sub_devs = ARRAY_SIZE(iqs620a_sub_devs),
+
+ .prod_num = IQS620_PROD_NUM,
+ .sw_num = 0x08,
+
+ .prox_mask = BIT(0),
+ .sar_mask = BIT(1) | BIT(7),
+ .hall_mask = BIT(2),
+ .hyst_mask = BIT(3),
+ .temp_mask = BIT(4),
+
+ .prox_settings = IQS620_PROX_SETTINGS_4,
+ .hall_flags = IQS620_HALL_FLAGS,
+
+ .clk_div = 4,
+ .fw_name = "iqs620a.bin",
+ .event_regs = &iqs620a_event_regs[IQS62X_UI_PROX],
+ },
+ {
+ .dev_name = "iqs621",
+ .sub_devs = iqs621_sub_devs,
+ .num_sub_devs = ARRAY_SIZE(iqs621_sub_devs),
+
+ .prod_num = IQS621_PROD_NUM,
+ .sw_num = 0x09,
+ .cal_regs = iqs621_cal_regs,
+ .num_cal_regs = ARRAY_SIZE(iqs621_cal_regs),
+
+ .prox_mask = BIT(0),
+ .hall_mask = BIT(1),
+ .als_mask = BIT(2),
+ .hyst_mask = BIT(3),
+ .temp_mask = BIT(4),
+
+ .als_flags = IQS621_ALS_FLAGS,
+ .hall_flags = IQS621_HALL_FLAGS,
+ .hyst_shift = 5,
+
+ .clk_div = 2,
+ .fw_name = "iqs621.bin",
+ .event_regs = &iqs621_event_regs[IQS62X_UI_PROX],
+ },
+ {
+ .dev_name = "iqs622",
+ .sub_devs = iqs622_sub_devs,
+ .num_sub_devs = ARRAY_SIZE(iqs622_sub_devs),
+
+ .prod_num = IQS622_PROD_NUM,
+ .sw_num = 0x06,
+
+ .prox_mask = BIT(0),
+ .sar_mask = BIT(1),
+ .hall_mask = BIT(2),
+ .als_mask = BIT(3),
+ .ir_mask = BIT(4),
+
+ .prox_settings = IQS622_PROX_SETTINGS_4,
+ .als_flags = IQS622_ALS_FLAGS,
+ .hall_flags = IQS622_HALL_FLAGS,
+
+ .clk_div = 2,
+ .fw_name = "iqs622.bin",
+ .event_regs = &iqs622_event_regs[IQS62X_UI_PROX],
+ },
+ {
+ .dev_name = "iqs624",
+ .sub_devs = iqs624_sub_devs,
+ .num_sub_devs = ARRAY_SIZE(iqs624_sub_devs),
+
+ .prod_num = IQS624_PROD_NUM,
+ .sw_num = 0x0B,
+
+ .interval = IQS624_INTERVAL_NUM,
+ .interval_div = 3,
+
+ .clk_div = 2,
+ .fw_name = "iqs624.bin",
+ .event_regs = &iqs624_event_regs[IQS62X_UI_PROX],
+ },
+ {
+ .dev_name = "iqs625",
+ .sub_devs = iqs625_sub_devs,
+ .num_sub_devs = ARRAY_SIZE(iqs625_sub_devs),
+
+ .prod_num = IQS625_PROD_NUM,
+ .sw_num = 0x0B,
+
+ .interval = IQS625_INTERVAL_NUM,
+ .interval_div = 10,
+
+ .clk_div = 2,
+ .fw_name = "iqs625.bin",
+ .event_regs = &iqs625_event_regs[IQS62X_UI_PROX],
+ },
+};
+
+static const struct regmap_config iqs62x_map_config = {
+ .reg_bits = 8,
+ .val_bits = 8,
+ .max_register = IQS62X_MAX_REG,
+};
+
+static int iqs62x_probe(struct i2c_client *client)
+{
+ struct iqs62x_core *iqs62x;
+ struct iqs62x_info info;
+ unsigned int val;
+ int ret, i, j;
+ u8 sw_num = 0;
+ const char *fw_name = NULL;
+
+ iqs62x = devm_kzalloc(&client->dev, sizeof(*iqs62x), GFP_KERNEL);
+ if (!iqs62x)
+ return -ENOMEM;
+
+ i2c_set_clientdata(client, iqs62x);
+ iqs62x->client = client;
+
+ BLOCKING_INIT_NOTIFIER_HEAD(&iqs62x->nh);
+ INIT_LIST_HEAD(&iqs62x->fw_blk_head);
+ init_completion(&iqs62x->fw_done);
+
+ iqs62x->regmap = devm_regmap_init_i2c(client, &iqs62x_map_config);
+ if (IS_ERR(iqs62x->regmap)) {
+ ret = PTR_ERR(iqs62x->regmap);
+ dev_err(&client->dev, "Failed to initialize register map: %d\n",
+ ret);
+ return ret;
+ }
+
+ ret = regmap_raw_read(iqs62x->regmap, IQS62X_PROD_NUM, &info,
+ sizeof(info));
+ if (ret)
+ return ret;
+
+ /*
+ * The following sequence validates the device's product and software
+ * numbers. It then determines if the device is factory-calibrated by
+ * checking for nonzero values in the device's designated calibration
+ * registers (if applicable). Depending on the device, the absence of
+ * calibration data indicates a reduced feature set or invalid device.
+ *
+ * For devices given in both calibrated and uncalibrated versions, the
+ * calibrated version (e.g. IQS620AT) appears first in the iqs62x_devs
+ * array. The uncalibrated version (e.g. IQS620A) appears next and has
+ * the same product and software numbers, but no calibration registers
+ * are specified.
+ */
+ for (i = 0; i < ARRAY_SIZE(iqs62x_devs); i++) {
+ if (info.prod_num != iqs62x_devs[i].prod_num)
+ continue;
+
+ iqs62x->dev_desc = &iqs62x_devs[i];
+
+ if (info.sw_num < iqs62x->dev_desc->sw_num)
+ continue;
+
+ sw_num = info.sw_num;
+
+ /*
+ * Read each of the device's designated calibration registers,
+ * if any, and exit from the inner loop early if any are equal
+ * to zero (indicating the device is uncalibrated). This could
+ * be acceptable depending on the device (e.g. IQS620A instead
+ * of IQS620AT).
+ */
+ for (j = 0; j < iqs62x->dev_desc->num_cal_regs; j++) {
+ ret = regmap_read(iqs62x->regmap,
+ iqs62x->dev_desc->cal_regs[j], &val);
+ if (ret)
+ return ret;
+
+ if (!val)
+ break;
+ }
+
+ /*
+ * If the number of nonzero values read from the device equals
+ * the number of designated calibration registers (which could
+ * be zero), exit from the outer loop early to signal that the
+ * device's product and software numbers match a known device,
+ * and the device is calibrated (if applicable).
+ */
+ if (j == iqs62x->dev_desc->num_cal_regs)
+ break;
+ }
+
+ if (!iqs62x->dev_desc) {
+ dev_err(&client->dev, "Unrecognized product number: 0x%02X\n",
+ info.prod_num);
+ return -EINVAL;
+ }
+
+ if (!sw_num) {
+ dev_err(&client->dev, "Unrecognized software number: 0x%02X\n",
+ info.sw_num);
+ return -EINVAL;
+ }
+
+ if (i == ARRAY_SIZE(iqs62x_devs)) {
+ dev_err(&client->dev, "Uncalibrated device\n");
+ return -ENODATA;
+ }
+
+ device_property_read_string(&client->dev, "firmware-name", &fw_name);
+
+ ret = request_firmware_nowait(THIS_MODULE, FW_ACTION_HOTPLUG,
+ fw_name ? : iqs62x->dev_desc->fw_name,
+ &client->dev, GFP_KERNEL, iqs62x,
+ iqs62x_firmware_load);
+ if (ret)
+ dev_err(&client->dev, "Failed to request firmware: %d\n", ret);
+
+ return ret;
+}
+
+static int iqs62x_remove(struct i2c_client *client)
+{
+ struct iqs62x_core *iqs62x = i2c_get_clientdata(client);
+
+ wait_for_completion(&iqs62x->fw_done);
+
+ return 0;
+}
+
+static int __maybe_unused iqs62x_suspend(struct device *dev)
+{
+ struct iqs62x_core *iqs62x = dev_get_drvdata(dev);
+ int ret;
+
+ wait_for_completion(&iqs62x->fw_done);
+
+ /*
+ * As per the datasheet, automatic mode switching must be disabled
+ * before the device is placed in or taken out of halt mode.
+ */
+ ret = regmap_update_bits(iqs62x->regmap, IQS62X_PWR_SETTINGS,
+ IQS62X_PWR_SETTINGS_DIS_AUTO, 0xFF);
+ if (ret)
+ return ret;
+
+ return regmap_update_bits(iqs62x->regmap, IQS62X_PWR_SETTINGS,
+ IQS62X_PWR_SETTINGS_PWR_MODE_MASK,
+ IQS62X_PWR_SETTINGS_PWR_MODE_HALT);
+}
+
+static int __maybe_unused iqs62x_resume(struct device *dev)
+{
+ struct iqs62x_core *iqs62x = dev_get_drvdata(dev);
+ int ret;
+
+ ret = regmap_update_bits(iqs62x->regmap, IQS62X_PWR_SETTINGS,
+ IQS62X_PWR_SETTINGS_PWR_MODE_MASK,
+ IQS62X_PWR_SETTINGS_PWR_MODE_NORM);
+ if (ret)
+ return ret;
+
+ return regmap_update_bits(iqs62x->regmap, IQS62X_PWR_SETTINGS,
+ IQS62X_PWR_SETTINGS_DIS_AUTO, 0);
+}
+
+static SIMPLE_DEV_PM_OPS(iqs62x_pm, iqs62x_suspend, iqs62x_resume);
+
+static const struct of_device_id iqs62x_of_match[] = {
+ { .compatible = "azoteq,iqs620a" },
+ { .compatible = "azoteq,iqs621" },
+ { .compatible = "azoteq,iqs622" },
+ { .compatible = "azoteq,iqs624" },
+ { .compatible = "azoteq,iqs625" },
+ { }
+};
+MODULE_DEVICE_TABLE(of, iqs62x_of_match);
+
+static struct i2c_driver iqs62x_i2c_driver = {
+ .driver = {
+ .name = "iqs62x",
+ .of_match_table = iqs62x_of_match,
+ .pm = &iqs62x_pm,
+ },
+ .probe_new = iqs62x_probe,
+ .remove = iqs62x_remove,
+};
+module_i2c_driver(iqs62x_i2c_driver);
+
+MODULE_AUTHOR("Jeff LaBundy <jeff@labundy.com>");
+MODULE_DESCRIPTION("Azoteq IQS620A/621/622/624/625 Multi-Function Sensors");
+MODULE_LICENSE("GPL");
diff --git a/drivers/mfd/omap-usb-host.c b/drivers/mfd/omap-usb-host.c
index 4798d9f3f9d5..1f4f01b02d98 100644
--- a/drivers/mfd/omap-usb-host.c
+++ b/drivers/mfd/omap-usb-host.c
@@ -840,7 +840,7 @@ MODULE_DEVICE_TABLE(of, usbhs_omap_dt_ids);
static struct platform_driver usbhs_omap_driver = {
.driver = {
- .name = (char *)usbhs_driver_name,
+ .name = usbhs_driver_name,
.pm = &usbhsomap_dev_pm_ops,
.of_match_table = usbhs_omap_dt_ids,
},
diff --git a/drivers/mfd/omap-usb-tll.c b/drivers/mfd/omap-usb-tll.c
index 265f5e350e1c..4b7f73c317e8 100644
--- a/drivers/mfd/omap-usb-tll.c
+++ b/drivers/mfd/omap-usb-tll.c
@@ -99,7 +99,7 @@
struct usbtll_omap {
void __iomem *base;
int nch; /* num. of channels */
- struct clk *ch_clk[0]; /* must be the last member */
+ struct clk *ch_clk[]; /* must be the last member */
};
/*-------------------------------------------------------------------------*/
@@ -304,7 +304,7 @@ MODULE_DEVICE_TABLE(of, usbtll_omap_dt_ids);
static struct platform_driver usbtll_omap_driver = {
.driver = {
- .name = (char *)usbtll_driver_name,
+ .name = usbtll_driver_name,
.of_match_table = usbtll_omap_dt_ids,
},
.probe = usbtll_omap_probe,
diff --git a/drivers/mfd/qcom-pm8xxx.c b/drivers/mfd/qcom-pm8xxx.c
index 29133326c6fd..acd172ddcbd6 100644
--- a/drivers/mfd/qcom-pm8xxx.c
+++ b/drivers/mfd/qcom-pm8xxx.c
@@ -76,7 +76,7 @@ struct pm_irq_chip {
unsigned int num_masters;
const struct pm_irq_data *pm_irq_data;
/* MUST BE AT THE END OF THIS STRUCT */
- u8 config[0];
+ u8 config[];
};
static int pm8xxx_read_block_irq(struct pm_irq_chip *chip, unsigned int bp,
diff --git a/drivers/mfd/rk808.c b/drivers/mfd/rk808.c
index a69a6742ecdc..d109b9f14407 100644
--- a/drivers/mfd/rk808.c
+++ b/drivers/mfd/rk808.c
@@ -19,7 +19,6 @@
#include <linux/module.h>
#include <linux/of_device.h>
#include <linux/regmap.h>
-#include <linux/syscore_ops.h>
struct rk808_reg_data {
int addr;
@@ -186,7 +185,6 @@ static const struct rk808_reg_data rk805_pre_init_reg[] = {
{RK805_BUCK4_CONFIG_REG, RK805_BUCK3_4_ILMAX_MASK,
RK805_BUCK4_ILMAX_3500MA},
{RK805_BUCK4_CONFIG_REG, BUCK_ILMIN_MASK, BUCK_ILMIN_400MA},
- {RK805_GPIO_IO_POL_REG, SLP_SD_MSK, SLEEP_FUN},
{RK805_THERMAL_REG, TEMP_HOTDIE_MSK, TEMP115C},
};
@@ -449,88 +447,60 @@ static const struct regmap_irq_chip rk818_irq_chip = {
static struct i2c_client *rk808_i2c_client;
-static void rk805_device_shutdown(void)
+static void rk808_pm_power_off(void)
{
int ret;
+ unsigned int reg, bit;
struct rk808 *rk808 = i2c_get_clientdata(rk808_i2c_client);
- if (!rk808)
- return;
-
- ret = regmap_update_bits(rk808->regmap,
- RK805_DEV_CTRL_REG,
- DEV_OFF, DEV_OFF);
- if (ret)
- dev_err(&rk808_i2c_client->dev, "Failed to shutdown device!\n");
-}
-
-static void rk805_device_shutdown_prepare(void)
-{
- int ret;
- struct rk808 *rk808 = i2c_get_clientdata(rk808_i2c_client);
-
- if (!rk808)
- return;
-
- ret = regmap_update_bits(rk808->regmap,
- RK805_GPIO_IO_POL_REG,
- SLP_SD_MSK, SHUTDOWN_FUN);
- if (ret)
- dev_err(&rk808_i2c_client->dev, "Failed to shutdown device!\n");
-}
-
-static void rk808_device_shutdown(void)
-{
- int ret;
- struct rk808 *rk808 = i2c_get_clientdata(rk808_i2c_client);
-
- if (!rk808)
- return;
-
- ret = regmap_update_bits(rk808->regmap,
- RK808_DEVCTRL_REG,
- DEV_OFF_RST, DEV_OFF_RST);
- if (ret)
- dev_err(&rk808_i2c_client->dev, "Failed to shutdown device!\n");
-}
-
-static void rk818_device_shutdown(void)
-{
- int ret;
- struct rk808 *rk808 = i2c_get_clientdata(rk808_i2c_client);
-
- if (!rk808)
+ switch (rk808->variant) {
+ case RK805_ID:
+ reg = RK805_DEV_CTRL_REG;
+ bit = DEV_OFF;
+ break;
+ case RK808_ID:
+ reg = RK808_DEVCTRL_REG,
+ bit = DEV_OFF_RST;
+ break;
+ case RK818_ID:
+ reg = RK818_DEVCTRL_REG;
+ bit = DEV_OFF;
+ break;
+ default:
return;
-
- ret = regmap_update_bits(rk808->regmap,
- RK818_DEVCTRL_REG,
- DEV_OFF, DEV_OFF);
+ }
+ ret = regmap_update_bits(rk808->regmap, reg, bit, bit);
if (ret)
dev_err(&rk808_i2c_client->dev, "Failed to shutdown device!\n");
}
-static void rk8xx_syscore_shutdown(void)
+static void rk8xx_shutdown(struct i2c_client *client)
{
- struct rk808 *rk808 = i2c_get_clientdata(rk808_i2c_client);
+ struct rk808 *rk808 = i2c_get_clientdata(client);
int ret;
- if (system_state == SYSTEM_POWER_OFF &&
- (rk808->variant == RK809_ID || rk808->variant == RK817_ID)) {
+ switch (rk808->variant) {
+ case RK805_ID:
+ ret = regmap_update_bits(rk808->regmap,
+ RK805_GPIO_IO_POL_REG,
+ SLP_SD_MSK,
+ SHUTDOWN_FUN);
+ break;
+ case RK809_ID:
+ case RK817_ID:
ret = regmap_update_bits(rk808->regmap,
RK817_SYS_CFG(3),
RK817_SLPPIN_FUNC_MSK,
SLPPIN_DN_FUN);
- if (ret) {
- dev_warn(&rk808_i2c_client->dev,
- "Cannot switch to power down function\n");
- }
+ break;
+ default:
+ return;
}
+ if (ret)
+ dev_warn(&client->dev,
+ "Cannot switch to power down function\n");
}
-static struct syscore_ops rk808_syscore_ops = {
- .shutdown = rk8xx_syscore_shutdown,
-};
-
static const struct of_device_id rk808_of_match[] = {
{ .compatible = "rockchip,rk805" },
{ .compatible = "rockchip,rk808" },
@@ -550,7 +520,7 @@ static int rk808_probe(struct i2c_client *client,
const struct mfd_cell *cells;
int nr_pre_init_regs;
int nr_cells;
- int pm_off = 0, msb, lsb;
+ int msb, lsb;
unsigned char pmic_id_msb, pmic_id_lsb;
int ret;
int i;
@@ -594,8 +564,6 @@ static int rk808_probe(struct i2c_client *client,
nr_pre_init_regs = ARRAY_SIZE(rk805_pre_init_reg);
cells = rk805s;
nr_cells = ARRAY_SIZE(rk805s);
- rk808->pm_pwroff_fn = rk805_device_shutdown;
- rk808->pm_pwroff_prep_fn = rk805_device_shutdown_prepare;
break;
case RK808_ID:
rk808->regmap_cfg = &rk808_regmap_config;
@@ -604,7 +572,6 @@ static int rk808_probe(struct i2c_client *client,
nr_pre_init_regs = ARRAY_SIZE(rk808_pre_init_reg);
cells = rk808s;
nr_cells = ARRAY_SIZE(rk808s);
- rk808->pm_pwroff_fn = rk808_device_shutdown;
break;
case RK818_ID:
rk808->regmap_cfg = &rk818_regmap_config;
@@ -613,7 +580,6 @@ static int rk808_probe(struct i2c_client *client,
nr_pre_init_regs = ARRAY_SIZE(rk818_pre_init_reg);
cells = rk818s;
nr_cells = ARRAY_SIZE(rk818s);
- rk808->pm_pwroff_fn = rk818_device_shutdown;
break;
case RK809_ID:
case RK817_ID:
@@ -623,7 +589,6 @@ static int rk808_probe(struct i2c_client *client,
nr_pre_init_regs = ARRAY_SIZE(rk817_pre_init_reg);
cells = rk817s;
nr_cells = ARRAY_SIZE(rk817s);
- register_syscore_ops(&rk808_syscore_ops);
break;
default:
dev_err(&client->dev, "Unsupported RK8XX ID %lu\n",
@@ -674,17 +639,9 @@ static int rk808_probe(struct i2c_client *client,
goto err_irq;
}
- pm_off = of_property_read_bool(np,
- "rockchip,system-power-controller");
- if (pm_off && !pm_power_off) {
+ if (of_property_read_bool(np, "rockchip,system-power-controller")) {
rk808_i2c_client = client;
- pm_power_off = rk808->pm_pwroff_fn;
- }
-
- if (pm_off && !pm_power_off_prepare) {
- if (!rk808_i2c_client)
- rk808_i2c_client = client;
- pm_power_off_prepare = rk808->pm_pwroff_prep_fn;
+ pm_power_off = rk808_pm_power_off;
}
return 0;
@@ -704,25 +661,24 @@ static int rk808_remove(struct i2c_client *client)
* pm_power_off may points to a function from another module.
* Check if the pointer is set by us and only then overwrite it.
*/
- if (rk808->pm_pwroff_fn && pm_power_off == rk808->pm_pwroff_fn)
+ if (pm_power_off == rk808_pm_power_off)
pm_power_off = NULL;
- /**
- * As above, check if the pointer is set by us before overwrite.
- */
- if (rk808->pm_pwroff_prep_fn &&
- pm_power_off_prepare == rk808->pm_pwroff_prep_fn)
- pm_power_off_prepare = NULL;
-
return 0;
}
static int __maybe_unused rk8xx_suspend(struct device *dev)
{
- struct rk808 *rk808 = i2c_get_clientdata(rk808_i2c_client);
+ struct rk808 *rk808 = i2c_get_clientdata(to_i2c_client(dev));
int ret = 0;
switch (rk808->variant) {
+ case RK805_ID:
+ ret = regmap_update_bits(rk808->regmap,
+ RK805_GPIO_IO_POL_REG,
+ SLP_SD_MSK,
+ SLEEP_FUN);
+ break;
case RK809_ID:
case RK817_ID:
ret = regmap_update_bits(rk808->regmap,
@@ -739,7 +695,7 @@ static int __maybe_unused rk8xx_suspend(struct device *dev)
static int __maybe_unused rk8xx_resume(struct device *dev)
{
- struct rk808 *rk808 = i2c_get_clientdata(rk808_i2c_client);
+ struct rk808 *rk808 = i2c_get_clientdata(to_i2c_client(dev));
int ret = 0;
switch (rk808->variant) {
@@ -766,6 +722,7 @@ static struct i2c_driver rk808_i2c_driver = {
},
.probe = rk808_probe,
.remove = rk808_remove,
+ .shutdown = rk8xx_shutdown,
};
module_i2c_driver(rk808_i2c_driver);
diff --git a/drivers/mfd/rn5t618.c b/drivers/mfd/rn5t618.c
index ead2e79036a9..232de50562f9 100644
--- a/drivers/mfd/rn5t618.c
+++ b/drivers/mfd/rn5t618.c
@@ -8,10 +8,13 @@
#include <linux/delay.h>
#include <linux/i2c.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
#include <linux/mfd/core.h>
#include <linux/mfd/rn5t618.h>
#include <linux/module.h>
#include <linux/of_device.h>
+#include <linux/platform_device.h>
#include <linux/reboot.h>
#include <linux/regmap.h>
@@ -20,6 +23,13 @@ static const struct mfd_cell rn5t618_cells[] = {
{ .name = "rn5t618-wdt" },
};
+static const struct mfd_cell rc5t619_cells[] = {
+ { .name = "rn5t618-adc" },
+ { .name = "rn5t618-regulator" },
+ { .name = "rc5t619-rtc" },
+ { .name = "rn5t618-wdt" },
+};
+
static bool rn5t618_volatile_reg(struct device *dev, unsigned int reg)
{
switch (reg) {
@@ -32,6 +42,8 @@ static bool rn5t618_volatile_reg(struct device *dev, unsigned int reg)
case RN5T618_IR_GPF:
case RN5T618_MON_IOIN:
case RN5T618_INTMON:
+ case RN5T618_RTC_CTRL1 ... RN5T618_RTC_CTRL2:
+ case RN5T618_RTC_SECONDS ... RN5T618_RTC_YEAR:
return true;
default:
return false;
@@ -46,9 +58,56 @@ static const struct regmap_config rn5t618_regmap_config = {
.cache_type = REGCACHE_RBTREE,
};
+static const struct regmap_irq rc5t619_irqs[] = {
+ REGMAP_IRQ_REG(RN5T618_IRQ_SYS, 0, BIT(0)),
+ REGMAP_IRQ_REG(RN5T618_IRQ_DCDC, 0, BIT(1)),
+ REGMAP_IRQ_REG(RN5T618_IRQ_RTC, 0, BIT(2)),
+ REGMAP_IRQ_REG(RN5T618_IRQ_ADC, 0, BIT(3)),
+ REGMAP_IRQ_REG(RN5T618_IRQ_GPIO, 0, BIT(4)),
+ REGMAP_IRQ_REG(RN5T618_IRQ_CHG, 0, BIT(6)),
+};
+
+static const struct regmap_irq_chip rc5t619_irq_chip = {
+ .name = "rc5t619",
+ .irqs = rc5t619_irqs,
+ .num_irqs = ARRAY_SIZE(rc5t619_irqs),
+ .num_regs = 1,
+ .status_base = RN5T618_INTMON,
+ .mask_base = RN5T618_INTEN,
+ .mask_invert = true,
+};
+
static struct rn5t618 *rn5t618_pm_power_off;
static struct notifier_block rn5t618_restart_handler;
+static int rn5t618_irq_init(struct rn5t618 *rn5t618)
+{
+ const struct regmap_irq_chip *irq_chip = NULL;
+ int ret;
+
+ if (!rn5t618->irq)
+ return 0;
+
+ switch (rn5t618->variant) {
+ case RC5T619:
+ irq_chip = &rc5t619_irq_chip;
+ break;
+ default:
+ dev_err(rn5t618->dev, "Currently no IRQ support for variant %d\n",
+ (int)rn5t618->variant);
+ return -ENOENT;
+ }
+
+ ret = devm_regmap_add_irq_chip(rn5t618->dev, rn5t618->regmap,
+ rn5t618->irq,
+ IRQF_TRIGGER_FALLING | IRQF_ONESHOT,
+ 0, irq_chip, &rn5t618->irq_data);
+ if (ret)
+ dev_err(rn5t618->dev, "Failed to register IRQ chip\n");
+
+ return ret;
+}
+
static void rn5t618_trigger_poweroff_sequence(bool repower)
{
/* disable automatic repower-on */
@@ -87,8 +146,7 @@ static const struct of_device_id rn5t618_of_match[] = {
};
MODULE_DEVICE_TABLE(of, rn5t618_of_match);
-static int rn5t618_i2c_probe(struct i2c_client *i2c,
- const struct i2c_device_id *id)
+static int rn5t618_i2c_probe(struct i2c_client *i2c)
{
const struct of_device_id *of_id;
struct rn5t618 *priv;
@@ -106,6 +164,8 @@ static int rn5t618_i2c_probe(struct i2c_client *i2c,
i2c_set_clientdata(i2c, priv);
priv->variant = (long)of_id->data;
+ priv->irq = i2c->irq;
+ priv->dev = &i2c->dev;
priv->regmap = devm_regmap_init_i2c(i2c, &rn5t618_regmap_config);
if (IS_ERR(priv->regmap)) {
@@ -114,8 +174,16 @@ static int rn5t618_i2c_probe(struct i2c_client *i2c,
return ret;
}
- ret = devm_mfd_add_devices(&i2c->dev, -1, rn5t618_cells,
- ARRAY_SIZE(rn5t618_cells), NULL, 0, NULL);
+ if (priv->variant == RC5T619)
+ ret = devm_mfd_add_devices(&i2c->dev, PLATFORM_DEVID_NONE,
+ rc5t619_cells,
+ ARRAY_SIZE(rc5t619_cells),
+ NULL, 0, NULL);
+ else
+ ret = devm_mfd_add_devices(&i2c->dev, PLATFORM_DEVID_NONE,
+ rn5t618_cells,
+ ARRAY_SIZE(rn5t618_cells),
+ NULL, 0, NULL);
if (ret) {
dev_err(&i2c->dev, "failed to add sub-devices: %d\n", ret);
return ret;
@@ -138,7 +206,7 @@ static int rn5t618_i2c_probe(struct i2c_client *i2c,
return ret;
}
- return 0;
+ return rn5t618_irq_init(priv);
}
static int rn5t618_i2c_remove(struct i2c_client *i2c)
@@ -155,19 +223,38 @@ static int rn5t618_i2c_remove(struct i2c_client *i2c)
return 0;
}
-static const struct i2c_device_id rn5t618_i2c_id[] = {
- { }
-};
-MODULE_DEVICE_TABLE(i2c, rn5t618_i2c_id);
+static int __maybe_unused rn5t618_i2c_suspend(struct device *dev)
+{
+ struct rn5t618 *priv = dev_get_drvdata(dev);
+
+ if (priv->irq)
+ disable_irq(priv->irq);
+
+ return 0;
+}
+
+static int __maybe_unused rn5t618_i2c_resume(struct device *dev)
+{
+ struct rn5t618 *priv = dev_get_drvdata(dev);
+
+ if (priv->irq)
+ enable_irq(priv->irq);
+
+ return 0;
+}
+
+static SIMPLE_DEV_PM_OPS(rn5t618_i2c_dev_pm_ops,
+ rn5t618_i2c_suspend,
+ rn5t618_i2c_resume);
static struct i2c_driver rn5t618_i2c_driver = {
.driver = {
.name = "rn5t618",
.of_match_table = of_match_ptr(rn5t618_of_match),
+ .pm = &rn5t618_i2c_dev_pm_ops,
},
- .probe = rn5t618_i2c_probe,
+ .probe_new = rn5t618_i2c_probe,
.remove = rn5t618_i2c_remove,
- .id_table = rn5t618_i2c_id,
};
module_i2c_driver(rn5t618_i2c_driver);
diff --git a/drivers/mfd/sprd-sc27xx-spi.c b/drivers/mfd/sprd-sc27xx-spi.c
index c0529a1cd5ea..ebdf2f11ae28 100644
--- a/drivers/mfd/sprd-sc27xx-spi.c
+++ b/drivers/mfd/sprd-sc27xx-spi.c
@@ -10,6 +10,7 @@
#include <linux/of_device.h>
#include <linux/regmap.h>
#include <linux/spi/spi.h>
+#include <uapi/linux/usb/charger.h>
#define SPRD_PMIC_INT_MASK_STATUS 0x0
#define SPRD_PMIC_INT_RAW_STATUS 0x4
@@ -17,6 +18,16 @@
#define SPRD_SC2731_IRQ_BASE 0x140
#define SPRD_SC2731_IRQ_NUMS 16
+#define SPRD_SC2731_CHG_DET 0xedc
+
+/* PMIC charger detection definition */
+#define SPRD_PMIC_CHG_DET_DELAY_US 200000
+#define SPRD_PMIC_CHG_DET_TIMEOUT 2000000
+#define SPRD_PMIC_CHG_DET_DONE BIT(11)
+#define SPRD_PMIC_SDP_TYPE BIT(7)
+#define SPRD_PMIC_DCP_TYPE BIT(6)
+#define SPRD_PMIC_CDP_TYPE BIT(5)
+#define SPRD_PMIC_CHG_TYPE_MASK GENMASK(7, 5)
struct sprd_pmic {
struct regmap *regmap;
@@ -24,12 +35,14 @@ struct sprd_pmic {
struct regmap_irq *irqs;
struct regmap_irq_chip irq_chip;
struct regmap_irq_chip_data *irq_data;
+ const struct sprd_pmic_data *pdata;
int irq;
};
struct sprd_pmic_data {
u32 irq_base;
u32 num_irqs;
+ u32 charger_det;
};
/*
@@ -40,8 +53,46 @@ struct sprd_pmic_data {
static const struct sprd_pmic_data sc2731_data = {
.irq_base = SPRD_SC2731_IRQ_BASE,
.num_irqs = SPRD_SC2731_IRQ_NUMS,
+ .charger_det = SPRD_SC2731_CHG_DET,
};
+enum usb_charger_type sprd_pmic_detect_charger_type(struct device *dev)
+{
+ struct spi_device *spi = to_spi_device(dev);
+ struct sprd_pmic *ddata = spi_get_drvdata(spi);
+ const struct sprd_pmic_data *pdata = ddata->pdata;
+ enum usb_charger_type type;
+ u32 val;
+ int ret;
+
+ ret = regmap_read_poll_timeout(ddata->regmap, pdata->charger_det, val,
+ (val & SPRD_PMIC_CHG_DET_DONE),
+ SPRD_PMIC_CHG_DET_DELAY_US,
+ SPRD_PMIC_CHG_DET_TIMEOUT);
+ if (ret) {
+ dev_err(&spi->dev, "failed to detect charger type\n");
+ return UNKNOWN_TYPE;
+ }
+
+ switch (val & SPRD_PMIC_CHG_TYPE_MASK) {
+ case SPRD_PMIC_CDP_TYPE:
+ type = CDP_TYPE;
+ break;
+ case SPRD_PMIC_DCP_TYPE:
+ type = DCP_TYPE;
+ break;
+ case SPRD_PMIC_SDP_TYPE:
+ type = SDP_TYPE;
+ break;
+ default:
+ type = UNKNOWN_TYPE;
+ break;
+ }
+
+ return type;
+}
+EXPORT_SYMBOL_GPL(sprd_pmic_detect_charger_type);
+
static const struct mfd_cell sprd_pmic_devs[] = {
{
.name = "sc27xx-wdt",
@@ -181,6 +232,7 @@ static int sprd_pmic_probe(struct spi_device *spi)
spi_set_drvdata(spi, ddata);
ddata->dev = &spi->dev;
ddata->irq = spi->irq;
+ ddata->pdata = pdata;
ddata->irq_chip.name = dev_name(&spi->dev);
ddata->irq_chip.status_base =
diff --git a/drivers/misc/lkdtm/bugs.c b/drivers/misc/lkdtm/bugs.c
index cc92bc3ed820..886459e0ddd9 100644
--- a/drivers/misc/lkdtm/bugs.c
+++ b/drivers/misc/lkdtm/bugs.c
@@ -11,6 +11,7 @@
#include <linux/sched/signal.h>
#include <linux/sched/task_stack.h>
#include <linux/uaccess.h>
+#include <linux/slab.h>
#ifdef CONFIG_X86_32
#include <asm/desc.h>
@@ -175,6 +176,80 @@ void lkdtm_HUNG_TASK(void)
schedule();
}
+volatile unsigned int huge = INT_MAX - 2;
+volatile unsigned int ignored;
+
+void lkdtm_OVERFLOW_SIGNED(void)
+{
+ int value;
+
+ value = huge;
+ pr_info("Normal signed addition ...\n");
+ value += 1;
+ ignored = value;
+
+ pr_info("Overflowing signed addition ...\n");
+ value += 4;
+ ignored = value;
+}
+
+
+void lkdtm_OVERFLOW_UNSIGNED(void)
+{
+ unsigned int value;
+
+ value = huge;
+ pr_info("Normal unsigned addition ...\n");
+ value += 1;
+ ignored = value;
+
+ pr_info("Overflowing unsigned addition ...\n");
+ value += 4;
+ ignored = value;
+}
+
+/* Intentially using old-style flex array definition of 1 byte. */
+struct array_bounds_flex_array {
+ int one;
+ int two;
+ char data[1];
+};
+
+struct array_bounds {
+ int one;
+ int two;
+ char data[8];
+ int three;
+};
+
+void lkdtm_ARRAY_BOUNDS(void)
+{
+ struct array_bounds_flex_array *not_checked;
+ struct array_bounds *checked;
+ volatile int i;
+
+ not_checked = kmalloc(sizeof(*not_checked) * 2, GFP_KERNEL);
+ checked = kmalloc(sizeof(*checked) * 2, GFP_KERNEL);
+
+ pr_info("Array access within bounds ...\n");
+ /* For both, touch all bytes in the actual member size. */
+ for (i = 0; i < sizeof(checked->data); i++)
+ checked->data[i] = 'A';
+ /*
+ * For the uninstrumented flex array member, also touch 1 byte
+ * beyond to verify it is correctly uninstrumented.
+ */
+ for (i = 0; i < sizeof(not_checked->data) + 1; i++)
+ not_checked->data[i] = 'A';
+
+ pr_info("Array access beyond bounds ...\n");
+ for (i = 0; i < sizeof(checked->data) + 1; i++)
+ checked->data[i] = 'B';
+
+ kfree(not_checked);
+ kfree(checked);
+}
+
void lkdtm_CORRUPT_LIST_ADD(void)
{
/*
diff --git a/drivers/misc/lkdtm/core.c b/drivers/misc/lkdtm/core.c
index 5ce4ac8c06fc..a5e344df9166 100644
--- a/drivers/misc/lkdtm/core.c
+++ b/drivers/misc/lkdtm/core.c
@@ -130,6 +130,9 @@ static const struct crashtype crashtypes[] = {
CRASHTYPE(HARDLOCKUP),
CRASHTYPE(SPINLOCKUP),
CRASHTYPE(HUNG_TASK),
+ CRASHTYPE(OVERFLOW_SIGNED),
+ CRASHTYPE(OVERFLOW_UNSIGNED),
+ CRASHTYPE(ARRAY_BOUNDS),
CRASHTYPE(EXEC_DATA),
CRASHTYPE(EXEC_STACK),
CRASHTYPE(EXEC_KMALLOC),
diff --git a/drivers/misc/lkdtm/lkdtm.h b/drivers/misc/lkdtm/lkdtm.h
index 8d13d0176624..601a2156a0d4 100644
--- a/drivers/misc/lkdtm/lkdtm.h
+++ b/drivers/misc/lkdtm/lkdtm.h
@@ -22,6 +22,9 @@ void lkdtm_SOFTLOCKUP(void);
void lkdtm_HARDLOCKUP(void);
void lkdtm_SPINLOCKUP(void);
void lkdtm_HUNG_TASK(void);
+void lkdtm_OVERFLOW_SIGNED(void);
+void lkdtm_OVERFLOW_UNSIGNED(void);
+void lkdtm_ARRAY_BOUNDS(void);
void lkdtm_CORRUPT_LIST_ADD(void);
void lkdtm_CORRUPT_LIST_DEL(void);
void lkdtm_CORRUPT_USER_DS(void);
diff --git a/drivers/misc/mic/Kconfig b/drivers/misc/mic/Kconfig
index b6841ba6d922..8f201d019f5a 100644
--- a/drivers/misc/mic/Kconfig
+++ b/drivers/misc/mic/Kconfig
@@ -133,8 +133,4 @@ config VOP
OS and tools for MIC to use with this driver are available from
<http://software.intel.com/en-us/mic-developer>.
-if VOP
-source "drivers/vhost/Kconfig.vringh"
-endif
-
endmenu
diff --git a/drivers/mtd/ubi/fastmap-wl.c b/drivers/mtd/ubi/fastmap-wl.c
index 426820ab9afe..b486250923c5 100644
--- a/drivers/mtd/ubi/fastmap-wl.c
+++ b/drivers/mtd/ubi/fastmap-wl.c
@@ -39,6 +39,13 @@ static struct ubi_wl_entry *find_anchor_wl_entry(struct rb_root *root)
return victim;
}
+static inline void return_unused_peb(struct ubi_device *ubi,
+ struct ubi_wl_entry *e)
+{
+ wl_tree_add(e, &ubi->free);
+ ubi->free_count++;
+}
+
/**
* return_unused_pool_pebs - returns unused PEB to the free tree.
* @ubi: UBI device description object
@@ -52,8 +59,7 @@ static void return_unused_pool_pebs(struct ubi_device *ubi,
for (i = pool->used; i < pool->size; i++) {
e = ubi->lookuptbl[pool->pebs[i]];
- wl_tree_add(e, &ubi->free);
- ubi->free_count++;
+ return_unused_peb(ubi, e);
}
}
@@ -361,6 +367,11 @@ static void ubi_fastmap_close(struct ubi_device *ubi)
return_unused_pool_pebs(ubi, &ubi->fm_pool);
return_unused_pool_pebs(ubi, &ubi->fm_wl_pool);
+ if (ubi->fm_anchor) {
+ return_unused_peb(ubi, ubi->fm_anchor);
+ ubi->fm_anchor = NULL;
+ }
+
if (ubi->fm) {
for (i = 0; i < ubi->fm->used_blocks; i++)
kfree(ubi->fm->e[i]);
diff --git a/drivers/mtd/ubi/ubi-media.h b/drivers/mtd/ubi/ubi-media.h
index b5fe8f82281b..386db0598e95 100644
--- a/drivers/mtd/ubi/ubi-media.h
+++ b/drivers/mtd/ubi/ubi-media.h
@@ -498,6 +498,6 @@ struct ubi_fm_volhdr {
struct ubi_fm_eba {
__be32 magic;
__be32 reserved_pebs;
- __be32 pnum[0];
+ __be32 pnum[];
} __packed;
#endif /* !__UBI_MEDIA_H__ */
diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c
index 837d690a8c60..5146cce5fe32 100644
--- a/drivers/mtd/ubi/wl.c
+++ b/drivers/mtd/ubi/wl.c
@@ -1875,7 +1875,8 @@ int ubi_wl_init(struct ubi_device *ubi, struct ubi_attach_info *ai)
goto out_free;
#ifdef CONFIG_MTD_UBI_FASTMAP
- ubi_ensure_anchor_pebs(ubi);
+ if (!ubi->ro_mode && !ubi->fm_disabled)
+ ubi_ensure_anchor_pebs(ubi);
#endif
return 0;
diff --git a/drivers/net/caif/Kconfig b/drivers/net/caif/Kconfig
index e74e2bb61236..9db0570c5beb 100644
--- a/drivers/net/caif/Kconfig
+++ b/drivers/net/caif/Kconfig
@@ -58,8 +58,4 @@ config CAIF_VIRTIO
---help---
The CAIF driver for CAIF over Virtio.
-if CAIF_VIRTIO
-source "drivers/vhost/Kconfig.vringh"
-endif
-
endif # CAIF_DRIVERS
diff --git a/drivers/net/can/slcan.c b/drivers/net/can/slcan.c
index 086dfb1b9d0b..91cdc0a2b1a7 100644
--- a/drivers/net/can/slcan.c
+++ b/drivers/net/can/slcan.c
@@ -148,7 +148,7 @@ static void slc_bump(struct slcan *sl)
u32 tmpid;
char *cmd = sl->rbuff;
- cf.can_id = 0;
+ memset(&cf, 0, sizeof(cf));
switch (*cmd) {
case 'r':
@@ -187,8 +187,6 @@ static void slc_bump(struct slcan *sl)
else
return;
- *(u64 *) (&cf.data) = 0; /* clear payload */
-
/* RTR frames may have a dlc > 0 but they never have any data bytes */
if (!(cf.can_id & CAN_RTR_FLAG)) {
for (i = 0; i < cf.can_dlc; i++) {
diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index affa5c6e135c..c7ac63f41918 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -480,7 +480,7 @@ static int bcm_sf2_mdio_register(struct dsa_switch *ds)
priv->slave_mii_bus->parent = ds->dev->parent;
priv->slave_mii_bus->phy_mask = ~priv->indir_phy_mask;
- err = of_mdiobus_register(priv->slave_mii_bus, dn);
+ err = mdiobus_register(priv->slave_mii_bus);
if (err && dn)
of_node_put(dn);
@@ -1079,6 +1079,7 @@ static int bcm_sf2_sw_probe(struct platform_device *pdev)
const struct bcm_sf2_of_data *data;
struct b53_platform_data *pdata;
struct dsa_switch_ops *ops;
+ struct device_node *ports;
struct bcm_sf2_priv *priv;
struct b53_device *dev;
struct dsa_switch *ds;
@@ -1146,7 +1147,11 @@ static int bcm_sf2_sw_probe(struct platform_device *pdev)
set_bit(0, priv->cfp.used);
set_bit(0, priv->cfp.unique);
- bcm_sf2_identify_ports(priv, dn->child);
+ ports = of_find_node_by_name(dn, "ports");
+ if (ports) {
+ bcm_sf2_identify_ports(priv, ports);
+ of_node_put(ports);
+ }
priv->irq0 = irq_of_parse_and_map(dn, 0);
priv->irq1 = irq_of_parse_and_map(dn, 1);
diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c
index ef57552db260..2d0d91db0ddb 100644
--- a/drivers/net/dsa/mt7530.c
+++ b/drivers/net/dsa/mt7530.c
@@ -1403,6 +1403,9 @@ mt7530_setup(struct dsa_switch *ds)
continue;
phy_node = of_parse_phandle(mac_np, "phy-handle", 0);
+ if (!phy_node)
+ continue;
+
if (phy_node->parent == priv->dev->of_node->parent) {
ret = of_get_phy_mode(mac_np, &interface);
if (ret && ret != -ENODEV)
diff --git a/drivers/net/ethernet/aquantia/atlantic/macsec/macsec_api.c b/drivers/net/ethernet/aquantia/atlantic/macsec/macsec_api.c
index 97901c114bfa..fbe9d88b13c7 100644
--- a/drivers/net/ethernet/aquantia/atlantic/macsec/macsec_api.c
+++ b/drivers/net/ethernet/aquantia/atlantic/macsec/macsec_api.c
@@ -491,7 +491,7 @@ get_ingress_preclass_record(struct aq_hw_s *hw,
rec->snap[1] = packed_record[8] & 0xFF;
rec->llc = (packed_record[8] >> 8) & 0xFF;
- rec->llc = packed_record[9] << 8;
+ rec->llc |= packed_record[9] << 8;
rec->mac_sa[0] = packed_record[10];
rec->mac_sa[0] |= packed_record[11] << 16;
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c
index 9638d65d8261..517caedc0a87 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c
@@ -6874,7 +6874,8 @@ int bnx2x_link_update(struct link_params *params, struct link_vars *vars)
case PORT_HW_CFG_PHY_SELECTION_FIRST_PHY_PRIORITY:
/* In this option, the first PHY makes sure to pass the
* traffic through itself only.
- * Its not clear how to reset the link on the second phy
+ * It's not clear how to reset the link on the second
+ * phy.
*/
active_external_phy = EXT_PHY1;
break;
diff --git a/drivers/net/ethernet/cavium/common/cavium_ptp.h b/drivers/net/ethernet/cavium/common/cavium_ptp.h
index a04eccbc78e8..1e0ffe8f4152 100644
--- a/drivers/net/ethernet/cavium/common/cavium_ptp.h
+++ b/drivers/net/ethernet/cavium/common/cavium_ptp.h
@@ -24,7 +24,7 @@ struct cavium_ptp {
struct ptp_clock *ptp_clock;
};
-#if IS_ENABLED(CONFIG_CAVIUM_PTP)
+#if IS_REACHABLE(CONFIG_CAVIUM_PTP)
struct cavium_ptp *cavium_ptp_get(void);
void cavium_ptp_put(struct cavium_ptp *ptp);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 75fde0d4d493..a70018f067aa 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -3132,7 +3132,6 @@ static int cxgb_set_mac_addr(struct net_device *dev, void *p)
return ret;
memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
- pi->xact_addr_filt = ret;
return 0;
}
@@ -6672,6 +6671,10 @@ static void shutdown_one(struct pci_dev *pdev)
if (adapter->port[i]->reg_state == NETREG_REGISTERED)
cxgb_close(adapter->port[i]);
+ rtnl_lock();
+ cxgb4_mqprio_stop_offload(adapter);
+ rtnl_unlock();
+
if (is_uld(adapter)) {
detach_ulds(adapter);
t4_uld_clean_up(adapter);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.c
index ec3eb45ee3b4..e6af4906d674 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.c
@@ -301,6 +301,7 @@ static void cxgb4_mqprio_free_hw_resources(struct net_device *dev)
cxgb4_clear_msix_aff(eorxq->msix->vec,
eorxq->msix->aff_mask);
free_irq(eorxq->msix->vec, &eorxq->rspq);
+ cxgb4_free_msix_idx_in_bmap(adap, eorxq->msix->idx);
}
free_rspq_fl(adap, &eorxq->rspq, &eorxq->fl);
@@ -611,6 +612,28 @@ out:
return ret;
}
+void cxgb4_mqprio_stop_offload(struct adapter *adap)
+{
+ struct cxgb4_tc_port_mqprio *tc_port_mqprio;
+ struct net_device *dev;
+ u8 i;
+
+ if (!adap->tc_mqprio || !adap->tc_mqprio->port_mqprio)
+ return;
+
+ for_each_port(adap, i) {
+ dev = adap->port[i];
+ if (!dev)
+ continue;
+
+ tc_port_mqprio = &adap->tc_mqprio->port_mqprio[i];
+ if (!tc_port_mqprio->mqprio.qopt.num_tc)
+ continue;
+
+ cxgb4_mqprio_disable_offload(dev);
+ }
+}
+
int cxgb4_init_tc_mqprio(struct adapter *adap)
{
struct cxgb4_tc_port_mqprio *tc_port_mqprio, *port_mqprio;
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.h
index c532f1ef8451..ff8794132b22 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.h
@@ -38,6 +38,7 @@ struct cxgb4_tc_mqprio {
int cxgb4_setup_tc_mqprio(struct net_device *dev,
struct tc_mqprio_qopt_offload *mqprio);
+void cxgb4_mqprio_stop_offload(struct adapter *adap);
int cxgb4_init_tc_mqprio(struct adapter *adap);
void cxgb4_cleanup_tc_mqprio(struct adapter *adap);
#endif /* __CXGB4_TC_MQPRIO_H__ */
diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c
index 835b7816e372..87236206366f 100644
--- a/drivers/net/ethernet/faraday/ftgmac100.c
+++ b/drivers/net/ethernet/faraday/ftgmac100.c
@@ -1731,7 +1731,7 @@ static int ftgmac100_setup_clk(struct ftgmac100 *priv)
if (rc)
goto cleanup_clk;
- /* RCLK is for RMII, typically used for NCSI. Optional because its not
+ /* RCLK is for RMII, typically used for NCSI. Optional because it's not
* necessary if it's the AST2400 MAC, or the MAC is configured for
* RGMII, or the controller is not an ASPEED-based controller.
*/
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
index 2f76908cae73..51117a5a6bbf 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
@@ -150,14 +150,20 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp,
u8 prio = act->vlan.prio;
u16 vid = act->vlan.vid;
- return mlxsw_sp_acl_rulei_act_vlan(mlxsw_sp, rulei,
- act->id, vid,
- proto, prio, extack);
+ err = mlxsw_sp_acl_rulei_act_vlan(mlxsw_sp, rulei,
+ act->id, vid,
+ proto, prio, extack);
+ if (err)
+ return err;
+ break;
}
case FLOW_ACTION_PRIORITY:
- return mlxsw_sp_acl_rulei_act_priority(mlxsw_sp, rulei,
- act->priority,
- extack);
+ err = mlxsw_sp_acl_rulei_act_priority(mlxsw_sp, rulei,
+ act->priority,
+ extack);
+ if (err)
+ return err;
+ break;
case FLOW_ACTION_MANGLE: {
enum flow_action_mangle_base htype = act->mangle.htype;
__be32 be_mask = (__force __be32) act->mangle.mask;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c
index 9096ffd89e50..fbf714d027d8 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c
@@ -643,7 +643,7 @@ static int mlxsw_sp_trap_policer_bs(u64 burst, u8 *p_burst_size,
{
int bs = fls64(burst) - 1;
- if (burst != (1 << bs)) {
+ if (burst != (BIT_ULL(bs))) {
NL_SET_ERR_MSG_MOD(extack, "Policer burst size is not power of two");
return -EINVAL;
}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index 1a5fc2ae351c..29810a1aa210 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -369,8 +369,8 @@ int qed_sp_eth_vport_start(struct qed_hwfn *p_hwfn,
struct qed_spq_entry *p_ent = NULL;
struct qed_sp_init_data init_data;
u8 abs_vport_id = 0;
- int rc = -EINVAL;
u16 rx_mode = 0;
+ int rc;
rc = qed_fw_vport(p_hwfn, p_params->vport_id, &abs_vport_id);
if (rc)
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
index 1305522f72d6..40efe60eff8d 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
@@ -282,7 +282,6 @@ static int rmnet_changelink(struct net_device *dev, struct nlattr *tb[],
{
struct rmnet_priv *priv = netdev_priv(dev);
struct net_device *real_dev;
- struct rmnet_endpoint *ep;
struct rmnet_port *port;
u16 mux_id;
@@ -297,19 +296,27 @@ static int rmnet_changelink(struct net_device *dev, struct nlattr *tb[],
if (data[IFLA_RMNET_MUX_ID]) {
mux_id = nla_get_u16(data[IFLA_RMNET_MUX_ID]);
- if (rmnet_get_endpoint(port, mux_id)) {
- NL_SET_ERR_MSG_MOD(extack, "MUX ID already exists");
- return -EINVAL;
- }
- ep = rmnet_get_endpoint(port, priv->mux_id);
- if (!ep)
- return -ENODEV;
- hlist_del_init_rcu(&ep->hlnode);
- hlist_add_head_rcu(&ep->hlnode, &port->muxed_ep[mux_id]);
+ if (mux_id != priv->mux_id) {
+ struct rmnet_endpoint *ep;
+
+ ep = rmnet_get_endpoint(port, priv->mux_id);
+ if (!ep)
+ return -ENODEV;
- ep->mux_id = mux_id;
- priv->mux_id = mux_id;
+ if (rmnet_get_endpoint(port, mux_id)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "MUX ID already exists");
+ return -EINVAL;
+ }
+
+ hlist_del_init_rcu(&ep->hlnode);
+ hlist_add_head_rcu(&ep->hlnode,
+ &port->muxed_ep[mux_id]);
+
+ ep->mux_id = mux_id;
+ priv->mux_id = mux_id;
+ }
}
if (data[IFLA_RMNET_FLAGS]) {
diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
index 55cb5730beb6..bf5bf05970a2 100644
--- a/drivers/net/ethernet/realtek/r8169_main.c
+++ b/drivers/net/ethernet/realtek/r8169_main.c
@@ -5441,9 +5441,8 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
netif_napi_add(dev, &tp->napi, rtl8169_poll, NAPI_POLL_WEIGHT);
- dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO |
- NETIF_F_RXCSUM | NETIF_F_HW_VLAN_CTAG_TX |
- NETIF_F_HW_VLAN_CTAG_RX;
+ dev->hw_features = NETIF_F_IP_CSUM | NETIF_F_RXCSUM |
+ NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX;
dev->vlan_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO |
NETIF_F_HIGHDMA;
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
@@ -5460,26 +5459,26 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
/* Disallow toggling */
dev->hw_features &= ~NETIF_F_HW_VLAN_CTAG_RX;
+ if (rtl_chip_supports_csum_v2(tp))
+ dev->hw_features |= NETIF_F_IPV6_CSUM;
+
+ dev->features |= dev->hw_features;
+
+ /* There has been a number of reports that using SG/TSO results in
+ * tx timeouts. However for a lot of people SG/TSO works fine.
+ * Therefore disable both features by default, but allow users to
+ * enable them. Use at own risk!
+ */
if (rtl_chip_supports_csum_v2(tp)) {
- dev->hw_features |= NETIF_F_IPV6_CSUM | NETIF_F_TSO6;
+ dev->hw_features |= NETIF_F_SG | NETIF_F_TSO | NETIF_F_TSO6;
dev->gso_max_size = RTL_GSO_MAX_SIZE_V2;
dev->gso_max_segs = RTL_GSO_MAX_SEGS_V2;
} else {
+ dev->hw_features |= NETIF_F_SG | NETIF_F_TSO;
dev->gso_max_size = RTL_GSO_MAX_SIZE_V1;
dev->gso_max_segs = RTL_GSO_MAX_SEGS_V1;
}
- /* RTL8168e-vl and one RTL8168c variant are known to have a
- * HW issue with TSO.
- */
- if (tp->mac_version == RTL_GIGA_MAC_VER_34 ||
- tp->mac_version == RTL_GIGA_MAC_VER_22) {
- dev->vlan_features &= ~(NETIF_F_ALL_TSO | NETIF_F_SG);
- dev->hw_features &= ~(NETIF_F_ALL_TSO | NETIF_F_SG);
- }
-
- dev->features |= dev->hw_features;
-
dev->hw_features |= NETIF_F_RXALL;
dev->hw_features |= NETIF_F_RXFCS;
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
index 542784300620..efc6ec1b8027 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
@@ -207,7 +207,7 @@ static void dwmac1000_set_filter(struct mac_device_info *hw,
reg++;
}
- while (reg <= perfect_addr_number) {
+ while (reg < perfect_addr_number) {
writel(0, ioaddr + GMAC_ADDR_HIGH(reg));
writel(0, ioaddr + GMAC_ADDR_LOW(reg));
reg++;
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c
index 0e4575f7bedb..ad4df9bddcf3 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c
@@ -577,8 +577,13 @@ static void dwxgmac2_update_vlan_hash(struct mac_device_info *hw, u32 hash,
value |= XGMAC_VLAN_EDVLP;
value |= XGMAC_VLAN_ESVL;
value |= XGMAC_VLAN_DOVLTC;
+ } else {
+ value &= ~XGMAC_VLAN_EDVLP;
+ value &= ~XGMAC_VLAN_ESVL;
+ value &= ~XGMAC_VLAN_DOVLTC;
}
+ value &= ~XGMAC_VLAN_VID;
writel(value, ioaddr + XGMAC_VLAN_TAG);
} else if (perfect_match) {
u32 value = readl(ioaddr + XGMAC_PACKET_FILTER);
@@ -589,13 +594,19 @@ static void dwxgmac2_update_vlan_hash(struct mac_device_info *hw, u32 hash,
value = readl(ioaddr + XGMAC_VLAN_TAG);
+ value &= ~XGMAC_VLAN_VTHM;
value |= XGMAC_VLAN_ETV;
if (is_double) {
value |= XGMAC_VLAN_EDVLP;
value |= XGMAC_VLAN_ESVL;
value |= XGMAC_VLAN_DOVLTC;
+ } else {
+ value &= ~XGMAC_VLAN_EDVLP;
+ value &= ~XGMAC_VLAN_ESVL;
+ value &= ~XGMAC_VLAN_DOVLTC;
}
+ value &= ~XGMAC_VLAN_VID;
writel(value | perfect_match, ioaddr + XGMAC_VLAN_TAG);
} else {
u32 value = readl(ioaddr + XGMAC_PACKET_FILTER);
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 2fb671e61ee8..e6898fd5223f 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -4566,9 +4566,13 @@ static int stmmac_vlan_rx_add_vid(struct net_device *ndev, __be16 proto, u16 vid
return ret;
}
- ret = stmmac_add_hw_vlan_rx_fltr(priv, ndev, priv->hw, proto, vid);
+ if (priv->hw->num_vlan) {
+ ret = stmmac_add_hw_vlan_rx_fltr(priv, ndev, priv->hw, proto, vid);
+ if (ret)
+ return ret;
+ }
- return ret;
+ return 0;
}
static int stmmac_vlan_rx_kill_vid(struct net_device *ndev, __be16 proto, u16 vid)
@@ -4581,9 +4585,12 @@ static int stmmac_vlan_rx_kill_vid(struct net_device *ndev, __be16 proto, u16 vi
is_double = true;
clear_bit(vid, priv->active_vlans);
- ret = stmmac_del_hw_vlan_rx_fltr(priv, ndev, priv->hw, proto, vid);
- if (ret)
- return ret;
+
+ if (priv->hw->num_vlan) {
+ ret = stmmac_del_hw_vlan_rx_fltr(priv, ndev, priv->hw, proto, vid);
+ if (ret)
+ return ret;
+ }
return stmmac_vlan_update(priv, is_double);
}
diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c
index da82d7f16a09..0d580d81d910 100644
--- a/drivers/net/macsec.c
+++ b/drivers/net/macsec.c
@@ -2594,6 +2594,9 @@ static int macsec_upd_offload(struct sk_buff *skb, struct genl_info *info)
return PTR_ERR(dev);
macsec = macsec_priv(dev);
+ if (!tb_offload[MACSEC_OFFLOAD_ATTR_TYPE])
+ return -EINVAL;
+
offload = nla_get_u8(tb_offload[MACSEC_OFFLOAD_ATTR_TYPE]);
if (macsec->offload == offload)
return 0;
diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c
index 481cf48c9b9e..31f731e6df72 100644
--- a/drivers/net/phy/at803x.c
+++ b/drivers/net/phy/at803x.c
@@ -425,8 +425,8 @@ static int at803x_parse_dt(struct phy_device *phydev)
*/
if (at803x_match_phy_id(phydev, ATH8030_PHY_ID) ||
at803x_match_phy_id(phydev, ATH8035_PHY_ID)) {
- priv->clk_25m_reg &= ~AT8035_CLK_OUT_MASK;
- priv->clk_25m_mask &= ~AT8035_CLK_OUT_MASK;
+ priv->clk_25m_reg &= AT8035_CLK_OUT_MASK;
+ priv->clk_25m_mask &= AT8035_CLK_OUT_MASK;
}
}
diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index 2ec19e5540bf..05d20343b816 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -25,6 +25,7 @@
#include <linux/micrel_phy.h>
#include <linux/of.h>
#include <linux/clk.h>
+#include <linux/delay.h>
/* Operation Mode Strap Override */
#define MII_KSZPHY_OMSO 0x16
@@ -952,6 +953,12 @@ static int kszphy_resume(struct phy_device *phydev)
genphy_resume(phydev);
+ /* After switching from power-down to normal mode, an internal global
+ * reset is automatically generated. Wait a minimum of 1 ms before
+ * read/write access to the PHY registers.
+ */
+ usleep_range(1000, 2000);
+
ret = kszphy_config_reset(phydev);
if (ret)
return ret;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 228fe449dc6d..07476c6510f2 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1678,8 +1678,12 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
alloc_frag->offset += buflen;
}
err = tun_xdp_act(tun, xdp_prog, &xdp, act);
- if (err < 0)
- goto err_xdp;
+ if (err < 0) {
+ if (act == XDP_REDIRECT || act == XDP_TX)
+ put_page(alloc_frag->page);
+ goto out;
+ }
+
if (err == XDP_REDIRECT)
xdp_do_flush();
if (err != XDP_PASS)
@@ -1693,8 +1697,6 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad);
-err_xdp:
- put_page(alloc_frag->page);
out:
rcu_read_unlock();
local_bh_enable();
diff --git a/drivers/net/usb/pegasus.c b/drivers/net/usb/pegasus.c
index 8783e2ab3ec0..0ef7e1f443e3 100644
--- a/drivers/net/usb/pegasus.c
+++ b/drivers/net/usb/pegasus.c
@@ -54,6 +54,7 @@ static const char driver_name[] = "pegasus";
#undef PEGASUS_WRITE_EEPROM
#define BMSR_MEDIA (BMSR_10HALF | BMSR_10FULL | BMSR_100HALF | \
BMSR_100FULL | BMSR_ANEGCAPABLE)
+#define CARRIER_CHECK_DELAY (2 * HZ)
static bool loopback;
static bool mii_mode;
@@ -1089,17 +1090,12 @@ static inline void setup_pegasus_II(pegasus_t *pegasus)
set_register(pegasus, Reg81, 2);
}
-
-static int pegasus_count;
-static struct workqueue_struct *pegasus_workqueue;
-#define CARRIER_CHECK_DELAY (2 * HZ)
-
static void check_carrier(struct work_struct *work)
{
pegasus_t *pegasus = container_of(work, pegasus_t, carrier_check.work);
set_carrier(pegasus->net);
if (!(pegasus->flags & PEGASUS_UNPLUG)) {
- queue_delayed_work(pegasus_workqueue, &pegasus->carrier_check,
+ queue_delayed_work(system_long_wq, &pegasus->carrier_check,
CARRIER_CHECK_DELAY);
}
}
@@ -1120,18 +1116,6 @@ static int pegasus_blacklisted(struct usb_device *udev)
return 0;
}
-/* we rely on probe() and remove() being serialized so we
- * don't need extra locking on pegasus_count.
- */
-static void pegasus_dec_workqueue(void)
-{
- pegasus_count--;
- if (pegasus_count == 0) {
- destroy_workqueue(pegasus_workqueue);
- pegasus_workqueue = NULL;
- }
-}
-
static int pegasus_probe(struct usb_interface *intf,
const struct usb_device_id *id)
{
@@ -1144,14 +1128,6 @@ static int pegasus_probe(struct usb_interface *intf,
if (pegasus_blacklisted(dev))
return -ENODEV;
- if (pegasus_count == 0) {
- pegasus_workqueue = alloc_workqueue("pegasus", WQ_MEM_RECLAIM,
- 0);
- if (!pegasus_workqueue)
- return -ENOMEM;
- }
- pegasus_count++;
-
net = alloc_etherdev(sizeof(struct pegasus));
if (!net)
goto out;
@@ -1209,7 +1185,7 @@ static int pegasus_probe(struct usb_interface *intf,
res = register_netdev(net);
if (res)
goto out3;
- queue_delayed_work(pegasus_workqueue, &pegasus->carrier_check,
+ queue_delayed_work(system_long_wq, &pegasus->carrier_check,
CARRIER_CHECK_DELAY);
dev_info(&intf->dev, "%s, %s, %pM\n", net->name,
usb_dev_id[dev_index].name, net->dev_addr);
@@ -1222,7 +1198,6 @@ out2:
out1:
free_netdev(net);
out:
- pegasus_dec_workqueue();
return res;
}
@@ -1237,7 +1212,7 @@ static void pegasus_disconnect(struct usb_interface *intf)
}
pegasus->flags |= PEGASUS_UNPLUG;
- cancel_delayed_work(&pegasus->carrier_check);
+ cancel_delayed_work_sync(&pegasus->carrier_check);
unregister_netdev(pegasus->net);
unlink_all_urbs(pegasus);
free_all_urbs(pegasus);
@@ -1246,7 +1221,6 @@ static void pegasus_disconnect(struct usb_interface *intf)
pegasus->rx_skb = NULL;
}
free_netdev(pegasus->net);
- pegasus_dec_workqueue();
}
static int pegasus_suspend(struct usb_interface *intf, pm_message_t message)
@@ -1254,7 +1228,7 @@ static int pegasus_suspend(struct usb_interface *intf, pm_message_t message)
struct pegasus *pegasus = usb_get_intfdata(intf);
netif_device_detach(pegasus->net);
- cancel_delayed_work(&pegasus->carrier_check);
+ cancel_delayed_work_sync(&pegasus->carrier_check);
if (netif_running(pegasus->net)) {
usb_kill_urb(pegasus->rx_urb);
usb_kill_urb(pegasus->intr_urb);
@@ -1276,7 +1250,7 @@ static int pegasus_resume(struct usb_interface *intf)
pegasus->intr_urb->actual_length = 0;
intr_callback(pegasus->intr_urb);
}
- queue_delayed_work(pegasus_workqueue, &pegasus->carrier_check,
+ queue_delayed_work(system_long_wq, &pegasus->carrier_check,
CARRIER_CHECK_DELAY);
return 0;
}
diff --git a/drivers/net/wimax/i2400m/driver.c b/drivers/net/wimax/i2400m/driver.c
index f66c0f8f6f4a..ecb3fccca603 100644
--- a/drivers/net/wimax/i2400m/driver.c
+++ b/drivers/net/wimax/i2400m/driver.c
@@ -740,9 +740,6 @@ EXPORT_SYMBOL_GPL(i2400m_error_recovery);
static
int i2400m_bm_buf_alloc(struct i2400m *i2400m)
{
- int result;
-
- result = -ENOMEM;
i2400m->bm_cmd_buf = kzalloc(I2400M_BM_CMD_BUF_SIZE, GFP_KERNEL);
if (i2400m->bm_cmd_buf == NULL)
goto error_bm_cmd_kzalloc;
@@ -754,7 +751,7 @@ int i2400m_bm_buf_alloc(struct i2400m *i2400m)
error_bm_ack_buf_kzalloc:
kfree(i2400m->bm_cmd_buf);
error_bm_cmd_kzalloc:
- return result;
+ return -ENOMEM;
}
@@ -843,7 +840,7 @@ EXPORT_SYMBOL_GPL(i2400m_reset);
*/
int i2400m_setup(struct i2400m *i2400m, enum i2400m_bri bm_flags)
{
- int result = -ENODEV;
+ int result;
struct device *dev = i2400m_dev(i2400m);
struct wimax_dev *wimax_dev = &i2400m->wimax_dev;
struct net_device *net_dev = i2400m->wimax_dev.net_dev;
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index a8b515968569..09087c38fabd 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -1042,8 +1042,10 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
return -EFAULT;
}
- if (!desc || (desc->out_num + desc->in_num == 0) ||
- !test_bit(cmd, &cmd_mask))
+ if (!desc ||
+ (desc->out_num + desc->in_num == 0) ||
+ cmd > ND_CMD_CALL ||
+ !test_bit(cmd, &cmd_mask))
return -ENOTTY;
/* fail write commands (when read-only) */
diff --git a/drivers/nvdimm/dimm.c b/drivers/nvdimm/dimm.c
index 64776ed15bb3..7d4ddc4d9322 100644
--- a/drivers/nvdimm/dimm.c
+++ b/drivers/nvdimm/dimm.c
@@ -99,7 +99,7 @@ static int nvdimm_probe(struct device *dev)
if (ndd->ns_current >= 0) {
rc = nd_label_reserve_dpa(ndd);
if (rc == 0)
- nvdimm_set_aliasing(dev);
+ nvdimm_set_labeling(dev);
}
nvdimm_bus_unlock(dev);
diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c
index 94ea6dba6b4f..b7b77e8d9027 100644
--- a/drivers/nvdimm/dimm_devs.c
+++ b/drivers/nvdimm/dimm_devs.c
@@ -32,7 +32,7 @@ int nvdimm_check_config_data(struct device *dev)
if (!nvdimm->cmd_mask ||
!test_bit(ND_CMD_GET_CONFIG_DATA, &nvdimm->cmd_mask)) {
- if (test_bit(NDD_ALIASING, &nvdimm->flags))
+ if (test_bit(NDD_LABELING, &nvdimm->flags))
return -ENXIO;
else
return -ENOTTY;
@@ -173,11 +173,11 @@ int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset,
return rc;
}
-void nvdimm_set_aliasing(struct device *dev)
+void nvdimm_set_labeling(struct device *dev)
{
struct nvdimm *nvdimm = to_nvdimm(dev);
- set_bit(NDD_ALIASING, &nvdimm->flags);
+ set_bit(NDD_LABELING, &nvdimm->flags);
}
void nvdimm_set_locked(struct device *dev)
@@ -312,8 +312,9 @@ static ssize_t flags_show(struct device *dev,
{
struct nvdimm *nvdimm = to_nvdimm(dev);
- return sprintf(buf, "%s%s\n",
+ return sprintf(buf, "%s%s%s\n",
test_bit(NDD_ALIASING, &nvdimm->flags) ? "alias " : "",
+ test_bit(NDD_LABELING, &nvdimm->flags) ? "label " : "",
test_bit(NDD_LOCKED, &nvdimm->flags) ? "lock " : "");
}
static DEVICE_ATTR_RO(flags);
@@ -562,6 +563,21 @@ int nvdimm_security_freeze(struct nvdimm *nvdimm)
return rc;
}
+static unsigned long dpa_align(struct nd_region *nd_region)
+{
+ struct device *dev = &nd_region->dev;
+
+ if (dev_WARN_ONCE(dev, !is_nvdimm_bus_locked(dev),
+ "bus lock required for capacity provision\n"))
+ return 0;
+ if (dev_WARN_ONCE(dev, !nd_region->ndr_mappings || nd_region->align
+ % nd_region->ndr_mappings,
+ "invalid region align %#lx mappings: %d\n",
+ nd_region->align, nd_region->ndr_mappings))
+ return 0;
+ return nd_region->align / nd_region->ndr_mappings;
+}
+
int alias_dpa_busy(struct device *dev, void *data)
{
resource_size_t map_end, blk_start, new;
@@ -570,6 +586,7 @@ int alias_dpa_busy(struct device *dev, void *data)
struct nd_region *nd_region;
struct nvdimm_drvdata *ndd;
struct resource *res;
+ unsigned long align;
int i;
if (!is_memory(dev))
@@ -607,13 +624,21 @@ int alias_dpa_busy(struct device *dev, void *data)
* Find the free dpa from the end of the last pmem allocation to
* the end of the interleave-set mapping.
*/
+ align = dpa_align(nd_region);
+ if (!align)
+ return 0;
+
for_each_dpa_resource(ndd, res) {
+ resource_size_t start, end;
+
if (strncmp(res->name, "pmem", 4) != 0)
continue;
- if ((res->start >= blk_start && res->start < map_end)
- || (res->end >= blk_start
- && res->end <= map_end)) {
- new = max(blk_start, min(map_end + 1, res->end + 1));
+
+ start = ALIGN_DOWN(res->start, align);
+ end = ALIGN(res->end + 1, align) - 1;
+ if ((start >= blk_start && start < map_end)
+ || (end >= blk_start && end <= map_end)) {
+ new = max(blk_start, min(map_end, end) + 1);
if (new != blk_start) {
blk_start = new;
goto retry;
@@ -653,6 +678,7 @@ resource_size_t nd_blk_available_dpa(struct nd_region *nd_region)
.res = NULL,
};
struct resource *res;
+ unsigned long align;
if (!ndd)
return 0;
@@ -660,10 +686,20 @@ resource_size_t nd_blk_available_dpa(struct nd_region *nd_region)
device_for_each_child(&nvdimm_bus->dev, &info, alias_dpa_busy);
/* now account for busy blk allocations in unaliased dpa */
+ align = dpa_align(nd_region);
+ if (!align)
+ return 0;
for_each_dpa_resource(ndd, res) {
+ resource_size_t start, end, size;
+
if (strncmp(res->name, "blk", 3) != 0)
continue;
- info.available -= resource_size(res);
+ start = ALIGN_DOWN(res->start, align);
+ end = ALIGN(res->end + 1, align) - 1;
+ size = end - start + 1;
+ if (size >= info.available)
+ return 0;
+ info.available -= size;
}
return info.available;
@@ -682,19 +718,31 @@ resource_size_t nd_pmem_max_contiguous_dpa(struct nd_region *nd_region,
struct nvdimm_bus *nvdimm_bus;
resource_size_t max = 0;
struct resource *res;
+ unsigned long align;
/* if a dimm is disabled the available capacity is zero */
if (!ndd)
return 0;
+ align = dpa_align(nd_region);
+ if (!align)
+ return 0;
+
nvdimm_bus = walk_to_nvdimm_bus(ndd->dev);
if (__reserve_free_pmem(&nd_region->dev, nd_mapping->nvdimm))
return 0;
for_each_dpa_resource(ndd, res) {
+ resource_size_t start, end;
+
if (strcmp(res->name, "pmem-reserve") != 0)
continue;
- if (resource_size(res) > max)
- max = resource_size(res);
+ /* trim free space relative to current alignment setting */
+ start = ALIGN(res->start, align);
+ end = ALIGN_DOWN(res->end + 1, align) - 1;
+ if (end < start)
+ continue;
+ if (end - start + 1 > max)
+ max = end - start + 1;
}
release_free_pmem(nvdimm_bus, nd_mapping);
return max;
@@ -722,24 +770,33 @@ resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region,
struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
struct resource *res;
const char *reason;
+ unsigned long align;
if (!ndd)
return 0;
+ align = dpa_align(nd_region);
+ if (!align)
+ return 0;
+
map_start = nd_mapping->start;
map_end = map_start + nd_mapping->size - 1;
blk_start = max(map_start, map_end + 1 - *overlap);
for_each_dpa_resource(ndd, res) {
- if (res->start >= map_start && res->start < map_end) {
+ resource_size_t start, end;
+
+ start = ALIGN_DOWN(res->start, align);
+ end = ALIGN(res->end + 1, align) - 1;
+ if (start >= map_start && start < map_end) {
if (strncmp(res->name, "blk", 3) == 0)
blk_start = min(blk_start,
- max(map_start, res->start));
- else if (res->end > map_end) {
+ max(map_start, start));
+ else if (end > map_end) {
reason = "misaligned to iset";
goto err;
} else
- busy += resource_size(res);
- } else if (res->end >= map_start && res->end <= map_end) {
+ busy += end - start + 1;
+ } else if (end >= map_start && end <= map_end) {
if (strncmp(res->name, "blk", 3) == 0) {
/*
* If a BLK allocation overlaps the start of
@@ -748,8 +805,8 @@ resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region,
*/
blk_start = map_start;
} else
- busy += resource_size(res);
- } else if (map_start > res->start && map_start < res->end) {
+ busy += end - start + 1;
+ } else if (map_start > start && map_start < end) {
/* total eclipse of the mapping */
busy += nd_mapping->size;
blk_start = map_start;
@@ -759,7 +816,7 @@ resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region,
*overlap = map_end + 1 - blk_start;
available = blk_start - map_start;
if (busy < available)
- return available - busy;
+ return ALIGN_DOWN(available - busy, align);
return 0;
err:
diff --git a/drivers/nvdimm/e820.c b/drivers/nvdimm/e820.c
index e02f60ad6c99..4cd18be9d0e9 100644
--- a/drivers/nvdimm/e820.c
+++ b/drivers/nvdimm/e820.c
@@ -7,6 +7,7 @@
#include <linux/memory_hotplug.h>
#include <linux/libnvdimm.h>
#include <linux/module.h>
+#include <linux/numa.h>
static int e820_pmem_remove(struct platform_device *pdev)
{
@@ -16,27 +17,16 @@ static int e820_pmem_remove(struct platform_device *pdev)
return 0;
}
-#ifdef CONFIG_MEMORY_HOTPLUG
-static int e820_range_to_nid(resource_size_t addr)
-{
- return memory_add_physaddr_to_nid(addr);
-}
-#else
-static int e820_range_to_nid(resource_size_t addr)
-{
- return NUMA_NO_NODE;
-}
-#endif
-
static int e820_register_one(struct resource *res, void *data)
{
struct nd_region_desc ndr_desc;
struct nvdimm_bus *nvdimm_bus = data;
+ int nid = phys_to_target_node(res->start);
memset(&ndr_desc, 0, sizeof(ndr_desc));
ndr_desc.res = res;
- ndr_desc.numa_node = e820_range_to_nid(res->start);
- ndr_desc.target_node = ndr_desc.numa_node;
+ ndr_desc.numa_node = numa_map_to_online_node(nid);
+ ndr_desc.target_node = nid;
set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc))
return -ENXIO;
diff --git a/drivers/nvdimm/label.h b/drivers/nvdimm/label.h
index 4c7b775c2811..956b6d1bd8cc 100644
--- a/drivers/nvdimm/label.h
+++ b/drivers/nvdimm/label.h
@@ -62,7 +62,7 @@ struct nd_namespace_index {
__le16 major;
__le16 minor;
__le64 checksum;
- u8 free[0];
+ u8 free[];
};
/**
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index 032dc61725ff..ae155e860fdc 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -10,6 +10,7 @@
#include <linux/nd.h>
#include "nd-core.h"
#include "pmem.h"
+#include "pfn.h"
#include "nd.h"
static void namespace_io_release(struct device *dev)
@@ -541,6 +542,11 @@ static void space_valid(struct nd_region *nd_region, struct nvdimm_drvdata *ndd,
{
bool is_reserve = strcmp(label_id->id, "pmem-reserve") == 0;
bool is_pmem = strncmp(label_id->id, "pmem", 4) == 0;
+ unsigned long align;
+
+ align = nd_region->align / nd_region->ndr_mappings;
+ valid->start = ALIGN(valid->start, align);
+ valid->end = ALIGN_DOWN(valid->end + 1, align) - 1;
if (valid->start >= valid->end)
goto invalid;
@@ -980,10 +986,10 @@ static ssize_t __size_store(struct device *dev, unsigned long long val)
return -ENXIO;
}
- div_u64_rem(val, PAGE_SIZE * nd_region->ndr_mappings, &remainder);
+ div_u64_rem(val, nd_region->align, &remainder);
if (remainder) {
dev_dbg(dev, "%llu is not %ldK aligned\n", val,
- (PAGE_SIZE * nd_region->ndr_mappings) / SZ_1K);
+ nd_region->align / SZ_1K);
return -EINVAL;
}
@@ -1739,6 +1745,22 @@ struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev)
return ERR_PTR(-ENODEV);
}
+ /*
+ * Note, alignment validation for fsdax and devdax mode
+ * namespaces happens in nd_pfn_validate() where infoblock
+ * padding parameters can be applied.
+ */
+ if (pmem_should_map_pages(dev)) {
+ struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
+ struct resource *res = &nsio->res;
+
+ if (!IS_ALIGNED(res->start | (res->end + 1),
+ memremap_compat_align())) {
+ dev_err(&ndns->dev, "%pr misaligned, unable to map\n", res);
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+ }
+
if (is_namespace_pmem(&ndns->dev)) {
struct nd_namespace_pmem *nspm;
@@ -2521,7 +2543,7 @@ static int init_active_labels(struct nd_region *nd_region)
if (!ndd) {
if (test_bit(NDD_LOCKED, &nvdimm->flags))
/* fail, label data may be unreadable */;
- else if (test_bit(NDD_ALIASING, &nvdimm->flags))
+ else if (test_bit(NDD_LABELING, &nvdimm->flags))
/* fail, labels needed to disambiguate dpa */;
else
return 0;
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index c9f6a5b5253a..85dbb2a322b9 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -39,7 +39,7 @@ struct nd_region_data {
int ns_count;
int ns_active;
unsigned int hints_shift;
- void __iomem *flush_wpq[0];
+ void __iomem *flush_wpq[];
};
static inline void __iomem *ndrd_get_flush_wpq(struct nd_region_data *ndrd,
@@ -146,6 +146,7 @@ struct nd_region {
struct device *btt_seed;
struct device *pfn_seed;
struct device *dax_seed;
+ unsigned long align;
u16 ndr_mappings;
u64 ndr_size;
u64 ndr_start;
@@ -156,7 +157,7 @@ struct nd_region {
struct nd_interleave_set *nd_set;
struct nd_percpu_lane __percpu *lane;
int (*flush)(struct nd_region *nd_region, struct bio *bio);
- struct nd_mapping mapping[0];
+ struct nd_mapping mapping[];
};
struct nd_blk_region {
@@ -252,7 +253,7 @@ int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset,
void *buf, size_t len);
long nvdimm_clear_poison(struct device *dev, phys_addr_t phys,
unsigned int len);
-void nvdimm_set_aliasing(struct device *dev);
+void nvdimm_set_labeling(struct device *dev);
void nvdimm_set_locked(struct device *dev);
void nvdimm_clear_locked(struct device *dev);
int nvdimm_security_setup_events(struct device *dev);
diff --git a/drivers/nvdimm/of_pmem.c b/drivers/nvdimm/of_pmem.c
index 8224d1431ea9..6826a274a1f1 100644
--- a/drivers/nvdimm/of_pmem.c
+++ b/drivers/nvdimm/of_pmem.c
@@ -62,8 +62,10 @@ static int of_pmem_region_probe(struct platform_device *pdev)
if (is_volatile)
region = nvdimm_volatile_region_create(bus, &ndr_desc);
- else
+ else {
+ set_bit(ND_REGION_PERSIST_MEMCTRL, &ndr_desc.flags);
region = nvdimm_pmem_region_create(bus, &ndr_desc);
+ }
if (!region)
dev_warn(&pdev->dev, "Unable to register region %pR from %pOF\n",
diff --git a/drivers/nvdimm/pfn.h b/drivers/nvdimm/pfn.h
index acb19517f678..37cb1b8a2a39 100644
--- a/drivers/nvdimm/pfn.h
+++ b/drivers/nvdimm/pfn.h
@@ -24,6 +24,18 @@ struct nd_pfn_sb {
__le64 npfns;
__le32 mode;
/* minor-version-1 additions for section alignment */
+ /**
+ * @start_pad: Deprecated attribute to pad start-misaligned namespaces
+ *
+ * start_pad is deprecated because the original definition did
+ * not comprehend that dataoff is relative to the base address
+ * of the namespace not the start_pad adjusted base. The result
+ * is that the dax path is broken, but the block-I/O path is
+ * not. The kernel will no longer create namespaces using start
+ * padding, but it still supports block-I/O for legacy
+ * configurations mainly to allow a backup, reconfigure the
+ * namespace, and restore flow to repair dax operation.
+ */
__le32 start_pad;
__le32 end_trunc;
/* minor-version-2 record the base alignment of the mapping */
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index b94f7a7e94b8..34db557dbad1 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -446,6 +446,7 @@ static bool nd_supported_alignment(unsigned long align)
int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
{
u64 checksum, offset;
+ struct resource *res;
enum nd_pfn_mode mode;
struct nd_namespace_io *nsio;
unsigned long align, start_pad;
@@ -561,14 +562,14 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
dev_dbg(&nd_pfn->dev, "align: %lx:%lx mode: %d:%d\n",
nd_pfn->align, align, nd_pfn->mode,
mode);
- return -EINVAL;
+ return -EOPNOTSUPP;
}
}
if (align > nvdimm_namespace_capacity(ndns)) {
dev_err(&nd_pfn->dev, "alignment: %lx exceeds capacity %llx\n",
align, nvdimm_namespace_capacity(ndns));
- return -EINVAL;
+ return -EOPNOTSUPP;
}
/*
@@ -578,18 +579,31 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
* established.
*/
nsio = to_nd_namespace_io(&ndns->dev);
- if (offset >= resource_size(&nsio->res)) {
+ res = &nsio->res;
+ if (offset >= resource_size(res)) {
dev_err(&nd_pfn->dev, "pfn array size exceeds capacity of %s\n",
dev_name(&ndns->dev));
- return -EBUSY;
+ return -EOPNOTSUPP;
}
- if ((align && !IS_ALIGNED(nsio->res.start + offset + start_pad, align))
+ if ((align && !IS_ALIGNED(res->start + offset + start_pad, align))
|| !IS_ALIGNED(offset, PAGE_SIZE)) {
dev_err(&nd_pfn->dev,
"bad offset: %#llx dax disabled align: %#lx\n",
offset, align);
- return -ENXIO;
+ return -EOPNOTSUPP;
+ }
+
+ if (!IS_ALIGNED(res->start + le32_to_cpu(pfn_sb->start_pad),
+ memremap_compat_align())) {
+ dev_err(&nd_pfn->dev, "resource start misaligned\n");
+ return -EOPNOTSUPP;
+ }
+
+ if (!IS_ALIGNED(res->end + 1 - le32_to_cpu(pfn_sb->end_trunc),
+ memremap_compat_align())) {
+ dev_err(&nd_pfn->dev, "resource end misaligned\n");
+ return -EOPNOTSUPP;
}
return 0;
@@ -750,7 +764,19 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
start = nsio->res.start;
size = resource_size(&nsio->res);
npfns = PHYS_PFN(size - SZ_8K);
- align = max(nd_pfn->align, (1UL << SUBSECTION_SHIFT));
+ align = max(nd_pfn->align, memremap_compat_align());
+
+ /*
+ * When @start is misaligned fail namespace creation. See
+ * the 'struct nd_pfn_sb' commentary on why ->start_pad is not
+ * an option.
+ */
+ if (!IS_ALIGNED(start, memremap_compat_align())) {
+ dev_err(&nd_pfn->dev, "%s: start %pa misaligned to %#lx\n",
+ dev_name(&ndns->dev), &start,
+ memremap_compat_align());
+ return -EINVAL;
+ }
end_trunc = start + size - ALIGN_DOWN(start + size, align);
if (nd_pfn->mode == PFN_MODE_PMEM) {
/*
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 4ffc6f7ca131..2df6994acf83 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -136,9 +136,25 @@ static blk_status_t read_pmem(struct page *page, unsigned int off,
return BLK_STS_OK;
}
-static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page,
- unsigned int len, unsigned int off, unsigned int op,
- sector_t sector)
+static blk_status_t pmem_do_read(struct pmem_device *pmem,
+ struct page *page, unsigned int page_off,
+ sector_t sector, unsigned int len)
+{
+ blk_status_t rc;
+ phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
+ void *pmem_addr = pmem->virt_addr + pmem_off;
+
+ if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
+ return BLK_STS_IOERR;
+
+ rc = read_pmem(page, page_off, pmem_addr, len);
+ flush_dcache_page(page);
+ return rc;
+}
+
+static blk_status_t pmem_do_write(struct pmem_device *pmem,
+ struct page *page, unsigned int page_off,
+ sector_t sector, unsigned int len)
{
blk_status_t rc = BLK_STS_OK;
bool bad_pmem = false;
@@ -148,34 +164,25 @@ static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page,
if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
bad_pmem = true;
- if (!op_is_write(op)) {
- if (unlikely(bad_pmem))
- rc = BLK_STS_IOERR;
- else {
- rc = read_pmem(page, off, pmem_addr, len);
- flush_dcache_page(page);
- }
- } else {
- /*
- * Note that we write the data both before and after
- * clearing poison. The write before clear poison
- * handles situations where the latest written data is
- * preserved and the clear poison operation simply marks
- * the address range as valid without changing the data.
- * In this case application software can assume that an
- * interrupted write will either return the new good
- * data or an error.
- *
- * However, if pmem_clear_poison() leaves the data in an
- * indeterminate state we need to perform the write
- * after clear poison.
- */
- flush_dcache_page(page);
- write_pmem(pmem_addr, page, off, len);
- if (unlikely(bad_pmem)) {
- rc = pmem_clear_poison(pmem, pmem_off, len);
- write_pmem(pmem_addr, page, off, len);
- }
+ /*
+ * Note that we write the data both before and after
+ * clearing poison. The write before clear poison
+ * handles situations where the latest written data is
+ * preserved and the clear poison operation simply marks
+ * the address range as valid without changing the data.
+ * In this case application software can assume that an
+ * interrupted write will either return the new good
+ * data or an error.
+ *
+ * However, if pmem_clear_poison() leaves the data in an
+ * indeterminate state we need to perform the write
+ * after clear poison.
+ */
+ flush_dcache_page(page);
+ write_pmem(pmem_addr, page, page_off, len);
+ if (unlikely(bad_pmem)) {
+ rc = pmem_clear_poison(pmem, pmem_off, len);
+ write_pmem(pmem_addr, page, page_off, len);
}
return rc;
@@ -197,8 +204,12 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
do_acct = nd_iostat_start(bio, &start);
bio_for_each_segment(bvec, bio, iter) {
- rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
- bvec.bv_offset, bio_op(bio), iter.bi_sector);
+ if (op_is_write(bio_op(bio)))
+ rc = pmem_do_write(pmem, bvec.bv_page, bvec.bv_offset,
+ iter.bi_sector, bvec.bv_len);
+ else
+ rc = pmem_do_read(pmem, bvec.bv_page, bvec.bv_offset,
+ iter.bi_sector, bvec.bv_len);
if (rc) {
bio->bi_status = rc;
break;
@@ -223,9 +234,12 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
struct pmem_device *pmem = bdev->bd_queue->queuedata;
blk_status_t rc;
- rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE,
- 0, op, sector);
-
+ if (op_is_write(op))
+ rc = pmem_do_write(pmem, page, 0, sector,
+ hpage_nr_pages(page) * PAGE_SIZE);
+ else
+ rc = pmem_do_read(pmem, page, 0, sector,
+ hpage_nr_pages(page) * PAGE_SIZE);
/*
* The ->rw_page interface is subtle and tricky. The core
* retries on any error, so we can only invoke page_endio() in
@@ -268,6 +282,16 @@ static const struct block_device_operations pmem_fops = {
.revalidate_disk = nvdimm_revalidate_disk,
};
+static int pmem_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
+ size_t nr_pages)
+{
+ struct pmem_device *pmem = dax_get_private(dax_dev);
+
+ return blk_status_to_errno(pmem_do_write(pmem, ZERO_PAGE(0), 0,
+ PFN_PHYS(pgoff) >> SECTOR_SHIFT,
+ PAGE_SIZE));
+}
+
static long pmem_dax_direct_access(struct dax_device *dax_dev,
pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn)
{
@@ -299,6 +323,7 @@ static const struct dax_operations pmem_dax_ops = {
.dax_supported = generic_fsdax_supported,
.copy_from_iter = pmem_copy_from_iter,
.copy_to_iter = pmem_copy_to_iter,
+ .zero_page_range = pmem_dax_zero_page_range,
};
static const struct attribute_group *pmem_attribute_groups[] = {
@@ -461,9 +486,9 @@ static int pmem_attach_disk(struct device *dev,
if (is_nvdimm_sync(nd_region))
flags = DAXDEV_F_SYNC;
dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops, flags);
- if (!dax_dev) {
+ if (IS_ERR(dax_dev)) {
put_disk(disk);
- return -ENOMEM;
+ return PTR_ERR(dax_dev);
}
dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
pmem->dax_dev = dax_dev;
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index a19e535830d9..ccbb5b43b8b2 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -195,16 +195,16 @@ EXPORT_SYMBOL_GPL(nd_blk_region_set_provider_data);
int nd_region_to_nstype(struct nd_region *nd_region)
{
if (is_memory(&nd_region->dev)) {
- u16 i, alias;
+ u16 i, label;
- for (i = 0, alias = 0; i < nd_region->ndr_mappings; i++) {
+ for (i = 0, label = 0; i < nd_region->ndr_mappings; i++) {
struct nd_mapping *nd_mapping = &nd_region->mapping[i];
struct nvdimm *nvdimm = nd_mapping->nvdimm;
- if (test_bit(NDD_ALIASING, &nvdimm->flags))
- alias++;
+ if (test_bit(NDD_LABELING, &nvdimm->flags))
+ label++;
}
- if (alias)
+ if (label)
return ND_DEVICE_NAMESPACE_PMEM;
else
return ND_DEVICE_NAMESPACE_IO;
@@ -216,21 +216,25 @@ int nd_region_to_nstype(struct nd_region *nd_region)
}
EXPORT_SYMBOL(nd_region_to_nstype);
-static ssize_t size_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+static unsigned long long region_size(struct nd_region *nd_region)
{
- struct nd_region *nd_region = to_nd_region(dev);
- unsigned long long size = 0;
-
- if (is_memory(dev)) {
- size = nd_region->ndr_size;
+ if (is_memory(&nd_region->dev)) {
+ return nd_region->ndr_size;
} else if (nd_region->ndr_mappings == 1) {
struct nd_mapping *nd_mapping = &nd_region->mapping[0];
- size = nd_mapping->size;
+ return nd_mapping->size;
}
- return sprintf(buf, "%llu\n", size);
+ return 0;
+}
+
+static ssize_t size_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+
+ return sprintf(buf, "%llu\n", region_size(nd_region));
}
static DEVICE_ATTR_RO(size);
@@ -529,6 +533,54 @@ static ssize_t read_only_store(struct device *dev,
}
static DEVICE_ATTR_RW(read_only);
+static ssize_t align_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+
+ return sprintf(buf, "%#lx\n", nd_region->align);
+}
+
+static ssize_t align_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+ unsigned long val, dpa;
+ u32 remainder;
+ int rc;
+
+ rc = kstrtoul(buf, 0, &val);
+ if (rc)
+ return rc;
+
+ if (!nd_region->ndr_mappings)
+ return -ENXIO;
+
+ /*
+ * Ensure space-align is evenly divisible by the region
+ * interleave-width because the kernel typically has no facility
+ * to determine which DIMM(s), dimm-physical-addresses, would
+ * contribute to the tail capacity in system-physical-address
+ * space for the namespace.
+ */
+ dpa = div_u64_rem(val, nd_region->ndr_mappings, &remainder);
+ if (!is_power_of_2(dpa) || dpa < PAGE_SIZE
+ || val > region_size(nd_region) || remainder)
+ return -EINVAL;
+
+ /*
+ * Given that space allocation consults this value multiple
+ * times ensure it does not change for the duration of the
+ * allocation.
+ */
+ nvdimm_bus_lock(dev);
+ nd_region->align = val;
+ nvdimm_bus_unlock(dev);
+
+ return len;
+}
+static DEVICE_ATTR_RW(align);
+
static ssize_t region_badblocks_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -571,6 +623,7 @@ static DEVICE_ATTR_RO(persistence_domain);
static struct attribute *nd_region_attributes[] = {
&dev_attr_size.attr,
+ &dev_attr_align.attr,
&dev_attr_nstype.attr,
&dev_attr_mappings.attr,
&dev_attr_btt_seed.attr,
@@ -626,6 +679,19 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n)
return a->mode;
}
+ if (a == &dev_attr_align.attr) {
+ int i;
+
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ struct nvdimm *nvdimm = nd_mapping->nvdimm;
+
+ if (test_bit(NDD_LABELING, &nvdimm->flags))
+ return a->mode;
+ }
+ return 0;
+ }
+
if (a != &dev_attr_set_cookie.attr
&& a != &dev_attr_available_size.attr)
return a->mode;
@@ -935,6 +1001,41 @@ void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane)
}
EXPORT_SYMBOL(nd_region_release_lane);
+/*
+ * PowerPC requires this alignment for memremap_pages(). All other archs
+ * should be ok with SUBSECTION_SIZE (see memremap_compat_align()).
+ */
+#define MEMREMAP_COMPAT_ALIGN_MAX SZ_16M
+
+static unsigned long default_align(struct nd_region *nd_region)
+{
+ unsigned long align;
+ int i, mappings;
+ u32 remainder;
+
+ if (is_nd_blk(&nd_region->dev))
+ align = PAGE_SIZE;
+ else
+ align = MEMREMAP_COMPAT_ALIGN_MAX;
+
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ struct nvdimm *nvdimm = nd_mapping->nvdimm;
+
+ if (test_bit(NDD_ALIASING, &nvdimm->flags)) {
+ align = MEMREMAP_COMPAT_ALIGN_MAX;
+ break;
+ }
+ }
+
+ mappings = max_t(u16, 1, nd_region->ndr_mappings);
+ div_u64_rem(align, mappings, &remainder);
+ if (remainder)
+ align *= mappings;
+
+ return align;
+}
+
static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
struct nd_region_desc *ndr_desc,
const struct device_type *dev_type, const char *caller)
@@ -1039,6 +1140,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
dev->of_node = ndr_desc->of_node;
nd_region->ndr_size = resource_size(ndr_desc->res);
nd_region->ndr_start = ndr_desc->res->start;
+ nd_region->align = default_align(nd_region);
if (ndr_desc->flush)
nd_region->flush = ndr_desc->flush;
else
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 4f907e3beda1..91c1bd659947 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -6,6 +6,7 @@
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
+#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/hdreg.h>
@@ -1252,6 +1253,18 @@ static void nvme_enable_aen(struct nvme_ctrl *ctrl)
queue_work(nvme_wq, &ctrl->async_event_work);
}
+/*
+ * Convert integer values from ioctl structures to user pointers, silently
+ * ignoring the upper bits in the compat case to match behaviour of 32-bit
+ * kernels.
+ */
+static void __user *nvme_to_user_ptr(uintptr_t ptrval)
+{
+ if (in_compat_syscall())
+ ptrval = (compat_uptr_t)ptrval;
+ return (void __user *)ptrval;
+}
+
static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
{
struct nvme_user_io io;
@@ -1275,7 +1288,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
length = (io.nblocks + 1) << ns->lba_shift;
meta_len = (io.nblocks + 1) * ns->ms;
- metadata = (void __user *)(uintptr_t)io.metadata;
+ metadata = nvme_to_user_ptr(io.metadata);
if (ns->ext) {
length += meta_len;
@@ -1298,7 +1311,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
c.rw.appmask = cpu_to_le16(io.appmask);
return nvme_submit_user_cmd(ns->queue, &c,
- (void __user *)(uintptr_t)io.addr, length,
+ nvme_to_user_ptr(io.addr), length,
metadata, meta_len, lower_32_bits(io.slba), NULL, 0);
}
@@ -1418,9 +1431,9 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
- (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
- (void __user *)(uintptr_t)cmd.metadata,
- cmd.metadata_len, 0, &result, timeout);
+ nvme_to_user_ptr(cmd.addr), cmd.data_len,
+ nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
+ 0, &result, timeout);
nvme_passthru_end(ctrl, effects);
if (status >= 0) {
@@ -1465,8 +1478,8 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
- (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
- (void __user *)(uintptr_t)cmd.metadata, cmd.metadata_len,
+ nvme_to_user_ptr(cmd.addr), cmd.data_len,
+ nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
0, &cmd.result, timeout);
nvme_passthru_end(ctrl, effects);
@@ -1884,6 +1897,13 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
if (ns->head->disk) {
nvme_update_disk_info(ns->head->disk, ns, id);
blk_queue_stack_limits(ns->head->disk->queue, ns->queue);
+ if (bdi_cap_stable_pages_required(ns->queue->backing_dev_info)) {
+ struct backing_dev_info *info =
+ ns->head->disk->queue->backing_dev_info;
+
+ info->capabilities |= BDI_CAP_STABLE_WRITES;
+ }
+
revalidate_disk(ns->head->disk);
}
#endif
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index a8bf2fb1287b..7dfc4a2ecf1e 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -342,8 +342,7 @@ nvme_fc_register_localport(struct nvme_fc_port_info *pinfo,
!template->ls_req || !template->fcp_io ||
!template->ls_abort || !template->fcp_abort ||
!template->max_hw_queues || !template->max_sgl_segments ||
- !template->max_dif_sgl_segments || !template->dma_boundary ||
- !template->module) {
+ !template->max_dif_sgl_segments || !template->dma_boundary) {
ret = -EINVAL;
goto out_reghost_failed;
}
@@ -2016,7 +2015,6 @@ nvme_fc_ctrl_free(struct kref *ref)
{
struct nvme_fc_ctrl *ctrl =
container_of(ref, struct nvme_fc_ctrl, ref);
- struct nvme_fc_lport *lport = ctrl->lport;
unsigned long flags;
if (ctrl->ctrl.tagset) {
@@ -2043,7 +2041,6 @@ nvme_fc_ctrl_free(struct kref *ref)
if (ctrl->ctrl.opts)
nvmf_free_options(ctrl->ctrl.opts);
kfree(ctrl);
- module_put(lport->ops->module);
}
static void
@@ -3074,15 +3071,10 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
goto out_fail;
}
- if (!try_module_get(lport->ops->module)) {
- ret = -EUNATCH;
- goto out_free_ctrl;
- }
-
idx = ida_simple_get(&nvme_fc_ctrl_cnt, 0, 0, GFP_KERNEL);
if (idx < 0) {
ret = -ENOSPC;
- goto out_mod_put;
+ goto out_free_ctrl;
}
ctrl->ctrl.opts = opts;
@@ -3232,8 +3224,6 @@ out_free_queues:
out_free_ida:
put_device(ctrl->dev);
ida_simple_remove(&nvme_fc_ctrl_cnt, ctrl->cnum);
-out_mod_put:
- module_put(lport->ops->module);
out_free_ctrl:
kfree(ctrl);
out_fail:
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 61bf87592570..54603bd3e02d 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -510,7 +510,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
if (!nr_nsids)
return 0;
- down_write(&ctrl->namespaces_rwsem);
+ down_read(&ctrl->namespaces_rwsem);
list_for_each_entry(ns, &ctrl->namespaces, list) {
unsigned nsid = le32_to_cpu(desc->nsids[n]);
@@ -521,7 +521,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
if (++n == nr_nsids)
break;
}
- up_write(&ctrl->namespaces_rwsem);
+ up_read(&ctrl->namespaces_rwsem);
return 0;
}
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 76dbb55625ac..cac8a930396a 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1342,7 +1342,7 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
int ret;
sge->addr = qe->dma;
- sge->length = sizeof(struct nvme_command),
+ sge->length = sizeof(struct nvme_command);
sge->lkey = queue->device->pd->local_dma_lkey;
wr.next = NULL;
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 0ef14f0fad86..c15a92163c1f 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -174,16 +174,14 @@ static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
{
struct request *rq;
- unsigned int bytes;
if (unlikely(nvme_tcp_async_req(req)))
return false; /* async events don't have a request */
rq = blk_mq_rq_from_pdu(req);
- bytes = blk_rq_payload_bytes(rq);
- return rq_data_dir(rq) == WRITE && bytes &&
- bytes <= nvme_tcp_inline_data_size(req->queue);
+ return rq_data_dir(rq) == WRITE && req->data_len &&
+ req->data_len <= nvme_tcp_inline_data_size(req->queue);
}
static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
@@ -1075,7 +1073,7 @@ static void nvme_tcp_io_work(struct work_struct *w)
if (result > 0)
pending = true;
else if (unlikely(result < 0))
- break;
+ return;
if (!pending)
return;
@@ -2164,7 +2162,9 @@ static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
c->common.flags |= NVME_CMD_SGL_METABUF;
- if (rq_data_dir(rq) == WRITE && req->data_len &&
+ if (!blk_rq_nr_phys_segments(rq))
+ nvme_tcp_set_sg_null(c);
+ else if (rq_data_dir(rq) == WRITE &&
req->data_len <= nvme_tcp_inline_data_size(queue))
nvme_tcp_set_sg_inline(queue, c, req->data_len);
else
@@ -2191,7 +2191,8 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
req->data_sent = 0;
req->pdu_len = 0;
req->pdu_sent = 0;
- req->data_len = blk_rq_payload_bytes(rq);
+ req->data_len = blk_rq_nr_phys_segments(rq) ?
+ blk_rq_payload_bytes(rq) : 0;
req->curr_bio = rq->bio;
if (rq_data_dir(rq) == WRITE &&
@@ -2298,6 +2299,9 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx)
struct nvme_tcp_queue *queue = hctx->driver_data;
struct sock *sk = queue->sock->sk;
+ if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
+ return 0;
+
if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
sk_busy_loop(sk, true);
nvme_tcp_try_recv(queue);
diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index 7aa10788b7c8..58cabd7b6fc5 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -1098,12 +1098,19 @@ static struct configfs_attribute *nvmet_referral_attrs[] = {
NULL,
};
-static void nvmet_referral_release(struct config_item *item)
+static void nvmet_referral_notify(struct config_group *group,
+ struct config_item *item)
{
struct nvmet_port *parent = to_nvmet_port(item->ci_parent->ci_parent);
struct nvmet_port *port = to_nvmet_port(item);
nvmet_referral_disable(parent, port);
+}
+
+static void nvmet_referral_release(struct config_item *item)
+{
+ struct nvmet_port *port = to_nvmet_port(item);
+
kfree(port);
}
@@ -1134,6 +1141,7 @@ static struct config_group *nvmet_referral_make(
static struct configfs_group_operations nvmet_referral_group_ops = {
.make_group = nvmet_referral_make,
+ .disconnect_notify = nvmet_referral_notify,
};
static const struct config_item_type nvmet_referrals_type = {
diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index a0db6371b43e..a8ceb7721640 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -684,7 +684,7 @@ nvmet_fc_delete_target_queue(struct nvmet_fc_tgt_queue *queue)
disconnect = atomic_xchg(&queue->connected, 0);
spin_lock_irqsave(&queue->qlock, flags);
- /* about outstanding io's */
+ /* abort outstanding io's */
for (i = 0; i < queue->sqsize; fod++, i++) {
if (fod->active) {
spin_lock(&fod->flock);
diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c
index 1c50af6219f3..f69ce66e2d44 100644
--- a/drivers/nvme/target/fcloop.c
+++ b/drivers/nvme/target/fcloop.c
@@ -198,10 +198,13 @@ struct fcloop_lport_priv {
};
struct fcloop_rport {
- struct nvme_fc_remote_port *remoteport;
- struct nvmet_fc_target_port *targetport;
- struct fcloop_nport *nport;
- struct fcloop_lport *lport;
+ struct nvme_fc_remote_port *remoteport;
+ struct nvmet_fc_target_port *targetport;
+ struct fcloop_nport *nport;
+ struct fcloop_lport *lport;
+ spinlock_t lock;
+ struct list_head ls_list;
+ struct work_struct ls_work;
};
struct fcloop_tport {
@@ -224,11 +227,10 @@ struct fcloop_nport {
};
struct fcloop_lsreq {
- struct fcloop_tport *tport;
struct nvmefc_ls_req *lsreq;
- struct work_struct work;
struct nvmefc_tgt_ls_req tgt_ls_req;
int status;
+ struct list_head ls_list; /* fcloop_rport->ls_list */
};
struct fcloop_rscn {
@@ -292,21 +294,32 @@ fcloop_delete_queue(struct nvme_fc_local_port *localport,
{
}
-
-/*
- * Transmit of LS RSP done (e.g. buffers all set). call back up
- * initiator "done" flows.
- */
static void
-fcloop_tgt_lsrqst_done_work(struct work_struct *work)
+fcloop_rport_lsrqst_work(struct work_struct *work)
{
- struct fcloop_lsreq *tls_req =
- container_of(work, struct fcloop_lsreq, work);
- struct fcloop_tport *tport = tls_req->tport;
- struct nvmefc_ls_req *lsreq = tls_req->lsreq;
+ struct fcloop_rport *rport =
+ container_of(work, struct fcloop_rport, ls_work);
+ struct fcloop_lsreq *tls_req;
- if (!tport || tport->remoteport)
- lsreq->done(lsreq, tls_req->status);
+ spin_lock(&rport->lock);
+ for (;;) {
+ tls_req = list_first_entry_or_null(&rport->ls_list,
+ struct fcloop_lsreq, ls_list);
+ if (!tls_req)
+ break;
+
+ list_del(&tls_req->ls_list);
+ spin_unlock(&rport->lock);
+
+ tls_req->lsreq->done(tls_req->lsreq, tls_req->status);
+ /*
+ * callee may free memory containing tls_req.
+ * do not reference lsreq after this.
+ */
+
+ spin_lock(&rport->lock);
+ }
+ spin_unlock(&rport->lock);
}
static int
@@ -319,17 +332,18 @@ fcloop_ls_req(struct nvme_fc_local_port *localport,
int ret = 0;
tls_req->lsreq = lsreq;
- INIT_WORK(&tls_req->work, fcloop_tgt_lsrqst_done_work);
+ INIT_LIST_HEAD(&tls_req->ls_list);
if (!rport->targetport) {
tls_req->status = -ECONNREFUSED;
- tls_req->tport = NULL;
- schedule_work(&tls_req->work);
+ spin_lock(&rport->lock);
+ list_add_tail(&rport->ls_list, &tls_req->ls_list);
+ spin_unlock(&rport->lock);
+ schedule_work(&rport->ls_work);
return ret;
}
tls_req->status = 0;
- tls_req->tport = rport->targetport->private;
ret = nvmet_fc_rcv_ls_req(rport->targetport, &tls_req->tgt_ls_req,
lsreq->rqstaddr, lsreq->rqstlen);
@@ -337,18 +351,28 @@ fcloop_ls_req(struct nvme_fc_local_port *localport,
}
static int
-fcloop_xmt_ls_rsp(struct nvmet_fc_target_port *tport,
+fcloop_xmt_ls_rsp(struct nvmet_fc_target_port *targetport,
struct nvmefc_tgt_ls_req *tgt_lsreq)
{
struct fcloop_lsreq *tls_req = tgt_ls_req_to_lsreq(tgt_lsreq);
struct nvmefc_ls_req *lsreq = tls_req->lsreq;
+ struct fcloop_tport *tport = targetport->private;
+ struct nvme_fc_remote_port *remoteport = tport->remoteport;
+ struct fcloop_rport *rport;
memcpy(lsreq->rspaddr, tgt_lsreq->rspbuf,
((lsreq->rsplen < tgt_lsreq->rsplen) ?
lsreq->rsplen : tgt_lsreq->rsplen));
+
tgt_lsreq->done(tgt_lsreq);
- schedule_work(&tls_req->work);
+ if (remoteport) {
+ rport = remoteport->private;
+ spin_lock(&rport->lock);
+ list_add_tail(&rport->ls_list, &tls_req->ls_list);
+ spin_unlock(&rport->lock);
+ schedule_work(&rport->ls_work);
+ }
return 0;
}
@@ -834,6 +858,7 @@ fcloop_remoteport_delete(struct nvme_fc_remote_port *remoteport)
{
struct fcloop_rport *rport = remoteport->private;
+ flush_work(&rport->ls_work);
fcloop_nport_put(rport->nport);
}
@@ -850,7 +875,6 @@ fcloop_targetport_delete(struct nvmet_fc_target_port *targetport)
#define FCLOOP_DMABOUND_4G 0xFFFFFFFF
static struct nvme_fc_port_template fctemplate = {
- .module = THIS_MODULE,
.localport_delete = fcloop_localport_delete,
.remoteport_delete = fcloop_remoteport_delete,
.create_queue = fcloop_create_queue,
@@ -1136,6 +1160,9 @@ fcloop_create_remote_port(struct device *dev, struct device_attribute *attr,
rport->nport = nport;
rport->lport = nport->lport;
nport->rport = rport;
+ spin_lock_init(&rport->lock);
+ INIT_WORK(&rport->ls_work, fcloop_rport_lsrqst_work);
+ INIT_LIST_HEAD(&rport->ls_list);
return count;
}
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index c90c06839d64..fd47de0e4e4e 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -78,6 +78,7 @@ enum nvmet_rdma_queue_state {
struct nvmet_rdma_queue {
struct rdma_cm_id *cm_id;
+ struct ib_qp *qp;
struct nvmet_port *port;
struct ib_cq *cq;
atomic_t sq_wr_avail;
@@ -105,6 +106,13 @@ struct nvmet_rdma_queue {
struct list_head queue_list;
};
+struct nvmet_rdma_port {
+ struct nvmet_port *nport;
+ struct sockaddr_storage addr;
+ struct rdma_cm_id *cm_id;
+ struct delayed_work repair_work;
+};
+
struct nvmet_rdma_device {
struct ib_device *device;
struct ib_pd *pd;
@@ -461,7 +469,7 @@ static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev,
if (ndev->srq)
ret = ib_post_srq_recv(ndev->srq, &cmd->wr, NULL);
else
- ret = ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, NULL);
+ ret = ib_post_recv(cmd->queue->qp, &cmd->wr, NULL);
if (unlikely(ret))
pr_err("post_recv cmd failed\n");
@@ -500,7 +508,7 @@ static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
if (rsp->n_rdma) {
- rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
+ rdma_rw_ctx_destroy(&rsp->rw, queue->qp,
queue->cm_id->port_num, rsp->req.sg,
rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
}
@@ -584,7 +592,7 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
WARN_ON(rsp->n_rdma <= 0);
atomic_add(rsp->n_rdma, &queue->sq_wr_avail);
- rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
+ rdma_rw_ctx_destroy(&rsp->rw, queue->qp,
queue->cm_id->port_num, rsp->req.sg,
rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
rsp->n_rdma = 0;
@@ -739,7 +747,7 @@ static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp)
}
if (nvmet_rdma_need_data_in(rsp)) {
- if (rdma_rw_ctx_post(&rsp->rw, queue->cm_id->qp,
+ if (rdma_rw_ctx_post(&rsp->rw, queue->qp,
queue->cm_id->port_num, &rsp->read_cqe, NULL))
nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR);
} else {
@@ -911,7 +919,8 @@ static void nvmet_rdma_free_dev(struct kref *ref)
static struct nvmet_rdma_device *
nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
{
- struct nvmet_port *port = cm_id->context;
+ struct nvmet_rdma_port *port = cm_id->context;
+ struct nvmet_port *nport = port->nport;
struct nvmet_rdma_device *ndev;
int inline_page_count;
int inline_sge_count;
@@ -928,17 +937,17 @@ nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
if (!ndev)
goto out_err;
- inline_page_count = num_pages(port->inline_data_size);
+ inline_page_count = num_pages(nport->inline_data_size);
inline_sge_count = max(cm_id->device->attrs.max_sge_rd,
cm_id->device->attrs.max_recv_sge) - 1;
if (inline_page_count > inline_sge_count) {
pr_warn("inline_data_size %d cannot be supported by device %s. Reducing to %lu.\n",
- port->inline_data_size, cm_id->device->name,
+ nport->inline_data_size, cm_id->device->name,
inline_sge_count * PAGE_SIZE);
- port->inline_data_size = inline_sge_count * PAGE_SIZE;
+ nport->inline_data_size = inline_sge_count * PAGE_SIZE;
inline_page_count = inline_sge_count;
}
- ndev->inline_data_size = port->inline_data_size;
+ ndev->inline_data_size = nport->inline_data_size;
ndev->inline_page_count = inline_page_count;
ndev->device = cm_id->device;
kref_init(&ndev->ref);
@@ -1024,6 +1033,7 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
pr_err("failed to create_qp ret= %d\n", ret);
goto err_destroy_cq;
}
+ queue->qp = queue->cm_id->qp;
atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr);
@@ -1052,11 +1062,10 @@ err_destroy_cq:
static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
{
- struct ib_qp *qp = queue->cm_id->qp;
-
- ib_drain_qp(qp);
- rdma_destroy_id(queue->cm_id);
- ib_destroy_qp(qp);
+ ib_drain_qp(queue->qp);
+ if (queue->cm_id)
+ rdma_destroy_id(queue->cm_id);
+ ib_destroy_qp(queue->qp);
ib_free_cq(queue->cq);
}
@@ -1266,6 +1275,7 @@ static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id,
static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event)
{
+ struct nvmet_rdma_port *port = cm_id->context;
struct nvmet_rdma_device *ndev;
struct nvmet_rdma_queue *queue;
int ret = -EINVAL;
@@ -1281,7 +1291,7 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
ret = -ENOMEM;
goto put_device;
}
- queue->port = cm_id->context;
+ queue->port = port->nport;
if (queue->host_qid == 0) {
/* Let inflight controller teardown complete */
@@ -1290,9 +1300,12 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn);
if (ret) {
- schedule_work(&queue->release_work);
- /* Destroying rdma_cm id is not needed here */
- return 0;
+ /*
+ * Don't destroy the cm_id in free path, as we implicitly
+ * destroy the cm_id here with non-zero ret code.
+ */
+ queue->cm_id = NULL;
+ goto free_queue;
}
mutex_lock(&nvmet_rdma_queue_mutex);
@@ -1301,6 +1314,8 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
return 0;
+free_queue:
+ nvmet_rdma_free_queue(queue);
put_device:
kref_put(&ndev->ref, nvmet_rdma_free_dev);
@@ -1406,7 +1421,7 @@ static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id,
static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id,
struct nvmet_rdma_queue *queue)
{
- struct nvmet_port *port;
+ struct nvmet_rdma_port *port;
if (queue) {
/*
@@ -1425,7 +1440,7 @@ static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id,
* cm_id destroy. use atomic xchg to make sure
* we don't compete with remove_port.
*/
- if (xchg(&port->priv, NULL) != cm_id)
+ if (xchg(&port->cm_id, NULL) != cm_id)
return 0;
/*
@@ -1456,6 +1471,13 @@ static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id,
nvmet_rdma_queue_established(queue);
break;
case RDMA_CM_EVENT_ADDR_CHANGE:
+ if (!queue) {
+ struct nvmet_rdma_port *port = cm_id->context;
+
+ schedule_delayed_work(&port->repair_work, 0);
+ break;
+ }
+ /* FALLTHROUGH */
case RDMA_CM_EVENT_DISCONNECTED:
case RDMA_CM_EVENT_TIMEWAIT_EXIT:
nvmet_rdma_queue_disconnect(queue);
@@ -1498,42 +1520,19 @@ restart:
mutex_unlock(&nvmet_rdma_queue_mutex);
}
-static int nvmet_rdma_add_port(struct nvmet_port *port)
+static void nvmet_rdma_disable_port(struct nvmet_rdma_port *port)
{
- struct rdma_cm_id *cm_id;
- struct sockaddr_storage addr = { };
- __kernel_sa_family_t af;
- int ret;
+ struct rdma_cm_id *cm_id = xchg(&port->cm_id, NULL);
- switch (port->disc_addr.adrfam) {
- case NVMF_ADDR_FAMILY_IP4:
- af = AF_INET;
- break;
- case NVMF_ADDR_FAMILY_IP6:
- af = AF_INET6;
- break;
- default:
- pr_err("address family %d not supported\n",
- port->disc_addr.adrfam);
- return -EINVAL;
- }
-
- if (port->inline_data_size < 0) {
- port->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE;
- } else if (port->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) {
- pr_warn("inline_data_size %u is too large, reducing to %u\n",
- port->inline_data_size,
- NVMET_RDMA_MAX_INLINE_DATA_SIZE);
- port->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE;
- }
+ if (cm_id)
+ rdma_destroy_id(cm_id);
+}
- ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr,
- port->disc_addr.trsvcid, &addr);
- if (ret) {
- pr_err("malformed ip/port passed: %s:%s\n",
- port->disc_addr.traddr, port->disc_addr.trsvcid);
- return ret;
- }
+static int nvmet_rdma_enable_port(struct nvmet_rdma_port *port)
+{
+ struct sockaddr *addr = (struct sockaddr *)&port->addr;
+ struct rdma_cm_id *cm_id;
+ int ret;
cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port,
RDMA_PS_TCP, IB_QPT_RC);
@@ -1552,23 +1551,19 @@ static int nvmet_rdma_add_port(struct nvmet_port *port)
goto out_destroy_id;
}
- ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr);
+ ret = rdma_bind_addr(cm_id, addr);
if (ret) {
- pr_err("binding CM ID to %pISpcs failed (%d)\n",
- (struct sockaddr *)&addr, ret);
+ pr_err("binding CM ID to %pISpcs failed (%d)\n", addr, ret);
goto out_destroy_id;
}
ret = rdma_listen(cm_id, 128);
if (ret) {
- pr_err("listening to %pISpcs failed (%d)\n",
- (struct sockaddr *)&addr, ret);
+ pr_err("listening to %pISpcs failed (%d)\n", addr, ret);
goto out_destroy_id;
}
- pr_info("enabling port %d (%pISpcs)\n",
- le16_to_cpu(port->disc_addr.portid), (struct sockaddr *)&addr);
- port->priv = cm_id;
+ port->cm_id = cm_id;
return 0;
out_destroy_id:
@@ -1576,18 +1571,92 @@ out_destroy_id:
return ret;
}
-static void nvmet_rdma_remove_port(struct nvmet_port *port)
+static void nvmet_rdma_repair_port_work(struct work_struct *w)
{
- struct rdma_cm_id *cm_id = xchg(&port->priv, NULL);
+ struct nvmet_rdma_port *port = container_of(to_delayed_work(w),
+ struct nvmet_rdma_port, repair_work);
+ int ret;
- if (cm_id)
- rdma_destroy_id(cm_id);
+ nvmet_rdma_disable_port(port);
+ ret = nvmet_rdma_enable_port(port);
+ if (ret)
+ schedule_delayed_work(&port->repair_work, 5 * HZ);
+}
+
+static int nvmet_rdma_add_port(struct nvmet_port *nport)
+{
+ struct nvmet_rdma_port *port;
+ __kernel_sa_family_t af;
+ int ret;
+
+ port = kzalloc(sizeof(*port), GFP_KERNEL);
+ if (!port)
+ return -ENOMEM;
+
+ nport->priv = port;
+ port->nport = nport;
+ INIT_DELAYED_WORK(&port->repair_work, nvmet_rdma_repair_port_work);
+
+ switch (nport->disc_addr.adrfam) {
+ case NVMF_ADDR_FAMILY_IP4:
+ af = AF_INET;
+ break;
+ case NVMF_ADDR_FAMILY_IP6:
+ af = AF_INET6;
+ break;
+ default:
+ pr_err("address family %d not supported\n",
+ nport->disc_addr.adrfam);
+ ret = -EINVAL;
+ goto out_free_port;
+ }
+
+ if (nport->inline_data_size < 0) {
+ nport->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE;
+ } else if (nport->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) {
+ pr_warn("inline_data_size %u is too large, reducing to %u\n",
+ nport->inline_data_size,
+ NVMET_RDMA_MAX_INLINE_DATA_SIZE);
+ nport->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE;
+ }
+
+ ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr,
+ nport->disc_addr.trsvcid, &port->addr);
+ if (ret) {
+ pr_err("malformed ip/port passed: %s:%s\n",
+ nport->disc_addr.traddr, nport->disc_addr.trsvcid);
+ goto out_free_port;
+ }
+
+ ret = nvmet_rdma_enable_port(port);
+ if (ret)
+ goto out_free_port;
+
+ pr_info("enabling port %d (%pISpcs)\n",
+ le16_to_cpu(nport->disc_addr.portid),
+ (struct sockaddr *)&port->addr);
+
+ return 0;
+
+out_free_port:
+ kfree(port);
+ return ret;
+}
+
+static void nvmet_rdma_remove_port(struct nvmet_port *nport)
+{
+ struct nvmet_rdma_port *port = nport->priv;
+
+ cancel_delayed_work_sync(&port->repair_work);
+ nvmet_rdma_disable_port(port);
+ kfree(port);
}
static void nvmet_rdma_disc_port_addr(struct nvmet_req *req,
- struct nvmet_port *port, char *traddr)
+ struct nvmet_port *nport, char *traddr)
{
- struct rdma_cm_id *cm_id = port->priv;
+ struct nvmet_rdma_port *port = nport->priv;
+ struct rdma_cm_id *cm_id = port->cm_id;
if (inet_addr_is_any((struct sockaddr *)&cm_id->route.addr.src_addr)) {
struct nvmet_rdma_rsp *rsp =
@@ -1597,7 +1666,7 @@ static void nvmet_rdma_disc_port_addr(struct nvmet_req *req,
sprintf(traddr, "%pISc", addr);
} else {
- memcpy(traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE);
+ memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE);
}
}
diff --git a/drivers/parisc/eisa.c b/drivers/parisc/eisa.c
index 9d00a24277aa..f96e5eaee87e 100644
--- a/drivers/parisc/eisa.c
+++ b/drivers/parisc/eisa.c
@@ -243,11 +243,6 @@ static irqreturn_t dummy_irq2_handler(int _, void *dev)
return IRQ_HANDLED;
}
-static struct irqaction irq2_action = {
- .handler = dummy_irq2_handler,
- .name = "cascade",
-};
-
static void init_eisa_pic(void)
{
unsigned long flags;
@@ -335,7 +330,8 @@ static int __init eisa_probe(struct parisc_device *dev)
}
/* Reserve IRQ2 */
- setup_irq(2, &irq2_action);
+ if (request_irq(2, dummy_irq2_handler, 0, "cascade", NULL))
+ pr_err("Failed to request irq 2 (cascade)\n");
for (i = 0; i < 16; i++) {
irq_set_chip_and_handler(i, &eisa_interrupt_type,
handle_simple_irq);
diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c
index 3ef0bb281e7c..390e92f2d8d1 100644
--- a/drivers/pci/ats.c
+++ b/drivers/pci/ats.c
@@ -366,6 +366,7 @@ int pci_enable_pasid(struct pci_dev *pdev, int features)
return 0;
}
+EXPORT_SYMBOL_GPL(pci_enable_pasid);
/**
* pci_disable_pasid - Disable the PASID capability
@@ -390,6 +391,7 @@ void pci_disable_pasid(struct pci_dev *pdev)
pdev->pasid_enabled = 0;
}
+EXPORT_SYMBOL_GPL(pci_disable_pasid);
/**
* pci_restore_pasid_state - Restore PASID capabilities
@@ -441,6 +443,7 @@ int pci_pasid_features(struct pci_dev *pdev)
return supported;
}
+EXPORT_SYMBOL_GPL(pci_pasid_features);
#define PASID_NUMBER_SHIFT 8
#define PASID_NUMBER_MASK (0x1f << PASID_NUMBER_SHIFT)
@@ -469,4 +472,5 @@ int pci_max_pasids(struct pci_dev *pdev)
return (1 << supported);
}
+EXPORT_SYMBOL_GPL(pci_max_pasids);
#endif /* CONFIG_PCI_PASID */
diff --git a/drivers/pcmcia/cs_internal.h b/drivers/pcmcia/cs_internal.h
index 33c9b6ea7364..fb9b17fa0fb5 100644
--- a/drivers/pcmcia/cs_internal.h
+++ b/drivers/pcmcia/cs_internal.h
@@ -40,7 +40,7 @@ struct cis_cache_entry {
unsigned int addr;
unsigned int len;
unsigned int attr;
- unsigned char cache[0];
+ unsigned char cache[];
};
struct pccard_resource_ops {
diff --git a/drivers/pcmcia/omap_cf.c b/drivers/pcmcia/omap_cf.c
index 0a04eb04f3a2..d3ef5534991e 100644
--- a/drivers/pcmcia/omap_cf.c
+++ b/drivers/pcmcia/omap_cf.c
@@ -329,7 +329,7 @@ static int __exit omap_cf_remove(struct platform_device *pdev)
static struct platform_driver omap_cf_driver = {
.driver = {
- .name = (char *) driver_name,
+ .name = driver_name,
},
.remove = __exit_p(omap_cf_remove),
};
diff --git a/drivers/pcmcia/rsrc_nonstatic.c b/drivers/pcmcia/rsrc_nonstatic.c
index 9e6922c08ef6..3b05760e69d6 100644
--- a/drivers/pcmcia/rsrc_nonstatic.c
+++ b/drivers/pcmcia/rsrc_nonstatic.c
@@ -1076,7 +1076,7 @@ static ssize_t show_io_db(struct device *dev,
for (p = data->io_db.next; p != &data->io_db; p = p->next) {
if (ret > (PAGE_SIZE - 10))
continue;
- ret += snprintf(&buf[ret], (PAGE_SIZE - ret - 1),
+ ret += scnprintf(&buf[ret], (PAGE_SIZE - ret - 1),
"0x%08lx - 0x%08lx\n",
((unsigned long) p->base),
((unsigned long) p->base + p->num - 1));
@@ -1133,7 +1133,7 @@ static ssize_t show_mem_db(struct device *dev,
p = p->next) {
if (ret > (PAGE_SIZE - 10))
continue;
- ret += snprintf(&buf[ret], (PAGE_SIZE - ret - 1),
+ ret += scnprintf(&buf[ret], (PAGE_SIZE - ret - 1),
"0x%08lx - 0x%08lx\n",
((unsigned long) p->base),
((unsigned long) p->base + p->num - 1));
@@ -1142,7 +1142,7 @@ static ssize_t show_mem_db(struct device *dev,
for (p = data->mem_db.next; p != &data->mem_db; p = p->next) {
if (ret > (PAGE_SIZE - 10))
continue;
- ret += snprintf(&buf[ret], (PAGE_SIZE - ret - 1),
+ ret += scnprintf(&buf[ret], (PAGE_SIZE - ret - 1),
"0x%08lx - 0x%08lx\n",
((unsigned long) p->base),
((unsigned long) p->base + p->num - 1));
diff --git a/drivers/pcmcia/sa1100_simpad.c b/drivers/pcmcia/sa1100_simpad.c
index e2e8729afd9d..784ada5b8c4f 100644
--- a/drivers/pcmcia/sa1100_simpad.c
+++ b/drivers/pcmcia/sa1100_simpad.c
@@ -14,7 +14,7 @@
#include <asm/mach-types.h>
#include <mach/simpad.h>
#include "sa1100_generic.h"
-
+
static int simpad_pcmcia_hw_init(struct soc_pcmcia_socket *skt)
{
@@ -66,7 +66,7 @@ simpad_pcmcia_configure_socket(struct soc_pcmcia_socket *skt,
simpad_clear_cs3_bit(VCC_3V_EN|VCC_5V_EN|EN0|EN1);
break;
- case 33:
+ case 33:
simpad_clear_cs3_bit(VCC_3V_EN|EN1);
simpad_set_cs3_bit(VCC_5V_EN|EN0);
break;
@@ -95,7 +95,7 @@ static void simpad_pcmcia_socket_suspend(struct soc_pcmcia_socket *skt)
simpad_set_cs3_bit(PCMCIA_RESET);
}
-static struct pcmcia_low_level simpad_pcmcia_ops = {
+static struct pcmcia_low_level simpad_pcmcia_ops = {
.owner = THIS_MODULE,
.hw_init = simpad_pcmcia_hw_init,
.hw_shutdown = simpad_pcmcia_hw_shutdown,
diff --git a/drivers/pcmcia/soc_common.h b/drivers/pcmcia/soc_common.h
index b7f993f1bbd0..222e81c79365 100644
--- a/drivers/pcmcia/soc_common.h
+++ b/drivers/pcmcia/soc_common.h
@@ -88,7 +88,7 @@ struct soc_pcmcia_socket {
struct skt_dev_info {
int nskt;
- struct soc_pcmcia_socket skt[0];
+ struct soc_pcmcia_socket skt[];
};
struct pcmcia_state {
diff --git a/drivers/pcmcia/yenta_socket.c b/drivers/pcmcia/yenta_socket.c
index 49b1c6a1bdbe..bf6529b0b5b0 100644
--- a/drivers/pcmcia/yenta_socket.c
+++ b/drivers/pcmcia/yenta_socket.c
@@ -180,12 +180,12 @@ static ssize_t show_yenta_registers(struct device *yentadev, struct device_attri
for (i = 0; i < 0x24; i += 4) {
unsigned val;
if (!(i & 15))
- offset += snprintf(buf + offset, PAGE_SIZE - offset, "\n%02x:", i);
+ offset += scnprintf(buf + offset, PAGE_SIZE - offset, "\n%02x:", i);
val = cb_readl(socket, i);
- offset += snprintf(buf + offset, PAGE_SIZE - offset, " %08x", val);
+ offset += scnprintf(buf + offset, PAGE_SIZE - offset, " %08x", val);
}
- offset += snprintf(buf + offset, PAGE_SIZE - offset, "\n\nExCA registers:");
+ offset += scnprintf(buf + offset, PAGE_SIZE - offset, "\n\nExCA registers:");
for (i = 0; i < 0x45; i++) {
unsigned char val;
if (!(i & 7)) {
@@ -193,10 +193,10 @@ static ssize_t show_yenta_registers(struct device *yentadev, struct device_attri
memcpy(buf + offset, " -", 2);
offset += 2;
} else
- offset += snprintf(buf + offset, PAGE_SIZE - offset, "\n%02x:", i);
+ offset += scnprintf(buf + offset, PAGE_SIZE - offset, "\n%02x:", i);
}
val = exca_readb(socket, i);
- offset += snprintf(buf + offset, PAGE_SIZE - offset, " %02x", val);
+ offset += scnprintf(buf + offset, PAGE_SIZE - offset, " %02x", val);
}
buf[offset++] = '\n';
return offset;
diff --git a/drivers/platform/chrome/Kconfig b/drivers/platform/chrome/Kconfig
index 5f57282a28da..03ea5129ed0c 100644
--- a/drivers/platform/chrome/Kconfig
+++ b/drivers/platform/chrome/Kconfig
@@ -7,7 +7,7 @@ config MFD_CROS_EC
tristate "Platform support for Chrome hardware (transitional)"
select CHROME_PLATFORMS
select CROS_EC
- select CONFIG_MFD_CROS_EC_DEV
+ select MFD_CROS_EC_DEV
depends on X86 || ARM || ARM64 || COMPILE_TEST
help
This is a transitional Kconfig option and will be removed after
@@ -214,6 +214,17 @@ config CROS_EC_SYSFS
To compile this driver as a module, choose M here: the
module will be called cros_ec_sysfs.
+config CROS_EC_TYPEC
+ tristate "ChromeOS EC Type-C Connector Control"
+ depends on MFD_CROS_EC_DEV && TYPEC
+ default MFD_CROS_EC_DEV
+ help
+ If you say Y here, you get support for accessing Type C connector
+ information from the Chrome OS EC.
+
+ To compile this driver as a module, choose M here: the module will be
+ called cros_ec_typec.
+
config CROS_USBPD_LOGGER
tristate "Logging driver for USB PD charger"
depends on CHARGER_CROS_USBPD
@@ -226,6 +237,20 @@ config CROS_USBPD_LOGGER
To compile this driver as a module, choose M here: the
module will be called cros_usbpd_logger.
+config CROS_USBPD_NOTIFY
+ tristate "ChromeOS Type-C power delivery event notifier"
+ depends on MFD_CROS_EC_DEV
+ default MFD_CROS_EC_DEV
+ help
+ If you say Y here, you get support for Type-C PD event notifications
+ from the ChromeOS EC. On ACPI platorms this driver will bind to the
+ GOOG0003 ACPI device, and on platforms which don't have this device it
+ will get initialized on ECs which support the feature
+ EC_FEATURE_USB_PD.
+
+ To compile this driver as a module, choose M here: the
+ module will be called cros_usbpd_notify.
+
source "drivers/platform/chrome/wilco_ec/Kconfig"
endif # CHROMEOS_PLATFORMS
diff --git a/drivers/platform/chrome/Makefile b/drivers/platform/chrome/Makefile
index aacd5920d8a1..41baccba033f 100644
--- a/drivers/platform/chrome/Makefile
+++ b/drivers/platform/chrome/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_CROS_EC_ISHTP) += cros_ec_ishtp.o
obj-$(CONFIG_CROS_EC_RPMSG) += cros_ec_rpmsg.o
obj-$(CONFIG_CROS_EC_SPI) += cros_ec_spi.o
cros_ec_lpcs-objs := cros_ec_lpc.o cros_ec_lpc_mec.o
+obj-$(CONFIG_CROS_EC_TYPEC) += cros_ec_typec.o
obj-$(CONFIG_CROS_EC_LPC) += cros_ec_lpcs.o
obj-$(CONFIG_CROS_EC_PROTO) += cros_ec_proto.o cros_ec_trace.o
obj-$(CONFIG_CROS_KBD_LED_BACKLIGHT) += cros_kbd_led_backlight.o
@@ -19,8 +20,10 @@ obj-$(CONFIG_CROS_EC_CHARDEV) += cros_ec_chardev.o
obj-$(CONFIG_CROS_EC_LIGHTBAR) += cros_ec_lightbar.o
obj-$(CONFIG_CROS_EC_VBC) += cros_ec_vbc.o
obj-$(CONFIG_CROS_EC_DEBUGFS) += cros_ec_debugfs.o
-obj-$(CONFIG_CROS_EC_SENSORHUB) += cros_ec_sensorhub.o
+cros-ec-sensorhub-objs := cros_ec_sensorhub.o cros_ec_sensorhub_ring.o
+obj-$(CONFIG_CROS_EC_SENSORHUB) += cros-ec-sensorhub.o
obj-$(CONFIG_CROS_EC_SYSFS) += cros_ec_sysfs.o
obj-$(CONFIG_CROS_USBPD_LOGGER) += cros_usbpd_logger.o
+obj-$(CONFIG_CROS_USBPD_NOTIFY) += cros_usbpd_notify.o
obj-$(CONFIG_WILCO_EC) += wilco_ec/
diff --git a/drivers/platform/chrome/chromeos_laptop.c b/drivers/platform/chrome/chromeos_laptop.c
index 4f3651fcd9fe..472a03daa869 100644
--- a/drivers/platform/chrome/chromeos_laptop.c
+++ b/drivers/platform/chrome/chromeos_laptop.c
@@ -103,7 +103,7 @@ chromes_laptop_instantiate_i2c_device(struct i2c_adapter *adapter,
pr_debug("%d-%02x is probed at %02x\n",
adapter->nr, info->addr, dummy->addr);
i2c_unregister_device(dummy);
- client = i2c_new_device(adapter, info);
+ client = i2c_new_client_device(adapter, info);
}
}
diff --git a/drivers/platform/chrome/cros_ec.c b/drivers/platform/chrome/cros_ec.c
index 6fc8f2c3ac51..3104680b7485 100644
--- a/drivers/platform/chrome/cros_ec.c
+++ b/drivers/platform/chrome/cros_ec.c
@@ -120,7 +120,7 @@ static int cros_ec_sleep_event(struct cros_ec_device *ec_dev, u8 sleep_event)
buf.msg.command = EC_CMD_HOST_SLEEP_EVENT;
- ret = cros_ec_cmd_xfer(ec_dev, &buf.msg);
+ ret = cros_ec_cmd_xfer_status(ec_dev, &buf.msg);
/* For now, report failure to transition to S0ix with a warning. */
if (ret >= 0 && ec_dev->host_sleep_v1 &&
@@ -138,6 +138,24 @@ static int cros_ec_sleep_event(struct cros_ec_device *ec_dev, u8 sleep_event)
return ret;
}
+static int cros_ec_ready_event(struct notifier_block *nb,
+ unsigned long queued_during_suspend,
+ void *_notify)
+{
+ struct cros_ec_device *ec_dev = container_of(nb, struct cros_ec_device,
+ notifier_ready);
+ u32 host_event = cros_ec_get_host_event(ec_dev);
+
+ if (host_event & EC_HOST_EVENT_MASK(EC_HOST_EVENT_INTERFACE_READY)) {
+ mutex_lock(&ec_dev->lock);
+ cros_ec_query_all(ec_dev);
+ mutex_unlock(&ec_dev->lock);
+ return NOTIFY_OK;
+ }
+
+ return NOTIFY_DONE;
+}
+
/**
* cros_ec_register() - Register a new ChromeOS EC, using the provided info.
* @ec_dev: Device to register.
@@ -237,6 +255,18 @@ int cros_ec_register(struct cros_ec_device *ec_dev)
dev_dbg(ec_dev->dev, "Error %d clearing sleep event to ec",
err);
+ if (ec_dev->mkbp_event_supported) {
+ /*
+ * Register the notifier for EC_HOST_EVENT_INTERFACE_READY
+ * event.
+ */
+ ec_dev->notifier_ready.notifier_call = cros_ec_ready_event;
+ err = blocking_notifier_chain_register(&ec_dev->event_notifier,
+ &ec_dev->notifier_ready);
+ if (err)
+ return err;
+ }
+
dev_info(dev, "Chrome EC device registered\n");
return 0;
diff --git a/drivers/platform/chrome/cros_ec_chardev.c b/drivers/platform/chrome/cros_ec_chardev.c
index c65e70bc168d..e0bce869c49a 100644
--- a/drivers/platform/chrome/cros_ec_chardev.c
+++ b/drivers/platform/chrome/cros_ec_chardev.c
@@ -48,7 +48,7 @@ struct ec_event {
struct list_head node;
size_t size;
u8 event_type;
- u8 data[0];
+ u8 data[];
};
static int ec_get_version(struct cros_ec_dev *ec, char *str, int maxlen)
@@ -301,7 +301,7 @@ static long cros_ec_chardev_ioctl_xcmd(struct cros_ec_dev *ec, void __user *arg)
}
s_cmd->command += ec->cmd_offset;
- ret = cros_ec_cmd_xfer(ec->ec_dev, s_cmd);
+ ret = cros_ec_cmd_xfer_status(ec->ec_dev, s_cmd);
/* Only copy data to userland if data was received. */
if (ret < 0)
goto exit;
diff --git a/drivers/platform/chrome/cros_ec_lightbar.c b/drivers/platform/chrome/cros_ec_lightbar.c
index b4c110c5fee0..b59180bff5a3 100644
--- a/drivers/platform/chrome/cros_ec_lightbar.c
+++ b/drivers/platform/chrome/cros_ec_lightbar.c
@@ -116,7 +116,7 @@ static int get_lightbar_version(struct cros_ec_dev *ec,
param = (struct ec_params_lightbar *)msg->data;
param->cmd = LIGHTBAR_CMD_VERSION;
- ret = cros_ec_cmd_xfer(ec->ec_dev, msg);
+ ret = cros_ec_cmd_xfer_status(ec->ec_dev, msg);
if (ret < 0) {
ret = 0;
goto exit;
@@ -193,15 +193,10 @@ static ssize_t brightness_store(struct device *dev,
if (ret)
goto exit;
- ret = cros_ec_cmd_xfer(ec->ec_dev, msg);
+ ret = cros_ec_cmd_xfer_status(ec->ec_dev, msg);
if (ret < 0)
goto exit;
- if (msg->result != EC_RES_SUCCESS) {
- ret = -EINVAL;
- goto exit;
- }
-
ret = count;
exit:
kfree(msg);
@@ -258,13 +253,10 @@ static ssize_t led_rgb_store(struct device *dev, struct device_attribute *attr,
goto exit;
}
- ret = cros_ec_cmd_xfer(ec->ec_dev, msg);
+ ret = cros_ec_cmd_xfer_status(ec->ec_dev, msg);
if (ret < 0)
goto exit;
- if (msg->result != EC_RES_SUCCESS)
- goto exit;
-
i = 0;
ok = 1;
}
@@ -305,14 +297,13 @@ static ssize_t sequence_show(struct device *dev,
if (ret)
goto exit;
- ret = cros_ec_cmd_xfer(ec->ec_dev, msg);
- if (ret < 0)
- goto exit;
-
- if (msg->result != EC_RES_SUCCESS) {
+ ret = cros_ec_cmd_xfer_status(ec->ec_dev, msg);
+ if (ret == -EPROTO) {
ret = scnprintf(buf, PAGE_SIZE,
"ERROR: EC returned %d\n", msg->result);
goto exit;
+ } else if (ret < 0) {
+ goto exit;
}
resp = (struct ec_response_lightbar *)msg->data;
@@ -344,13 +335,10 @@ static int lb_send_empty_cmd(struct cros_ec_dev *ec, uint8_t cmd)
if (ret)
goto error;
- ret = cros_ec_cmd_xfer(ec->ec_dev, msg);
+ ret = cros_ec_cmd_xfer_status(ec->ec_dev, msg);
if (ret < 0)
goto error;
- if (msg->result != EC_RES_SUCCESS) {
- ret = -EINVAL;
- goto error;
- }
+
ret = 0;
error:
kfree(msg);
@@ -377,13 +365,10 @@ static int lb_manual_suspend_ctrl(struct cros_ec_dev *ec, uint8_t enable)
if (ret)
goto error;
- ret = cros_ec_cmd_xfer(ec->ec_dev, msg);
+ ret = cros_ec_cmd_xfer_status(ec->ec_dev, msg);
if (ret < 0)
goto error;
- if (msg->result != EC_RES_SUCCESS) {
- ret = -EINVAL;
- goto error;
- }
+
ret = 0;
error:
kfree(msg);
@@ -425,15 +410,10 @@ static ssize_t sequence_store(struct device *dev, struct device_attribute *attr,
if (ret)
goto exit;
- ret = cros_ec_cmd_xfer(ec->ec_dev, msg);
+ ret = cros_ec_cmd_xfer_status(ec->ec_dev, msg);
if (ret < 0)
goto exit;
- if (msg->result != EC_RES_SUCCESS) {
- ret = -EINVAL;
- goto exit;
- }
-
ret = count;
exit:
kfree(msg);
@@ -487,13 +467,9 @@ static ssize_t program_store(struct device *dev, struct device_attribute *attr,
*/
msg->outsize = count + extra_bytes;
- ret = cros_ec_cmd_xfer(ec->ec_dev, msg);
+ ret = cros_ec_cmd_xfer_status(ec->ec_dev, msg);
if (ret < 0)
goto exit;
- if (msg->result != EC_RES_SUCCESS) {
- ret = -EINVAL;
- goto exit;
- }
ret = count;
exit:
diff --git a/drivers/platform/chrome/cros_ec_proto.c b/drivers/platform/chrome/cros_ec_proto.c
index 3cfa643f1d07..3e745e0fe092 100644
--- a/drivers/platform/chrome/cros_ec_proto.c
+++ b/drivers/platform/chrome/cros_ec_proto.c
@@ -553,7 +553,10 @@ EXPORT_SYMBOL(cros_ec_cmd_xfer);
* replied with success status. It's not necessary to check msg->result when
* using this function.
*
- * Return: The number of bytes transferred on success or negative error code.
+ * Return:
+ * >=0 - The number of bytes transferred
+ * -ENOTSUPP - Operation not supported
+ * -EPROTO - Protocol error
*/
int cros_ec_cmd_xfer_status(struct cros_ec_device *ec_dev,
struct cros_ec_command *msg)
@@ -563,6 +566,10 @@ int cros_ec_cmd_xfer_status(struct cros_ec_device *ec_dev,
ret = cros_ec_cmd_xfer(ec_dev, msg);
if (ret < 0) {
dev_err(ec_dev->dev, "Command xfer error (err:%d)\n", ret);
+ } else if (msg->result == EC_RES_INVALID_VERSION) {
+ dev_dbg(ec_dev->dev, "Command invalid version (err:%d)\n",
+ msg->result);
+ return -ENOTSUPP;
} else if (msg->result != EC_RES_SUCCESS) {
dev_dbg(ec_dev->dev, "Command result (err: %d)\n", msg->result);
return -EPROTO;
diff --git a/drivers/platform/chrome/cros_ec_rpmsg.c b/drivers/platform/chrome/cros_ec_rpmsg.c
index dbc3f5523b83..7e8629e3db74 100644
--- a/drivers/platform/chrome/cros_ec_rpmsg.c
+++ b/drivers/platform/chrome/cros_ec_rpmsg.c
@@ -44,6 +44,8 @@ struct cros_ec_rpmsg {
struct completion xfer_ack;
struct work_struct host_event_work;
struct rpmsg_endpoint *ept;
+ bool has_pending_host_event;
+ bool probe_done;
};
/**
@@ -177,7 +179,14 @@ static int cros_ec_rpmsg_callback(struct rpmsg_device *rpdev, void *data,
memcpy(ec_dev->din, resp->data, len);
complete(&ec_rpmsg->xfer_ack);
} else if (resp->type == HOST_EVENT_MARK) {
- schedule_work(&ec_rpmsg->host_event_work);
+ /*
+ * If the host event is sent before cros_ec_register is
+ * finished, queue the host event.
+ */
+ if (ec_rpmsg->probe_done)
+ schedule_work(&ec_rpmsg->host_event_work);
+ else
+ ec_rpmsg->has_pending_host_event = true;
} else {
dev_warn(ec_dev->dev, "rpmsg received invalid type = %d",
resp->type);
@@ -240,6 +249,11 @@ static int cros_ec_rpmsg_probe(struct rpmsg_device *rpdev)
return ret;
}
+ ec_rpmsg->probe_done = true;
+
+ if (ec_rpmsg->has_pending_host_event)
+ schedule_work(&ec_rpmsg->host_event_work);
+
return 0;
}
diff --git a/drivers/platform/chrome/cros_ec_sensorhub.c b/drivers/platform/chrome/cros_ec_sensorhub.c
index 79fefd3bb0fa..b7f2c00db5e1 100644
--- a/drivers/platform/chrome/cros_ec_sensorhub.c
+++ b/drivers/platform/chrome/cros_ec_sensorhub.c
@@ -50,10 +50,8 @@ static int cros_ec_sensorhub_register(struct device *dev,
struct cros_ec_sensorhub *sensorhub)
{
int sensor_type[MOTIONSENSE_TYPE_MAX] = { 0 };
+ struct cros_ec_command *msg = sensorhub->msg;
struct cros_ec_dev *ec = sensorhub->ec;
- struct ec_params_motion_sense *params;
- struct ec_response_motion_sense *resp;
- struct cros_ec_command *msg;
int ret, i, sensor_num;
char *name;
@@ -65,27 +63,19 @@ static int cros_ec_sensorhub_register(struct device *dev,
return sensor_num;
}
+ sensorhub->sensor_num = sensor_num;
if (sensor_num == 0) {
dev_err(dev, "Zero sensors reported.\n");
return -EINVAL;
}
- /* Prepare a message to send INFO command to each sensor. */
- msg = kzalloc(sizeof(*msg) + max(sizeof(*params), sizeof(*resp)),
- GFP_KERNEL);
- if (!msg)
- return -ENOMEM;
-
msg->version = 1;
- msg->command = EC_CMD_MOTION_SENSE_CMD + ec->cmd_offset;
- msg->outsize = sizeof(*params);
- msg->insize = sizeof(*resp);
- params = (struct ec_params_motion_sense *)msg->data;
- resp = (struct ec_response_motion_sense *)msg->data;
+ msg->insize = sizeof(struct ec_response_motion_sense);
+ msg->outsize = sizeof(struct ec_params_motion_sense);
for (i = 0; i < sensor_num; i++) {
- params->cmd = MOTIONSENSE_CMD_INFO;
- params->info.sensor_num = i;
+ sensorhub->params->cmd = MOTIONSENSE_CMD_INFO;
+ sensorhub->params->info.sensor_num = i;
ret = cros_ec_cmd_xfer_status(ec->ec_dev, msg);
if (ret < 0) {
@@ -94,7 +84,7 @@ static int cros_ec_sensorhub_register(struct device *dev,
continue;
}
- switch (resp->info.type) {
+ switch (sensorhub->resp->info.type) {
case MOTIONSENSE_TYPE_ACCEL:
name = "cros-ec-accel";
break;
@@ -117,15 +107,16 @@ static int cros_ec_sensorhub_register(struct device *dev,
name = "cros-ec-activity";
break;
default:
- dev_warn(dev, "unknown type %d\n", resp->info.type);
+ dev_warn(dev, "unknown type %d\n",
+ sensorhub->resp->info.type);
continue;
}
ret = cros_ec_sensorhub_allocate_sensor(dev, name, i);
if (ret)
- goto error;
+ return ret;
- sensor_type[resp->info.type]++;
+ sensor_type[sensorhub->resp->info.type]++;
}
if (sensor_type[MOTIONSENSE_TYPE_ACCEL] >= 2)
@@ -137,29 +128,41 @@ static int cros_ec_sensorhub_register(struct device *dev,
"cros-ec-lid-angle",
0);
if (ret)
- goto error;
+ return ret;
}
- kfree(msg);
return 0;
-
-error:
- kfree(msg);
- return ret;
}
static int cros_ec_sensorhub_probe(struct platform_device *pdev)
{
struct device *dev = &pdev->dev;
+ struct cros_ec_dev *ec = dev_get_drvdata(dev->parent);
struct cros_ec_sensorhub *data;
+ struct cros_ec_command *msg;
int ret;
int i;
+ msg = devm_kzalloc(dev, sizeof(struct cros_ec_command) +
+ max((u16)sizeof(struct ec_params_motion_sense),
+ ec->ec_dev->max_response), GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ msg->command = EC_CMD_MOTION_SENSE_CMD + ec->cmd_offset;
+
data = devm_kzalloc(dev, sizeof(struct cros_ec_sensorhub), GFP_KERNEL);
if (!data)
return -ENOMEM;
- data->ec = dev_get_drvdata(dev->parent);
+ mutex_init(&data->cmd_lock);
+
+ data->dev = dev;
+ data->ec = ec;
+ data->msg = msg;
+ data->params = (struct ec_params_motion_sense *)msg->data;
+ data->resp = (struct ec_response_motion_sense *)msg->data;
+
dev_set_drvdata(dev, data);
/* Check whether this EC is a sensor hub. */
@@ -172,7 +175,8 @@ static int cros_ec_sensorhub_probe(struct platform_device *pdev)
* If the device has sensors but does not claim to
* be a sensor hub, we are in legacy mode.
*/
- for (i = 0; i < 2; i++) {
+ data->sensor_num = 2;
+ for (i = 0; i < data->sensor_num; i++) {
ret = cros_ec_sensorhub_allocate_sensor(dev,
"cros-ec-accel-legacy", i);
if (ret)
@@ -180,12 +184,63 @@ static int cros_ec_sensorhub_probe(struct platform_device *pdev)
}
}
+ /*
+ * If the EC does not have a FIFO, the sensors will query their data
+ * themselves via sysfs or a software trigger.
+ */
+ if (cros_ec_check_features(ec, EC_FEATURE_MOTION_SENSE_FIFO)) {
+ ret = cros_ec_sensorhub_ring_add(data);
+ if (ret)
+ return ret;
+ /*
+ * The msg and its data is not under the control of the ring
+ * handler.
+ */
+ return devm_add_action_or_reset(dev,
+ cros_ec_sensorhub_ring_remove,
+ data);
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_PM_SLEEP
+/*
+ * When the EC is suspending, we must stop sending interrupt,
+ * we may use the same interrupt line for waking up the device.
+ * Tell the EC to stop sending non-interrupt event on the iio ring.
+ */
+static int cros_ec_sensorhub_suspend(struct device *dev)
+{
+ struct platform_device *pdev = to_platform_device(dev);
+ struct cros_ec_sensorhub *sensorhub = platform_get_drvdata(pdev);
+ struct cros_ec_dev *ec = sensorhub->ec;
+
+ if (cros_ec_check_features(ec, EC_FEATURE_MOTION_SENSE_FIFO))
+ return cros_ec_sensorhub_ring_fifo_enable(sensorhub, false);
return 0;
}
+static int cros_ec_sensorhub_resume(struct device *dev)
+{
+ struct platform_device *pdev = to_platform_device(dev);
+ struct cros_ec_sensorhub *sensorhub = platform_get_drvdata(pdev);
+ struct cros_ec_dev *ec = sensorhub->ec;
+
+ if (cros_ec_check_features(ec, EC_FEATURE_MOTION_SENSE_FIFO))
+ return cros_ec_sensorhub_ring_fifo_enable(sensorhub, true);
+ return 0;
+}
+#endif
+
+static SIMPLE_DEV_PM_OPS(cros_ec_sensorhub_pm_ops,
+ cros_ec_sensorhub_suspend,
+ cros_ec_sensorhub_resume);
+
static struct platform_driver cros_ec_sensorhub_driver = {
.driver = {
.name = DRV_NAME,
+ .pm = &cros_ec_sensorhub_pm_ops,
},
.probe = cros_ec_sensorhub_probe,
};
diff --git a/drivers/platform/chrome/cros_ec_sensorhub_ring.c b/drivers/platform/chrome/cros_ec_sensorhub_ring.c
new file mode 100644
index 000000000000..230e6cf3da2f
--- /dev/null
+++ b/drivers/platform/chrome/cros_ec_sensorhub_ring.c
@@ -0,0 +1,1046 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Driver for Chrome OS EC Sensor hub FIFO.
+ *
+ * Copyright 2020 Google LLC
+ */
+
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/iio/iio.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/platform_data/cros_ec_commands.h>
+#include <linux/platform_data/cros_ec_proto.h>
+#include <linux/platform_data/cros_ec_sensorhub.h>
+#include <linux/platform_device.h>
+#include <linux/sort.h>
+#include <linux/slab.h>
+
+/* Precision of fixed point for the m values from the filter */
+#define M_PRECISION BIT(23)
+
+/* Only activate the filter once we have at least this many elements. */
+#define TS_HISTORY_THRESHOLD 8
+
+/*
+ * If we don't have any history entries for this long, empty the filter to
+ * make sure there are no big discontinuities.
+ */
+#define TS_HISTORY_BORED_US 500000
+
+/* To measure by how much the filter is overshooting, if it happens. */
+#define FUTURE_TS_ANALYTICS_COUNT_MAX 100
+
+static inline int
+cros_sensorhub_send_sample(struct cros_ec_sensorhub *sensorhub,
+ struct cros_ec_sensors_ring_sample *sample)
+{
+ cros_ec_sensorhub_push_data_cb_t cb;
+ int id = sample->sensor_id;
+ struct iio_dev *indio_dev;
+
+ if (id > sensorhub->sensor_num)
+ return -EINVAL;
+
+ cb = sensorhub->push_data[id].push_data_cb;
+ if (!cb)
+ return 0;
+
+ indio_dev = sensorhub->push_data[id].indio_dev;
+
+ if (sample->flag & MOTIONSENSE_SENSOR_FLAG_FLUSH)
+ return 0;
+
+ return cb(indio_dev, sample->vector, sample->timestamp);
+}
+
+/**
+ * cros_ec_sensorhub_register_push_data() - register the callback to the hub.
+ *
+ * @sensorhub : Sensor Hub object
+ * @sensor_num : The sensor the caller is interested in.
+ * @indio_dev : The iio device to use when a sample arrives.
+ * @cb : The callback to call when a sample arrives.
+ *
+ * The callback cb will be used by cros_ec_sensorhub_ring to distribute events
+ * from the EC.
+ *
+ * Return: 0 when callback is registered.
+ * EINVAL is the sensor number is invalid or the slot already used.
+ */
+int cros_ec_sensorhub_register_push_data(struct cros_ec_sensorhub *sensorhub,
+ u8 sensor_num,
+ struct iio_dev *indio_dev,
+ cros_ec_sensorhub_push_data_cb_t cb)
+{
+ if (sensor_num >= sensorhub->sensor_num)
+ return -EINVAL;
+ if (sensorhub->push_data[sensor_num].indio_dev)
+ return -EINVAL;
+
+ sensorhub->push_data[sensor_num].indio_dev = indio_dev;
+ sensorhub->push_data[sensor_num].push_data_cb = cb;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(cros_ec_sensorhub_register_push_data);
+
+void cros_ec_sensorhub_unregister_push_data(struct cros_ec_sensorhub *sensorhub,
+ u8 sensor_num)
+{
+ sensorhub->push_data[sensor_num].indio_dev = NULL;
+ sensorhub->push_data[sensor_num].push_data_cb = NULL;
+}
+EXPORT_SYMBOL_GPL(cros_ec_sensorhub_unregister_push_data);
+
+/**
+ * cros_ec_sensorhub_ring_fifo_enable() - Enable or disable interrupt generation
+ * for FIFO events.
+ * @sensorhub: Sensor Hub object
+ * @on: true when events are requested.
+ *
+ * To be called before sleeping or when noone is listening.
+ * Return: 0 on success, or an error when we can not communicate with the EC.
+ *
+ */
+int cros_ec_sensorhub_ring_fifo_enable(struct cros_ec_sensorhub *sensorhub,
+ bool on)
+{
+ int ret, i;
+
+ mutex_lock(&sensorhub->cmd_lock);
+ if (sensorhub->tight_timestamps)
+ for (i = 0; i < sensorhub->sensor_num; i++)
+ sensorhub->batch_state[i].last_len = 0;
+
+ sensorhub->params->cmd = MOTIONSENSE_CMD_FIFO_INT_ENABLE;
+ sensorhub->params->fifo_int_enable.enable = on;
+
+ sensorhub->msg->outsize = sizeof(struct ec_params_motion_sense);
+ sensorhub->msg->insize = sizeof(struct ec_response_motion_sense);
+
+ ret = cros_ec_cmd_xfer_status(sensorhub->ec->ec_dev, sensorhub->msg);
+ mutex_unlock(&sensorhub->cmd_lock);
+
+ /* We expect to receive a payload of 4 bytes, ignore. */
+ if (ret > 0)
+ ret = 0;
+
+ return ret;
+}
+
+static int cros_ec_sensor_ring_median_cmp(const void *pv1, const void *pv2)
+{
+ s64 v1 = *(s64 *)pv1;
+ s64 v2 = *(s64 *)pv2;
+
+ if (v1 > v2)
+ return 1;
+ else if (v1 < v2)
+ return -1;
+ else
+ return 0;
+}
+
+/*
+ * cros_ec_sensor_ring_median: Gets median of an array of numbers
+ *
+ * For now it's implemented using an inefficient > O(n) sort then return
+ * the middle element. A more optimal method would be something like
+ * quickselect, but given that n = 64 we can probably live with it in the
+ * name of clarity.
+ *
+ * Warning: the input array gets modified (sorted)!
+ */
+static s64 cros_ec_sensor_ring_median(s64 *array, size_t length)
+{
+ sort(array, length, sizeof(s64), cros_ec_sensor_ring_median_cmp, NULL);
+ return array[length / 2];
+}
+
+/*
+ * IRQ Timestamp Filtering
+ *
+ * Lower down in cros_ec_sensor_ring_process_event(), for each sensor event
+ * we have to calculate it's timestamp in the AP timebase. There are 3 time
+ * points:
+ * a - EC timebase, sensor event
+ * b - EC timebase, IRQ
+ * c - AP timebase, IRQ
+ * a' - what we want: sensor even in AP timebase
+ *
+ * While a and b are recorded at accurate times (due to the EC real time
+ * nature); c is pretty untrustworthy, even though it's recorded the
+ * first thing in ec_irq_handler(). There is a very good change we'll get
+ * added lantency due to:
+ * other irqs
+ * ddrfreq
+ * cpuidle
+ *
+ * Normally a' = c - b + a, but if we do that naive math any jitter in c
+ * will get coupled in a', which we don't want. We want a function
+ * a' = cros_ec_sensor_ring_ts_filter(a) which will filter out outliers in c.
+ *
+ * Think of a graph of AP time(b) on the y axis vs EC time(c) on the x axis.
+ * The slope of the line won't be exactly 1, there will be some clock drift
+ * between the 2 chips for various reasons (mechanical stress, temperature,
+ * voltage). We need to extrapolate values for a future x, without trusting
+ * recent y values too much.
+ *
+ * We use a median filter for the slope, then another median filter for the
+ * y-intercept to calculate this function:
+ * dx[n] = x[n-1] - x[n]
+ * dy[n] = x[n-1] - x[n]
+ * m[n] = dy[n] / dx[n]
+ * median_m = median(m[n-k:n])
+ * error[i] = y[n-i] - median_m * x[n-i]
+ * median_error = median(error[:k])
+ * predicted_y = median_m * x + median_error
+ *
+ * Implementation differences from above:
+ * - Redefined y to be actually c - b, this gives us a lot more precision
+ * to do the math. (c-b)/b variations are more obvious than c/b variations.
+ * - Since we don't have floating point, any operations involving slope are
+ * done using fixed point math (*M_PRECISION)
+ * - Since x and y grow with time, we keep zeroing the graph (relative to
+ * the last sample), this way math involving *x[n-i] will not overflow
+ * - EC timestamps are kept in us, it improves the slope calculation precision
+ */
+
+/**
+ * cros_ec_sensor_ring_ts_filter_update() - Update filter history.
+ *
+ * @state: Filter information.
+ * @b: IRQ timestamp, EC timebase (us)
+ * @c: IRQ timestamp, AP timebase (ns)
+ *
+ * Given a new IRQ timestamp pair (EC and AP timebases), add it to the filter
+ * history.
+ */
+static void
+cros_ec_sensor_ring_ts_filter_update(struct cros_ec_sensors_ts_filter_state
+ *state,
+ s64 b, s64 c)
+{
+ s64 x, y;
+ s64 dx, dy;
+ s64 m; /* stored as *M_PRECISION */
+ s64 *m_history_copy = state->temp_buf;
+ s64 *error = state->temp_buf;
+ int i;
+
+ /* we trust b the most, that'll be our independent variable */
+ x = b;
+ /* y is the offset between AP and EC times, in ns */
+ y = c - b * 1000;
+
+ dx = (state->x_history[0] + state->x_offset) - x;
+ if (dx == 0)
+ return; /* we already have this irq in the history */
+ dy = (state->y_history[0] + state->y_offset) - y;
+ m = div64_s64(dy * M_PRECISION, dx);
+
+ /* Empty filter if we haven't seen any action in a while. */
+ if (-dx > TS_HISTORY_BORED_US)
+ state->history_len = 0;
+
+ /* Move everything over, also update offset to all absolute coords .*/
+ for (i = state->history_len - 1; i >= 1; i--) {
+ state->x_history[i] = state->x_history[i - 1] + dx;
+ state->y_history[i] = state->y_history[i - 1] + dy;
+
+ state->m_history[i] = state->m_history[i - 1];
+ /*
+ * Also use the same loop to copy m_history for future
+ * median extraction.
+ */
+ m_history_copy[i] = state->m_history[i - 1];
+ }
+
+ /* Store the x and y, but remember offset is actually last sample. */
+ state->x_offset = x;
+ state->y_offset = y;
+ state->x_history[0] = 0;
+ state->y_history[0] = 0;
+
+ state->m_history[0] = m;
+ m_history_copy[0] = m;
+
+ if (state->history_len < CROS_EC_SENSORHUB_TS_HISTORY_SIZE)
+ state->history_len++;
+
+ /* Precalculate things for the filter. */
+ if (state->history_len > TS_HISTORY_THRESHOLD) {
+ state->median_m =
+ cros_ec_sensor_ring_median(m_history_copy,
+ state->history_len - 1);
+
+ /*
+ * Calculate y-intercepts as if m_median is the slope and
+ * points in the history are on the line. median_error will
+ * still be in the offset coordinate system.
+ */
+ for (i = 0; i < state->history_len; i++)
+ error[i] = state->y_history[i] -
+ div_s64(state->median_m * state->x_history[i],
+ M_PRECISION);
+ state->median_error =
+ cros_ec_sensor_ring_median(error, state->history_len);
+ } else {
+ state->median_m = 0;
+ state->median_error = 0;
+ }
+}
+
+/**
+ * cros_ec_sensor_ring_ts_filter() - Translate EC timebase timestamp to AP
+ * timebase
+ *
+ * @state: filter information.
+ * @x: any ec timestamp (us):
+ *
+ * cros_ec_sensor_ring_ts_filter(a) => a' event timestamp, AP timebase
+ * cros_ec_sensor_ring_ts_filter(b) => calculated timestamp when the EC IRQ
+ * should have happened on the AP, with low jitter
+ *
+ * Note: The filter will only activate once state->history_len goes
+ * over TS_HISTORY_THRESHOLD. Otherwise it'll just do the naive c - b + a
+ * transform.
+ *
+ * How to derive the formula, starting from:
+ * f(x) = median_m * x + median_error
+ * That's the calculated AP - EC offset (at the x point in time)
+ * Undo the coordinate system transform:
+ * f(x) = median_m * (x - x_offset) + median_error + y_offset
+ * Remember to undo the "y = c - b * 1000" modification:
+ * f(x) = median_m * (x - x_offset) + median_error + y_offset + x * 1000
+ *
+ * Return: timestamp in AP timebase (ns)
+ */
+static s64
+cros_ec_sensor_ring_ts_filter(struct cros_ec_sensors_ts_filter_state *state,
+ s64 x)
+{
+ return div_s64(state->median_m * (x - state->x_offset), M_PRECISION)
+ + state->median_error + state->y_offset + x * 1000;
+}
+
+/*
+ * Since a and b were originally 32 bit values from the EC,
+ * they overflow relatively often, casting is not enough, so we need to
+ * add an offset.
+ */
+static void
+cros_ec_sensor_ring_fix_overflow(s64 *ts,
+ const s64 overflow_period,
+ struct cros_ec_sensors_ec_overflow_state
+ *state)
+{
+ s64 adjust;
+
+ *ts += state->offset;
+ if (abs(state->last - *ts) > (overflow_period / 2)) {
+ adjust = state->last > *ts ? overflow_period : -overflow_period;
+ state->offset += adjust;
+ *ts += adjust;
+ }
+ state->last = *ts;
+}
+
+static void
+cros_ec_sensor_ring_check_for_past_timestamp(struct cros_ec_sensorhub
+ *sensorhub,
+ struct cros_ec_sensors_ring_sample
+ *sample)
+{
+ const u8 sensor_id = sample->sensor_id;
+
+ /* If this event is earlier than one we saw before... */
+ if (sensorhub->batch_state[sensor_id].newest_sensor_event >
+ sample->timestamp)
+ /* mark it for spreading. */
+ sample->timestamp =
+ sensorhub->batch_state[sensor_id].last_ts;
+ else
+ sensorhub->batch_state[sensor_id].newest_sensor_event =
+ sample->timestamp;
+}
+
+/**
+ * cros_ec_sensor_ring_process_event() - Process one EC FIFO event
+ *
+ * @sensorhub: Sensor Hub object.
+ * @fifo_info: FIFO information from the EC (includes b point, EC timebase).
+ * @fifo_timestamp: EC IRQ, kernel timebase (aka c).
+ * @current_timestamp: calculated event timestamp, kernel timebase (aka a').
+ * @in: incoming FIFO event from EC (includes a point, EC timebase).
+ * @out: outgoing event to user space (includes a').
+ *
+ * Process one EC event, add it in the ring if necessary.
+ *
+ * Return: true if out event has been populated.
+ */
+static bool
+cros_ec_sensor_ring_process_event(struct cros_ec_sensorhub *sensorhub,
+ const struct ec_response_motion_sense_fifo_info
+ *fifo_info,
+ const ktime_t fifo_timestamp,
+ ktime_t *current_timestamp,
+ struct ec_response_motion_sensor_data *in,
+ struct cros_ec_sensors_ring_sample *out)
+{
+ const s64 now = cros_ec_get_time_ns();
+ int axis, async_flags;
+
+ /* Do not populate the filter based on asynchronous events. */
+ async_flags = in->flags &
+ (MOTIONSENSE_SENSOR_FLAG_ODR | MOTIONSENSE_SENSOR_FLAG_FLUSH);
+
+ if (in->flags & MOTIONSENSE_SENSOR_FLAG_TIMESTAMP && !async_flags) {
+ s64 a = in->timestamp;
+ s64 b = fifo_info->timestamp;
+ s64 c = fifo_timestamp;
+
+ cros_ec_sensor_ring_fix_overflow(&a, 1LL << 32,
+ &sensorhub->overflow_a);
+ cros_ec_sensor_ring_fix_overflow(&b, 1LL << 32,
+ &sensorhub->overflow_b);
+
+ if (sensorhub->tight_timestamps) {
+ cros_ec_sensor_ring_ts_filter_update(
+ &sensorhub->filter, b, c);
+ *current_timestamp = cros_ec_sensor_ring_ts_filter(
+ &sensorhub->filter, a);
+ } else {
+ s64 new_timestamp;
+
+ /*
+ * Disable filtering since we might add more jitter
+ * if b is in a random point in time.
+ */
+ new_timestamp = fifo_timestamp -
+ fifo_info->timestamp * 1000 +
+ in->timestamp * 1000;
+ /*
+ * The timestamp can be stale if we had to use the fifo
+ * info timestamp.
+ */
+ if (new_timestamp - *current_timestamp > 0)
+ *current_timestamp = new_timestamp;
+ }
+ }
+
+ if (in->flags & MOTIONSENSE_SENSOR_FLAG_ODR) {
+ if (sensorhub->tight_timestamps) {
+ sensorhub->batch_state[in->sensor_num].last_len = 0;
+ sensorhub->batch_state[in->sensor_num].penul_len = 0;
+ }
+ /*
+ * ODR change is only useful for the sensor_ring, it does not
+ * convey information to clients.
+ */
+ return false;
+ }
+
+ if (in->flags & MOTIONSENSE_SENSOR_FLAG_FLUSH) {
+ out->sensor_id = in->sensor_num;
+ out->timestamp = *current_timestamp;
+ out->flag = in->flags;
+ if (sensorhub->tight_timestamps)
+ sensorhub->batch_state[out->sensor_id].last_len = 0;
+ /*
+ * No other payload information provided with
+ * flush ack.
+ */
+ return true;
+ }
+
+ if (in->flags & MOTIONSENSE_SENSOR_FLAG_TIMESTAMP)
+ /* If we just have a timestamp, skip this entry. */
+ return false;
+
+ /* Regular sample */
+ out->sensor_id = in->sensor_num;
+ if (*current_timestamp - now > 0) {
+ /*
+ * This fix is needed to overcome the timestamp filter putting
+ * events in the future.
+ */
+ sensorhub->future_timestamp_total_ns +=
+ *current_timestamp - now;
+ if (++sensorhub->future_timestamp_count ==
+ FUTURE_TS_ANALYTICS_COUNT_MAX) {
+ s64 avg = div_s64(sensorhub->future_timestamp_total_ns,
+ sensorhub->future_timestamp_count);
+ dev_warn_ratelimited(sensorhub->dev,
+ "100 timestamps in the future, %lldns shaved on average\n",
+ avg);
+ sensorhub->future_timestamp_count = 0;
+ sensorhub->future_timestamp_total_ns = 0;
+ }
+ out->timestamp = now;
+ } else {
+ out->timestamp = *current_timestamp;
+ }
+
+ out->flag = in->flags;
+ for (axis = 0; axis < 3; axis++)
+ out->vector[axis] = in->data[axis];
+
+ if (sensorhub->tight_timestamps)
+ cros_ec_sensor_ring_check_for_past_timestamp(sensorhub, out);
+ return true;
+}
+
+/*
+ * cros_ec_sensor_ring_spread_add: Calculate proper timestamps then add to
+ * ringbuffer.
+ *
+ * This is the new spreading code, assumes every sample's timestamp
+ * preceeds the sample. Run if tight_timestamps == true.
+ *
+ * Sometimes the EC receives only one interrupt (hence timestamp) for
+ * a batch of samples. Only the first sample will have the correct
+ * timestamp. So we must interpolate the other samples.
+ * We use the previous batch timestamp and our current batch timestamp
+ * as a way to calculate period, then spread the samples evenly.
+ *
+ * s0 int, 0ms
+ * s1 int, 10ms
+ * s2 int, 20ms
+ * 30ms point goes by, no interrupt, previous one is still asserted
+ * downloading s2 and s3
+ * s3 sample, 20ms (incorrect timestamp)
+ * s4 int, 40ms
+ *
+ * The batches are [(s0), (s1), (s2, s3), (s4)]. Since the 3rd batch
+ * has 2 samples in them, we adjust the timestamp of s3.
+ * s2 - s1 = 10ms, so s3 must be s2 + 10ms => 20ms. If s1 would have
+ * been part of a bigger batch things would have gotten a little
+ * more complicated.
+ *
+ * Note: we also assume another sensor sample doesn't break up a batch
+ * in 2 or more partitions. Example, there can't ever be a sync sensor
+ * in between S2 and S3. This simplifies the following code.
+ */
+static void
+cros_ec_sensor_ring_spread_add(struct cros_ec_sensorhub *sensorhub,
+ unsigned long sensor_mask,
+ struct cros_ec_sensors_ring_sample *last_out)
+{
+ struct cros_ec_sensors_ring_sample *batch_start, *next_batch_start;
+ int id;
+
+ for_each_set_bit(id, &sensor_mask, sensorhub->sensor_num) {
+ for (batch_start = sensorhub->ring; batch_start < last_out;
+ batch_start = next_batch_start) {
+ /*
+ * For each batch (where all samples have the same
+ * timestamp).
+ */
+ int batch_len, sample_idx;
+ struct cros_ec_sensors_ring_sample *batch_end =
+ batch_start;
+ struct cros_ec_sensors_ring_sample *s;
+ s64 batch_timestamp = batch_start->timestamp;
+ s64 sample_period;
+
+ /*
+ * Skip over batches that start with the sensor types
+ * we're not looking at right now.
+ */
+ if (batch_start->sensor_id != id) {
+ next_batch_start = batch_start + 1;
+ continue;
+ }
+
+ /*
+ * Do not start a batch
+ * from a flush, as it happens asynchronously to the
+ * regular flow of events.
+ */
+ if (batch_start->flag & MOTIONSENSE_SENSOR_FLAG_FLUSH) {
+ cros_sensorhub_send_sample(sensorhub,
+ batch_start);
+ next_batch_start = batch_start + 1;
+ continue;
+ }
+
+ if (batch_start->timestamp <=
+ sensorhub->batch_state[id].last_ts) {
+ batch_timestamp =
+ sensorhub->batch_state[id].last_ts;
+ batch_len = sensorhub->batch_state[id].last_len;
+
+ sample_idx = batch_len;
+
+ sensorhub->batch_state[id].last_ts =
+ sensorhub->batch_state[id].penul_ts;
+ sensorhub->batch_state[id].last_len =
+ sensorhub->batch_state[id].penul_len;
+ } else {
+ /*
+ * Push first sample in the batch to the,
+ * kifo, it's guaranteed to be correct, the
+ * rest will follow later on.
+ */
+ sample_idx = 1;
+ batch_len = 1;
+ cros_sensorhub_send_sample(sensorhub,
+ batch_start);
+ batch_start++;
+ }
+
+ /* Find all samples have the same timestamp. */
+ for (s = batch_start; s < last_out; s++) {
+ if (s->sensor_id != id)
+ /*
+ * Skip over other sensor types that
+ * are interleaved, don't count them.
+ */
+ continue;
+ if (s->timestamp != batch_timestamp)
+ /* we discovered the next batch */
+ break;
+ if (s->flag & MOTIONSENSE_SENSOR_FLAG_FLUSH)
+ /* break on flush packets */
+ break;
+ batch_end = s;
+ batch_len++;
+ }
+
+ if (batch_len == 1)
+ goto done_with_this_batch;
+
+ /* Can we calculate period? */
+ if (sensorhub->batch_state[id].last_len == 0) {
+ dev_warn(sensorhub->dev, "Sensor %d: lost %d samples when spreading\n",
+ id, batch_len - 1);
+ goto done_with_this_batch;
+ /*
+ * Note: we're dropping the rest of the samples
+ * in this batch since we have no idea where
+ * they're supposed to go without a period
+ * calculation.
+ */
+ }
+
+ sample_period = div_s64(batch_timestamp -
+ sensorhub->batch_state[id].last_ts,
+ sensorhub->batch_state[id].last_len);
+ dev_dbg(sensorhub->dev,
+ "Adjusting %d samples, sensor %d last_batch @%lld (%d samples) batch_timestamp=%lld => period=%lld\n",
+ batch_len, id,
+ sensorhub->batch_state[id].last_ts,
+ sensorhub->batch_state[id].last_len,
+ batch_timestamp,
+ sample_period);
+
+ /*
+ * Adjust timestamps of the samples then push them to
+ * kfifo.
+ */
+ for (s = batch_start; s <= batch_end; s++) {
+ if (s->sensor_id != id)
+ /*
+ * Skip over other sensor types that
+ * are interleaved, don't change them.
+ */
+ continue;
+
+ s->timestamp = batch_timestamp +
+ sample_period * sample_idx;
+ sample_idx++;
+
+ cros_sensorhub_send_sample(sensorhub, s);
+ }
+
+done_with_this_batch:
+ sensorhub->batch_state[id].penul_ts =
+ sensorhub->batch_state[id].last_ts;
+ sensorhub->batch_state[id].penul_len =
+ sensorhub->batch_state[id].last_len;
+
+ sensorhub->batch_state[id].last_ts =
+ batch_timestamp;
+ sensorhub->batch_state[id].last_len = batch_len;
+
+ next_batch_start = batch_end + 1;
+ }
+ }
+}
+
+/*
+ * cros_ec_sensor_ring_spread_add_legacy: Calculate proper timestamps then
+ * add to ringbuffer (legacy).
+ *
+ * Note: This assumes we're running old firmware, where every sample's timestamp
+ * is after the sample. Run if tight_timestamps == false.
+ *
+ * If there is a sample with a proper timestamp
+ *
+ * timestamp | count
+ * -----------------
+ * older_unprocess_out --> TS1 | 1
+ * TS1 | 2
+ * out --> TS1 | 3
+ * next_out --> TS2 |
+ *
+ * We spread time for the samples [older_unprocess_out .. out]
+ * between TS1 and TS2: [TS1+1/4, TS1+2/4, TS1+3/4, TS2].
+ *
+ * If we reach the end of the samples, we compare with the
+ * current timestamp:
+ *
+ * older_unprocess_out --> TS1 | 1
+ * TS1 | 2
+ * out --> TS1 | 3
+ *
+ * We know have [TS1+1/3, TS1+2/3, current timestamp]
+ */
+static void
+cros_ec_sensor_ring_spread_add_legacy(struct cros_ec_sensorhub *sensorhub,
+ unsigned long sensor_mask,
+ s64 current_timestamp,
+ struct cros_ec_sensors_ring_sample
+ *last_out)
+{
+ struct cros_ec_sensors_ring_sample *out;
+ int i;
+
+ for_each_set_bit(i, &sensor_mask, sensorhub->sensor_num) {
+ s64 older_timestamp;
+ s64 timestamp;
+ struct cros_ec_sensors_ring_sample *older_unprocess_out =
+ sensorhub->ring;
+ struct cros_ec_sensors_ring_sample *next_out;
+ int count = 1;
+
+ for (out = sensorhub->ring; out < last_out; out = next_out) {
+ s64 time_period;
+
+ next_out = out + 1;
+ if (out->sensor_id != i)
+ continue;
+
+ /* Timestamp to start with */
+ older_timestamp = out->timestamp;
+
+ /* Find next sample. */
+ while (next_out < last_out && next_out->sensor_id != i)
+ next_out++;
+
+ if (next_out >= last_out) {
+ timestamp = current_timestamp;
+ } else {
+ timestamp = next_out->timestamp;
+ if (timestamp == older_timestamp) {
+ count++;
+ continue;
+ }
+ }
+
+ /*
+ * The next sample has a new timestamp, spread the
+ * unprocessed samples.
+ */
+ if (next_out < last_out)
+ count++;
+ time_period = div_s64(timestamp - older_timestamp,
+ count);
+
+ for (; older_unprocess_out <= out;
+ older_unprocess_out++) {
+ if (older_unprocess_out->sensor_id != i)
+ continue;
+ older_timestamp += time_period;
+ older_unprocess_out->timestamp =
+ older_timestamp;
+ }
+ count = 1;
+ /* The next_out sample has a valid timestamp, skip. */
+ next_out++;
+ older_unprocess_out = next_out;
+ }
+ }
+
+ /* Push the event into the kfifo */
+ for (out = sensorhub->ring; out < last_out; out++)
+ cros_sensorhub_send_sample(sensorhub, out);
+}
+
+/**
+ * cros_ec_sensorhub_ring_handler() - The trigger handler function
+ *
+ * @sensorhub: Sensor Hub object.
+ *
+ * Called by the notifier, process the EC sensor FIFO queue.
+ */
+static void cros_ec_sensorhub_ring_handler(struct cros_ec_sensorhub *sensorhub)
+{
+ struct ec_response_motion_sense_fifo_info *fifo_info =
+ sensorhub->fifo_info;
+ struct cros_ec_dev *ec = sensorhub->ec;
+ ktime_t fifo_timestamp, current_timestamp;
+ int i, j, number_data, ret;
+ unsigned long sensor_mask = 0;
+ struct ec_response_motion_sensor_data *in;
+ struct cros_ec_sensors_ring_sample *out, *last_out;
+
+ mutex_lock(&sensorhub->cmd_lock);
+
+ /* Get FIFO information if there are lost vectors. */
+ if (fifo_info->total_lost) {
+ int fifo_info_length =
+ sizeof(struct ec_response_motion_sense_fifo_info) +
+ sizeof(u16) * sensorhub->sensor_num;
+
+ /* Need to retrieve the number of lost vectors per sensor */
+ sensorhub->params->cmd = MOTIONSENSE_CMD_FIFO_INFO;
+ sensorhub->msg->outsize = 1;
+ sensorhub->msg->insize = fifo_info_length;
+
+ if (cros_ec_cmd_xfer_status(ec->ec_dev, sensorhub->msg) < 0)
+ goto error;
+
+ memcpy(fifo_info, &sensorhub->resp->fifo_info,
+ fifo_info_length);
+
+ /*
+ * Update collection time, will not be as precise as the
+ * non-error case.
+ */
+ fifo_timestamp = cros_ec_get_time_ns();
+ } else {
+ fifo_timestamp = sensorhub->fifo_timestamp[
+ CROS_EC_SENSOR_NEW_TS];
+ }
+
+ if (fifo_info->count > sensorhub->fifo_size ||
+ fifo_info->size != sensorhub->fifo_size) {
+ dev_warn(sensorhub->dev,
+ "Mismatch EC data: count %d, size %d - expected %d",
+ fifo_info->count, fifo_info->size,
+ sensorhub->fifo_size);
+ goto error;
+ }
+
+ /* Copy elements in the main fifo */
+ current_timestamp = sensorhub->fifo_timestamp[CROS_EC_SENSOR_LAST_TS];
+ out = sensorhub->ring;
+ for (i = 0; i < fifo_info->count; i += number_data) {
+ sensorhub->params->cmd = MOTIONSENSE_CMD_FIFO_READ;
+ sensorhub->params->fifo_read.max_data_vector =
+ fifo_info->count - i;
+ sensorhub->msg->outsize =
+ sizeof(struct ec_params_motion_sense);
+ sensorhub->msg->insize =
+ sizeof(sensorhub->resp->fifo_read) +
+ sensorhub->params->fifo_read.max_data_vector *
+ sizeof(struct ec_response_motion_sensor_data);
+ ret = cros_ec_cmd_xfer_status(ec->ec_dev, sensorhub->msg);
+ if (ret < 0) {
+ dev_warn(sensorhub->dev, "Fifo error: %d\n", ret);
+ break;
+ }
+ number_data = sensorhub->resp->fifo_read.number_data;
+ if (number_data == 0) {
+ dev_dbg(sensorhub->dev, "Unexpected empty FIFO\n");
+ break;
+ }
+ if (number_data > fifo_info->count - i) {
+ dev_warn(sensorhub->dev,
+ "Invalid EC data: too many entry received: %d, expected %d",
+ number_data, fifo_info->count - i);
+ break;
+ }
+ if (out + number_data >
+ sensorhub->ring + fifo_info->count) {
+ dev_warn(sensorhub->dev,
+ "Too many samples: %d (%zd data) to %d entries for expected %d entries",
+ i, out - sensorhub->ring, i + number_data,
+ fifo_info->count);
+ break;
+ }
+
+ for (in = sensorhub->resp->fifo_read.data, j = 0;
+ j < number_data; j++, in++) {
+ if (cros_ec_sensor_ring_process_event(
+ sensorhub, fifo_info,
+ fifo_timestamp,
+ &current_timestamp,
+ in, out)) {
+ sensor_mask |= BIT(in->sensor_num);
+ out++;
+ }
+ }
+ }
+ mutex_unlock(&sensorhub->cmd_lock);
+ last_out = out;
+
+ if (out == sensorhub->ring)
+ /* Unexpected empty FIFO. */
+ goto ring_handler_end;
+
+ /*
+ * Check if current_timestamp is ahead of the last sample. Normally,
+ * the EC appends a timestamp after the last sample, but if the AP
+ * is slow to respond to the IRQ, the EC may have added new samples.
+ * Use the FIFO info timestamp as last timestamp then.
+ */
+ if (!sensorhub->tight_timestamps &&
+ (last_out - 1)->timestamp == current_timestamp)
+ current_timestamp = fifo_timestamp;
+
+ /* Warn on lost samples. */
+ if (fifo_info->total_lost)
+ for (i = 0; i < sensorhub->sensor_num; i++) {
+ if (fifo_info->lost[i]) {
+ dev_warn_ratelimited(sensorhub->dev,
+ "Sensor %d: lost: %d out of %d\n",
+ i, fifo_info->lost[i],
+ fifo_info->total_lost);
+ if (sensorhub->tight_timestamps)
+ sensorhub->batch_state[i].last_len = 0;
+ }
+ }
+
+ /*
+ * Spread samples in case of batching, then add them to the
+ * ringbuffer.
+ */
+ if (sensorhub->tight_timestamps)
+ cros_ec_sensor_ring_spread_add(sensorhub, sensor_mask,
+ last_out);
+ else
+ cros_ec_sensor_ring_spread_add_legacy(sensorhub, sensor_mask,
+ current_timestamp,
+ last_out);
+
+ring_handler_end:
+ sensorhub->fifo_timestamp[CROS_EC_SENSOR_LAST_TS] = current_timestamp;
+ return;
+
+error:
+ mutex_unlock(&sensorhub->cmd_lock);
+}
+
+static int cros_ec_sensorhub_event(struct notifier_block *nb,
+ unsigned long queued_during_suspend,
+ void *_notify)
+{
+ struct cros_ec_sensorhub *sensorhub;
+ struct cros_ec_device *ec_dev;
+
+ sensorhub = container_of(nb, struct cros_ec_sensorhub, notifier);
+ ec_dev = sensorhub->ec->ec_dev;
+
+ if (ec_dev->event_data.event_type != EC_MKBP_EVENT_SENSOR_FIFO)
+ return NOTIFY_DONE;
+
+ if (ec_dev->event_size != sizeof(ec_dev->event_data.data.sensor_fifo)) {
+ dev_warn(ec_dev->dev, "Invalid fifo info size\n");
+ return NOTIFY_DONE;
+ }
+
+ if (queued_during_suspend)
+ return NOTIFY_OK;
+
+ memcpy(sensorhub->fifo_info, &ec_dev->event_data.data.sensor_fifo.info,
+ sizeof(*sensorhub->fifo_info));
+ sensorhub->fifo_timestamp[CROS_EC_SENSOR_NEW_TS] =
+ ec_dev->last_event_time;
+ cros_ec_sensorhub_ring_handler(sensorhub);
+
+ return NOTIFY_OK;
+}
+
+/**
+ * cros_ec_sensorhub_ring_add() - Add the FIFO functionality if the EC
+ * supports it.
+ *
+ * @sensorhub : Sensor Hub object.
+ *
+ * Return: 0 on success.
+ */
+int cros_ec_sensorhub_ring_add(struct cros_ec_sensorhub *sensorhub)
+{
+ struct cros_ec_dev *ec = sensorhub->ec;
+ int ret;
+ int fifo_info_length =
+ sizeof(struct ec_response_motion_sense_fifo_info) +
+ sizeof(u16) * sensorhub->sensor_num;
+
+ /* Allocate the array for lost events. */
+ sensorhub->fifo_info = devm_kzalloc(sensorhub->dev, fifo_info_length,
+ GFP_KERNEL);
+ if (!sensorhub->fifo_info)
+ return -ENOMEM;
+
+ /* Retrieve FIFO information */
+ sensorhub->msg->version = 2;
+ sensorhub->params->cmd = MOTIONSENSE_CMD_FIFO_INFO;
+ sensorhub->msg->outsize = 1;
+ sensorhub->msg->insize = fifo_info_length;
+
+ ret = cros_ec_cmd_xfer_status(ec->ec_dev, sensorhub->msg);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * Allocate the full fifo. We need to copy the whole FIFO to set
+ * timestamps properly.
+ */
+ sensorhub->fifo_size = sensorhub->resp->fifo_info.size;
+ sensorhub->ring = devm_kcalloc(sensorhub->dev, sensorhub->fifo_size,
+ sizeof(*sensorhub->ring), GFP_KERNEL);
+ if (!sensorhub->ring)
+ return -ENOMEM;
+
+ /*
+ * Allocate the callback area based on the number of sensors.
+ */
+ sensorhub->push_data = devm_kcalloc(
+ sensorhub->dev, sensorhub->sensor_num,
+ sizeof(*sensorhub->push_data),
+ GFP_KERNEL);
+ if (!sensorhub->push_data)
+ return -ENOMEM;
+
+ sensorhub->fifo_timestamp[CROS_EC_SENSOR_LAST_TS] =
+ cros_ec_get_time_ns();
+
+ sensorhub->tight_timestamps = cros_ec_check_features(
+ ec, EC_FEATURE_MOTION_SENSE_TIGHT_TIMESTAMPS);
+
+ if (sensorhub->tight_timestamps) {
+ sensorhub->batch_state = devm_kcalloc(sensorhub->dev,
+ sensorhub->sensor_num,
+ sizeof(*sensorhub->batch_state),
+ GFP_KERNEL);
+ if (!sensorhub->batch_state)
+ return -ENOMEM;
+ }
+
+ /* Register the notifier that will act as a top half interrupt. */
+ sensorhub->notifier.notifier_call = cros_ec_sensorhub_event;
+ ret = blocking_notifier_chain_register(&ec->ec_dev->event_notifier,
+ &sensorhub->notifier);
+ if (ret < 0)
+ return ret;
+
+ /* Start collection samples. */
+ return cros_ec_sensorhub_ring_fifo_enable(sensorhub, true);
+}
+
+void cros_ec_sensorhub_ring_remove(void *arg)
+{
+ struct cros_ec_sensorhub *sensorhub = arg;
+ struct cros_ec_device *ec_dev = sensorhub->ec->ec_dev;
+
+ /* Disable the ring, prevent EC interrupt to the AP for nothing. */
+ cros_ec_sensorhub_ring_fifo_enable(sensorhub, false);
+ blocking_notifier_chain_unregister(&ec_dev->event_notifier,
+ &sensorhub->notifier);
+}
diff --git a/drivers/platform/chrome/cros_ec_spi.c b/drivers/platform/chrome/cros_ec_spi.c
index 46786d2d679a..debea5c4c829 100644
--- a/drivers/platform/chrome/cros_ec_spi.c
+++ b/drivers/platform/chrome/cros_ec_spi.c
@@ -127,7 +127,8 @@ static int terminate_request(struct cros_ec_device *ec_dev)
*/
spi_message_init(&msg);
memset(&trans, 0, sizeof(trans));
- trans.delay_usecs = ec_spi->end_of_msg_delay;
+ trans.delay.value = ec_spi->end_of_msg_delay;
+ trans.delay.unit = SPI_DELAY_UNIT_USECS;
spi_message_add_tail(&trans, &msg);
ret = spi_sync_locked(ec_spi->spi, &msg);
@@ -416,7 +417,8 @@ static int do_cros_ec_pkt_xfer_spi(struct cros_ec_device *ec_dev,
spi_message_init(&msg);
if (ec_spi->start_of_msg_delay) {
memset(&trans_delay, 0, sizeof(trans_delay));
- trans_delay.delay_usecs = ec_spi->start_of_msg_delay;
+ trans_delay.delay.value = ec_spi->start_of_msg_delay;
+ trans_delay.delay.unit = SPI_DELAY_UNIT_USECS;
spi_message_add_tail(&trans_delay, &msg);
}
diff --git a/drivers/platform/chrome/cros_ec_sysfs.c b/drivers/platform/chrome/cros_ec_sysfs.c
index 07dac97ad57c..d45ea5d5bfa4 100644
--- a/drivers/platform/chrome/cros_ec_sysfs.c
+++ b/drivers/platform/chrome/cros_ec_sysfs.c
@@ -149,14 +149,14 @@ static ssize_t version_show(struct device *dev,
/* Get build info. */
msg->command = EC_CMD_GET_BUILD_INFO + ec->cmd_offset;
msg->insize = EC_HOST_PARAM_SIZE;
- ret = cros_ec_cmd_xfer(ec->ec_dev, msg);
- if (ret < 0)
- count += scnprintf(buf + count, PAGE_SIZE - count,
- "Build info: XFER ERROR %d\n", ret);
- else if (msg->result != EC_RES_SUCCESS)
+ ret = cros_ec_cmd_xfer_status(ec->ec_dev, msg);
+ if (ret == -EPROTO) {
count += scnprintf(buf + count, PAGE_SIZE - count,
"Build info: EC error %d\n", msg->result);
- else {
+ } else if (ret < 0) {
+ count += scnprintf(buf + count, PAGE_SIZE - count,
+ "Build info: XFER ERROR %d\n", ret);
+ } else {
msg->data[EC_HOST_PARAM_SIZE - 1] = '\0';
count += scnprintf(buf + count, PAGE_SIZE - count,
"Build info: %s\n", msg->data);
@@ -165,14 +165,14 @@ static ssize_t version_show(struct device *dev,
/* Get chip info. */
msg->command = EC_CMD_GET_CHIP_INFO + ec->cmd_offset;
msg->insize = sizeof(*r_chip);
- ret = cros_ec_cmd_xfer(ec->ec_dev, msg);
- if (ret < 0)
- count += scnprintf(buf + count, PAGE_SIZE - count,
- "Chip info: XFER ERROR %d\n", ret);
- else if (msg->result != EC_RES_SUCCESS)
+ ret = cros_ec_cmd_xfer_status(ec->ec_dev, msg);
+ if (ret == -EPROTO) {
count += scnprintf(buf + count, PAGE_SIZE - count,
"Chip info: EC error %d\n", msg->result);
- else {
+ } else if (ret < 0) {
+ count += scnprintf(buf + count, PAGE_SIZE - count,
+ "Chip info: XFER ERROR %d\n", ret);
+ } else {
r_chip = (struct ec_response_get_chip_info *)msg->data;
r_chip->vendor[sizeof(r_chip->vendor) - 1] = '\0';
@@ -189,14 +189,14 @@ static ssize_t version_show(struct device *dev,
/* Get board version */
msg->command = EC_CMD_GET_BOARD_VERSION + ec->cmd_offset;
msg->insize = sizeof(*r_board);
- ret = cros_ec_cmd_xfer(ec->ec_dev, msg);
- if (ret < 0)
- count += scnprintf(buf + count, PAGE_SIZE - count,
- "Board version: XFER ERROR %d\n", ret);
- else if (msg->result != EC_RES_SUCCESS)
+ ret = cros_ec_cmd_xfer_status(ec->ec_dev, msg);
+ if (ret == -EPROTO) {
count += scnprintf(buf + count, PAGE_SIZE - count,
"Board version: EC error %d\n", msg->result);
- else {
+ } else if (ret < 0) {
+ count += scnprintf(buf + count, PAGE_SIZE - count,
+ "Board version: XFER ERROR %d\n", ret);
+ } else {
r_board = (struct ec_response_board_version *)msg->data;
count += scnprintf(buf + count, PAGE_SIZE - count,
diff --git a/drivers/platform/chrome/cros_ec_typec.c b/drivers/platform/chrome/cros_ec_typec.c
new file mode 100644
index 000000000000..874269c07073
--- /dev/null
+++ b/drivers/platform/chrome/cros_ec_typec.c
@@ -0,0 +1,357 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2020 Google LLC
+ *
+ * This driver provides the ability to view and manage Type C ports through the
+ * Chrome OS EC.
+ */
+
+#include <linux/acpi.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_data/cros_ec_commands.h>
+#include <linux/platform_data/cros_ec_proto.h>
+#include <linux/platform_device.h>
+#include <linux/usb/typec.h>
+
+#define DRV_NAME "cros-ec-typec"
+
+/* Platform-specific data for the Chrome OS EC Type C controller. */
+struct cros_typec_data {
+ struct device *dev;
+ struct cros_ec_device *ec;
+ int num_ports;
+ unsigned int cmd_ver;
+ /* Array of ports, indexed by port number. */
+ struct typec_port *ports[EC_USB_PD_MAX_PORTS];
+ /* Initial capabilities for each port. */
+ struct typec_capability *caps[EC_USB_PD_MAX_PORTS];
+};
+
+static int cros_typec_parse_port_props(struct typec_capability *cap,
+ struct fwnode_handle *fwnode,
+ struct device *dev)
+{
+ const char *buf;
+ int ret;
+
+ memset(cap, 0, sizeof(*cap));
+ ret = fwnode_property_read_string(fwnode, "power-role", &buf);
+ if (ret) {
+ dev_err(dev, "power-role not found: %d\n", ret);
+ return ret;
+ }
+
+ ret = typec_find_port_power_role(buf);
+ if (ret < 0)
+ return ret;
+ cap->type = ret;
+
+ ret = fwnode_property_read_string(fwnode, "data-role", &buf);
+ if (ret) {
+ dev_err(dev, "data-role not found: %d\n", ret);
+ return ret;
+ }
+
+ ret = typec_find_port_data_role(buf);
+ if (ret < 0)
+ return ret;
+ cap->data = ret;
+
+ ret = fwnode_property_read_string(fwnode, "try-power-role", &buf);
+ if (ret) {
+ dev_err(dev, "try-power-role not found: %d\n", ret);
+ return ret;
+ }
+
+ ret = typec_find_power_role(buf);
+ if (ret < 0)
+ return ret;
+ cap->prefer_role = ret;
+
+ cap->fwnode = fwnode;
+
+ return 0;
+}
+
+static int cros_typec_init_ports(struct cros_typec_data *typec)
+{
+ struct device *dev = typec->dev;
+ struct typec_capability *cap;
+ struct fwnode_handle *fwnode;
+ const char *port_prop;
+ int ret;
+ int i;
+ int nports;
+ u32 port_num = 0;
+
+ nports = device_get_child_node_count(dev);
+ if (nports == 0) {
+ dev_err(dev, "No port entries found.\n");
+ return -ENODEV;
+ }
+
+ if (nports > typec->num_ports) {
+ dev_err(dev, "More ports listed than can be supported.\n");
+ return -EINVAL;
+ }
+
+ /* DT uses "reg" to specify port number. */
+ port_prop = dev->of_node ? "reg" : "port-number";
+ device_for_each_child_node(dev, fwnode) {
+ if (fwnode_property_read_u32(fwnode, port_prop, &port_num)) {
+ ret = -EINVAL;
+ dev_err(dev, "No port-number for port, aborting.\n");
+ goto unregister_ports;
+ }
+
+ if (port_num >= typec->num_ports) {
+ dev_err(dev, "Invalid port number.\n");
+ ret = -EINVAL;
+ goto unregister_ports;
+ }
+
+ dev_dbg(dev, "Registering port %d\n", port_num);
+
+ cap = devm_kzalloc(dev, sizeof(*cap), GFP_KERNEL);
+ if (!cap) {
+ ret = -ENOMEM;
+ goto unregister_ports;
+ }
+
+ typec->caps[port_num] = cap;
+
+ ret = cros_typec_parse_port_props(cap, fwnode, dev);
+ if (ret < 0)
+ goto unregister_ports;
+
+ typec->ports[port_num] = typec_register_port(dev, cap);
+ if (IS_ERR(typec->ports[port_num])) {
+ dev_err(dev, "Failed to register port %d\n", port_num);
+ ret = PTR_ERR(typec->ports[port_num]);
+ goto unregister_ports;
+ }
+ }
+
+ return 0;
+
+unregister_ports:
+ for (i = 0; i < typec->num_ports; i++)
+ typec_unregister_port(typec->ports[i]);
+ return ret;
+}
+
+static int cros_typec_ec_command(struct cros_typec_data *typec,
+ unsigned int version,
+ unsigned int command,
+ void *outdata,
+ unsigned int outsize,
+ void *indata,
+ unsigned int insize)
+{
+ struct cros_ec_command *msg;
+ int ret;
+
+ msg = kzalloc(sizeof(*msg) + max(outsize, insize), GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ msg->version = version;
+ msg->command = command;
+ msg->outsize = outsize;
+ msg->insize = insize;
+
+ if (outsize)
+ memcpy(msg->data, outdata, outsize);
+
+ ret = cros_ec_cmd_xfer_status(typec->ec, msg);
+ if (ret >= 0 && insize)
+ memcpy(indata, msg->data, insize);
+
+ kfree(msg);
+ return ret;
+}
+
+static void cros_typec_set_port_params_v0(struct cros_typec_data *typec,
+ int port_num, struct ec_response_usb_pd_control *resp)
+{
+ struct typec_port *port = typec->ports[port_num];
+ enum typec_orientation polarity;
+
+ if (!resp->enabled)
+ polarity = TYPEC_ORIENTATION_NONE;
+ else if (!resp->polarity)
+ polarity = TYPEC_ORIENTATION_NORMAL;
+ else
+ polarity = TYPEC_ORIENTATION_REVERSE;
+
+ typec_set_pwr_role(port, resp->role ? TYPEC_SOURCE : TYPEC_SINK);
+ typec_set_orientation(port, polarity);
+}
+
+static void cros_typec_set_port_params_v1(struct cros_typec_data *typec,
+ int port_num, struct ec_response_usb_pd_control_v1 *resp)
+{
+ struct typec_port *port = typec->ports[port_num];
+ enum typec_orientation polarity;
+
+ if (!(resp->enabled & PD_CTRL_RESP_ENABLED_CONNECTED))
+ polarity = TYPEC_ORIENTATION_NONE;
+ else if (!resp->polarity)
+ polarity = TYPEC_ORIENTATION_NORMAL;
+ else
+ polarity = TYPEC_ORIENTATION_REVERSE;
+ typec_set_orientation(port, polarity);
+ typec_set_data_role(port, resp->role & PD_CTRL_RESP_ROLE_DATA ?
+ TYPEC_HOST : TYPEC_DEVICE);
+ typec_set_pwr_role(port, resp->role & PD_CTRL_RESP_ROLE_POWER ?
+ TYPEC_SOURCE : TYPEC_SINK);
+ typec_set_vconn_role(port, resp->role & PD_CTRL_RESP_ROLE_VCONN ?
+ TYPEC_SOURCE : TYPEC_SINK);
+}
+
+static int cros_typec_port_update(struct cros_typec_data *typec, int port_num)
+{
+ struct ec_params_usb_pd_control req;
+ struct ec_response_usb_pd_control_v1 resp;
+ int ret;
+
+ if (port_num < 0 || port_num >= typec->num_ports) {
+ dev_err(typec->dev, "cannot get status for invalid port %d\n",
+ port_num);
+ return -EINVAL;
+ }
+
+ req.port = port_num;
+ req.role = USB_PD_CTRL_ROLE_NO_CHANGE;
+ req.mux = USB_PD_CTRL_MUX_NO_CHANGE;
+ req.swap = USB_PD_CTRL_SWAP_NONE;
+
+ ret = cros_typec_ec_command(typec, typec->cmd_ver,
+ EC_CMD_USB_PD_CONTROL, &req, sizeof(req),
+ &resp, sizeof(resp));
+ if (ret < 0)
+ return ret;
+
+ dev_dbg(typec->dev, "Enabled %d: 0x%hhx\n", port_num, resp.enabled);
+ dev_dbg(typec->dev, "Role %d: 0x%hhx\n", port_num, resp.role);
+ dev_dbg(typec->dev, "Polarity %d: 0x%hhx\n", port_num, resp.polarity);
+ dev_dbg(typec->dev, "State %d: %s\n", port_num, resp.state);
+
+ if (typec->cmd_ver == 1)
+ cros_typec_set_port_params_v1(typec, port_num, &resp);
+ else
+ cros_typec_set_port_params_v0(typec, port_num,
+ (struct ec_response_usb_pd_control *) &resp);
+
+ return 0;
+}
+
+static int cros_typec_get_cmd_version(struct cros_typec_data *typec)
+{
+ struct ec_params_get_cmd_versions_v1 req_v1;
+ struct ec_response_get_cmd_versions resp;
+ int ret;
+
+ /* We're interested in the PD control command version. */
+ req_v1.cmd = EC_CMD_USB_PD_CONTROL;
+ ret = cros_typec_ec_command(typec, 1, EC_CMD_GET_CMD_VERSIONS,
+ &req_v1, sizeof(req_v1), &resp,
+ sizeof(resp));
+ if (ret < 0)
+ return ret;
+
+ if (resp.version_mask & EC_VER_MASK(1))
+ typec->cmd_ver = 1;
+ else
+ typec->cmd_ver = 0;
+
+ dev_dbg(typec->dev, "PD Control has version mask 0x%hhx\n",
+ typec->cmd_ver);
+
+ return 0;
+}
+
+#ifdef CONFIG_ACPI
+static const struct acpi_device_id cros_typec_acpi_id[] = {
+ { "GOOG0014", 0 },
+ {}
+};
+MODULE_DEVICE_TABLE(acpi, cros_typec_acpi_id);
+#endif
+
+#ifdef CONFIG_OF
+static const struct of_device_id cros_typec_of_match[] = {
+ { .compatible = "google,cros-ec-typec", },
+ {}
+};
+MODULE_DEVICE_TABLE(of, cros_typec_of_match);
+#endif
+
+static int cros_typec_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct cros_typec_data *typec;
+ struct ec_response_usb_pd_ports resp;
+ int ret, i;
+
+ typec = devm_kzalloc(dev, sizeof(*typec), GFP_KERNEL);
+ if (!typec)
+ return -ENOMEM;
+
+ typec->dev = dev;
+ typec->ec = dev_get_drvdata(pdev->dev.parent);
+ platform_set_drvdata(pdev, typec);
+
+ ret = cros_typec_get_cmd_version(typec);
+ if (ret < 0) {
+ dev_err(dev, "failed to get PD command version info\n");
+ return ret;
+ }
+
+ ret = cros_typec_ec_command(typec, 0, EC_CMD_USB_PD_PORTS, NULL, 0,
+ &resp, sizeof(resp));
+ if (ret < 0)
+ return ret;
+
+ typec->num_ports = resp.num_ports;
+ if (typec->num_ports > EC_USB_PD_MAX_PORTS) {
+ dev_warn(typec->dev,
+ "Too many ports reported: %d, limiting to max: %d\n",
+ typec->num_ports, EC_USB_PD_MAX_PORTS);
+ typec->num_ports = EC_USB_PD_MAX_PORTS;
+ }
+
+ ret = cros_typec_init_ports(typec);
+ if (ret < 0)
+ return ret;
+
+ for (i = 0; i < typec->num_ports; i++) {
+ ret = cros_typec_port_update(typec, i);
+ if (ret < 0)
+ goto unregister_ports;
+ }
+
+ return 0;
+
+unregister_ports:
+ for (i = 0; i < typec->num_ports; i++)
+ if (typec->ports[i])
+ typec_unregister_port(typec->ports[i]);
+ return ret;
+}
+
+static struct platform_driver cros_typec_driver = {
+ .driver = {
+ .name = DRV_NAME,
+ .acpi_match_table = ACPI_PTR(cros_typec_acpi_id),
+ .of_match_table = of_match_ptr(cros_typec_of_match),
+ },
+ .probe = cros_typec_probe,
+};
+
+module_platform_driver(cros_typec_driver);
+
+MODULE_AUTHOR("Prashant Malani <pmalani@chromium.org>");
+MODULE_DESCRIPTION("Chrome OS EC Type C control");
+MODULE_LICENSE("GPL");
diff --git a/drivers/platform/chrome/cros_ec_vbc.c b/drivers/platform/chrome/cros_ec_vbc.c
index 8edae465105c..46482d12cffe 100644
--- a/drivers/platform/chrome/cros_ec_vbc.c
+++ b/drivers/platform/chrome/cros_ec_vbc.c
@@ -40,7 +40,7 @@ static ssize_t vboot_context_read(struct file *filp, struct kobject *kobj,
msg->outsize = para_sz;
msg->insize = resp_sz;
- err = cros_ec_cmd_xfer(ecdev, msg);
+ err = cros_ec_cmd_xfer_status(ecdev, msg);
if (err < 0) {
dev_err(dev, "Error sending read request: %d\n", err);
kfree(msg);
@@ -83,7 +83,7 @@ static ssize_t vboot_context_write(struct file *filp, struct kobject *kobj,
msg->outsize = para_sz;
msg->insize = 0;
- err = cros_ec_cmd_xfer(ecdev, msg);
+ err = cros_ec_cmd_xfer_status(ecdev, msg);
if (err < 0) {
dev_err(dev, "Error sending write request: %d\n", err);
kfree(msg);
diff --git a/drivers/platform/chrome/cros_usbpd_notify.c b/drivers/platform/chrome/cros_usbpd_notify.c
new file mode 100644
index 000000000000..7f36142ab12a
--- /dev/null
+++ b/drivers/platform/chrome/cros_usbpd_notify.c
@@ -0,0 +1,306 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2020 Google LLC
+ *
+ * This driver serves as the receiver of cros_ec PD host events.
+ */
+
+#include <linux/acpi.h>
+#include <linux/module.h>
+#include <linux/platform_data/cros_ec_proto.h>
+#include <linux/platform_data/cros_usbpd_notify.h>
+#include <linux/platform_device.h>
+
+#define DRV_NAME "cros-usbpd-notify"
+#define DRV_NAME_PLAT_ACPI "cros-usbpd-notify-acpi"
+#define ACPI_DRV_NAME "GOOG0003"
+
+static BLOCKING_NOTIFIER_HEAD(cros_usbpd_notifier_list);
+
+struct cros_usbpd_notify_data {
+ struct device *dev;
+ struct cros_ec_device *ec;
+ struct notifier_block nb;
+};
+
+/**
+ * cros_usbpd_register_notify - Register a notifier callback for PD events.
+ * @nb: Notifier block pointer to register
+ *
+ * On ACPI platforms this corresponds to host events on the ECPD
+ * "GOOG0003" ACPI device. On non-ACPI platforms this will filter mkbp events
+ * for USB PD events.
+ *
+ * Return: 0 on success or negative error code.
+ */
+int cros_usbpd_register_notify(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&cros_usbpd_notifier_list,
+ nb);
+}
+EXPORT_SYMBOL_GPL(cros_usbpd_register_notify);
+
+/**
+ * cros_usbpd_unregister_notify - Unregister notifier callback for PD events.
+ * @nb: Notifier block pointer to unregister
+ *
+ * Unregister a notifier callback that was previously registered with
+ * cros_usbpd_register_notify().
+ */
+void cros_usbpd_unregister_notify(struct notifier_block *nb)
+{
+ blocking_notifier_chain_unregister(&cros_usbpd_notifier_list, nb);
+}
+EXPORT_SYMBOL_GPL(cros_usbpd_unregister_notify);
+
+/**
+ * cros_ec_pd_command - Send a command to the EC.
+ *
+ * @ec_dev: EC device
+ * @command: EC command
+ * @outdata: EC command output data
+ * @outsize: Size of outdata
+ * @indata: EC command input data
+ * @insize: Size of indata
+ *
+ * Return: >= 0 on success, negative error number on failure.
+ */
+static int cros_ec_pd_command(struct cros_ec_device *ec_dev,
+ int command,
+ uint8_t *outdata,
+ int outsize,
+ uint8_t *indata,
+ int insize)
+{
+ struct cros_ec_command *msg;
+ int ret;
+
+ msg = kzalloc(sizeof(*msg) + max(insize, outsize), GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ msg->command = command;
+ msg->outsize = outsize;
+ msg->insize = insize;
+
+ if (outsize)
+ memcpy(msg->data, outdata, outsize);
+
+ ret = cros_ec_cmd_xfer_status(ec_dev, msg);
+ if (ret < 0)
+ goto error;
+
+ if (insize)
+ memcpy(indata, msg->data, insize);
+error:
+ kfree(msg);
+ return ret;
+}
+
+static void cros_usbpd_get_event_and_notify(struct device *dev,
+ struct cros_ec_device *ec_dev)
+{
+ struct ec_response_host_event_status host_event_status;
+ u32 event = 0;
+ int ret;
+
+ /*
+ * We still send a 0 event out to older devices which don't
+ * have the updated device heirarchy.
+ */
+ if (!ec_dev) {
+ dev_dbg(dev,
+ "EC device inaccessible; sending 0 event status.\n");
+ goto send_notify;
+ }
+
+ /* Check for PD host events on EC. */
+ ret = cros_ec_pd_command(ec_dev, EC_CMD_PD_HOST_EVENT_STATUS,
+ NULL, 0,
+ (uint8_t *)&host_event_status,
+ sizeof(host_event_status));
+ if (ret < 0) {
+ dev_warn(dev, "Can't get host event status (err: %d)\n", ret);
+ goto send_notify;
+ }
+
+ event = host_event_status.status;
+
+send_notify:
+ blocking_notifier_call_chain(&cros_usbpd_notifier_list, event, NULL);
+}
+
+#ifdef CONFIG_ACPI
+
+static void cros_usbpd_notify_acpi(acpi_handle device, u32 event, void *data)
+{
+ struct cros_usbpd_notify_data *pdnotify = data;
+
+ cros_usbpd_get_event_and_notify(pdnotify->dev, pdnotify->ec);
+}
+
+static int cros_usbpd_notify_probe_acpi(struct platform_device *pdev)
+{
+ struct cros_usbpd_notify_data *pdnotify;
+ struct device *dev = &pdev->dev;
+ struct acpi_device *adev;
+ struct cros_ec_device *ec_dev;
+ acpi_status status;
+
+ adev = ACPI_COMPANION(dev);
+
+ pdnotify = devm_kzalloc(dev, sizeof(*pdnotify), GFP_KERNEL);
+ if (!pdnotify)
+ return -ENOMEM;
+
+ /* Get the EC device pointer needed to talk to the EC. */
+ ec_dev = dev_get_drvdata(dev->parent);
+ if (!ec_dev) {
+ /*
+ * We continue even for older devices which don't have the
+ * correct device heirarchy, namely, GOOG0003 is a child
+ * of GOOG0004.
+ */
+ dev_warn(dev, "Couldn't get Chrome EC device pointer.\n");
+ }
+
+ pdnotify->dev = dev;
+ pdnotify->ec = ec_dev;
+
+ status = acpi_install_notify_handler(adev->handle,
+ ACPI_ALL_NOTIFY,
+ cros_usbpd_notify_acpi,
+ pdnotify);
+ if (ACPI_FAILURE(status)) {
+ dev_warn(dev, "Failed to register notify handler %08x\n",
+ status);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int cros_usbpd_notify_remove_acpi(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct acpi_device *adev = ACPI_COMPANION(dev);
+
+ acpi_remove_notify_handler(adev->handle, ACPI_ALL_NOTIFY,
+ cros_usbpd_notify_acpi);
+
+ return 0;
+}
+
+static const struct acpi_device_id cros_usbpd_notify_acpi_device_ids[] = {
+ { ACPI_DRV_NAME, 0 },
+ { }
+};
+MODULE_DEVICE_TABLE(acpi, cros_usbpd_notify_acpi_device_ids);
+
+static struct platform_driver cros_usbpd_notify_acpi_driver = {
+ .driver = {
+ .name = DRV_NAME_PLAT_ACPI,
+ .acpi_match_table = cros_usbpd_notify_acpi_device_ids,
+ },
+ .probe = cros_usbpd_notify_probe_acpi,
+ .remove = cros_usbpd_notify_remove_acpi,
+};
+
+#endif /* CONFIG_ACPI */
+
+static int cros_usbpd_notify_plat(struct notifier_block *nb,
+ unsigned long queued_during_suspend,
+ void *data)
+{
+ struct cros_usbpd_notify_data *pdnotify = container_of(nb,
+ struct cros_usbpd_notify_data, nb);
+ struct cros_ec_device *ec_dev = (struct cros_ec_device *)data;
+ u32 host_event = cros_ec_get_host_event(ec_dev);
+
+ if (!host_event)
+ return NOTIFY_DONE;
+
+ if (host_event & EC_HOST_EVENT_MASK(EC_HOST_EVENT_PD_MCU)) {
+ cros_usbpd_get_event_and_notify(pdnotify->dev, ec_dev);
+ return NOTIFY_OK;
+ }
+ return NOTIFY_DONE;
+}
+
+static int cros_usbpd_notify_probe_plat(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct cros_ec_dev *ecdev = dev_get_drvdata(dev->parent);
+ struct cros_usbpd_notify_data *pdnotify;
+ int ret;
+
+ pdnotify = devm_kzalloc(dev, sizeof(*pdnotify), GFP_KERNEL);
+ if (!pdnotify)
+ return -ENOMEM;
+
+ pdnotify->dev = dev;
+ pdnotify->ec = ecdev->ec_dev;
+ pdnotify->nb.notifier_call = cros_usbpd_notify_plat;
+
+ dev_set_drvdata(dev, pdnotify);
+
+ ret = blocking_notifier_chain_register(&ecdev->ec_dev->event_notifier,
+ &pdnotify->nb);
+ if (ret < 0) {
+ dev_err(dev, "Failed to register notifier\n");
+ return ret;
+ }
+
+ return 0;
+}
+
+static int cros_usbpd_notify_remove_plat(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct cros_ec_dev *ecdev = dev_get_drvdata(dev->parent);
+ struct cros_usbpd_notify_data *pdnotify =
+ (struct cros_usbpd_notify_data *)dev_get_drvdata(dev);
+
+ blocking_notifier_chain_unregister(&ecdev->ec_dev->event_notifier,
+ &pdnotify->nb);
+
+ return 0;
+}
+
+static struct platform_driver cros_usbpd_notify_plat_driver = {
+ .driver = {
+ .name = DRV_NAME,
+ },
+ .probe = cros_usbpd_notify_probe_plat,
+ .remove = cros_usbpd_notify_remove_plat,
+};
+
+static int __init cros_usbpd_notify_init(void)
+{
+ int ret;
+
+ ret = platform_driver_register(&cros_usbpd_notify_plat_driver);
+ if (ret < 0)
+ return ret;
+
+#ifdef CONFIG_ACPI
+ platform_driver_register(&cros_usbpd_notify_acpi_driver);
+#endif
+ return 0;
+}
+
+static void __exit cros_usbpd_notify_exit(void)
+{
+#ifdef CONFIG_ACPI
+ platform_driver_unregister(&cros_usbpd_notify_acpi_driver);
+#endif
+ platform_driver_unregister(&cros_usbpd_notify_plat_driver);
+}
+
+module_init(cros_usbpd_notify_init);
+module_exit(cros_usbpd_notify_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ChromeOS power delivery notifier device");
+MODULE_AUTHOR("Jon Flatley <jflat@chromium.org>");
+MODULE_ALIAS("platform:" DRV_NAME);
diff --git a/drivers/platform/chrome/wilco_ec/event.c b/drivers/platform/chrome/wilco_ec/event.c
index dba3d445623f..814518509739 100644
--- a/drivers/platform/chrome/wilco_ec/event.c
+++ b/drivers/platform/chrome/wilco_ec/event.c
@@ -79,7 +79,7 @@ static DEFINE_IDA(event_ida);
struct ec_event {
u16 size;
u16 type;
- u16 event[0];
+ u16 event[];
} __packed;
#define ec_event_num_words(ev) (ev->size - 1)
@@ -96,7 +96,7 @@ struct ec_event_queue {
int capacity;
int head;
int tail;
- struct ec_event *entries[0];
+ struct ec_event *entries[];
};
/* Maximum number of events to store in ec_event_queue */
diff --git a/drivers/platform/chrome/wilco_ec/properties.c b/drivers/platform/chrome/wilco_ec/properties.c
index 62f27610dd33..c2bf4c95c5d2 100644
--- a/drivers/platform/chrome/wilco_ec/properties.c
+++ b/drivers/platform/chrome/wilco_ec/properties.c
@@ -3,8 +3,11 @@
* Copyright 2019 Google LLC
*/
+#include <linux/errno.h>
+#include <linux/export.h>
#include <linux/platform_data/wilco-ec.h>
#include <linux/string.h>
+#include <linux/types.h>
#include <asm/unaligned.h>
/* Operation code; what the EC should do with the property */
diff --git a/drivers/platform/chrome/wilco_ec/sysfs.c b/drivers/platform/chrome/wilco_ec/sysfs.c
index f0d174b6bb21..3c587b4054a5 100644
--- a/drivers/platform/chrome/wilco_ec/sysfs.c
+++ b/drivers/platform/chrome/wilco_ec/sysfs.c
@@ -8,8 +8,12 @@
* See Documentation/ABI/testing/sysfs-platform-wilco-ec for more information.
*/
+#include <linux/device.h>
+#include <linux/kernel.h>
#include <linux/platform_data/wilco-ec.h>
+#include <linux/string.h>
#include <linux/sysfs.h>
+#include <linux/types.h>
#define CMD_KB_CMOS 0x7C
#define SUB_CMD_KB_CMOS_AUTO_ON 0x03
diff --git a/drivers/platform/x86/dell-laptop.c b/drivers/platform/x86/dell-laptop.c
index 74e988f839e8..f8d3e3bd1bb5 100644
--- a/drivers/platform/x86/dell-laptop.c
+++ b/drivers/platform/x86/dell-laptop.c
@@ -4,7 +4,7 @@
*
* Copyright (c) Red Hat <mjg@redhat.com>
* Copyright (c) 2014 Gabriele Mazzotta <gabriele.mzt@gmail.com>
- * Copyright (c) 2014 Pali Rohár <pali.rohar@gmail.com>
+ * Copyright (c) 2014 Pali Rohár <pali@kernel.org>
*
* Based on documentation in the libsmbios package:
* Copyright (C) 2005-2014 Dell Inc.
@@ -2295,6 +2295,6 @@ module_exit(dell_exit);
MODULE_AUTHOR("Matthew Garrett <mjg@redhat.com>");
MODULE_AUTHOR("Gabriele Mazzotta <gabriele.mzt@gmail.com>");
-MODULE_AUTHOR("Pali Rohár <pali.rohar@gmail.com>");
+MODULE_AUTHOR("Pali Rohár <pali@kernel.org>");
MODULE_DESCRIPTION("Dell laptop driver");
MODULE_LICENSE("GPL");
diff --git a/drivers/platform/x86/dell-rbtn.c b/drivers/platform/x86/dell-rbtn.c
index a6b856cd86bd..a89fad47ff13 100644
--- a/drivers/platform/x86/dell-rbtn.c
+++ b/drivers/platform/x86/dell-rbtn.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
Dell Airplane Mode Switch driver
- Copyright (C) 2014-2015 Pali Rohár <pali.rohar@gmail.com>
+ Copyright (C) 2014-2015 Pali Rohár <pali@kernel.org>
*/
@@ -495,5 +495,5 @@ MODULE_PARM_DESC(auto_remove_rfkill, "Automatically remove rfkill devices when "
"(default true)");
MODULE_DEVICE_TABLE(acpi, rbtn_ids);
MODULE_DESCRIPTION("Dell Airplane Mode Switch driver");
-MODULE_AUTHOR("Pali Rohár <pali.rohar@gmail.com>");
+MODULE_AUTHOR("Pali Rohár <pali@kernel.org>");
MODULE_LICENSE("GPL");
diff --git a/drivers/platform/x86/dell-rbtn.h b/drivers/platform/x86/dell-rbtn.h
index 0fdc81644458..5e030f926c58 100644
--- a/drivers/platform/x86/dell-rbtn.h
+++ b/drivers/platform/x86/dell-rbtn.h
@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
Dell Airplane Mode Switch driver
- Copyright (C) 2014-2015 Pali Rohár <pali.rohar@gmail.com>
+ Copyright (C) 2014-2015 Pali Rohár <pali@kernel.org>
*/
diff --git a/drivers/platform/x86/dell-smbios-base.c b/drivers/platform/x86/dell-smbios-base.c
index fe59b0ebff31..2e2cd565926a 100644
--- a/drivers/platform/x86/dell-smbios-base.c
+++ b/drivers/platform/x86/dell-smbios-base.c
@@ -4,7 +4,7 @@
*
* Copyright (c) Red Hat <mjg@redhat.com>
* Copyright (c) 2014 Gabriele Mazzotta <gabriele.mzt@gmail.com>
- * Copyright (c) 2014 Pali Rohár <pali.rohar@gmail.com>
+ * Copyright (c) 2014 Pali Rohár <pali@kernel.org>
*
* Based on documentation in the libsmbios package:
* Copyright (C) 2005-2014 Dell Inc.
@@ -645,7 +645,7 @@ module_exit(dell_smbios_exit);
MODULE_AUTHOR("Matthew Garrett <mjg@redhat.com>");
MODULE_AUTHOR("Gabriele Mazzotta <gabriele.mzt@gmail.com>");
-MODULE_AUTHOR("Pali Rohár <pali.rohar@gmail.com>");
+MODULE_AUTHOR("Pali Rohár <pali@kernel.org>");
MODULE_AUTHOR("Mario Limonciello <mario.limonciello@dell.com>");
MODULE_DESCRIPTION("Common functions for kernel modules using Dell SMBIOS");
MODULE_LICENSE("GPL");
diff --git a/drivers/platform/x86/dell-smbios-smm.c b/drivers/platform/x86/dell-smbios-smm.c
index d6854d1c4119..97c52a839a3e 100644
--- a/drivers/platform/x86/dell-smbios-smm.c
+++ b/drivers/platform/x86/dell-smbios-smm.c
@@ -4,7 +4,7 @@
*
* Copyright (c) Red Hat <mjg@redhat.com>
* Copyright (c) 2014 Gabriele Mazzotta <gabriele.mzt@gmail.com>
- * Copyright (c) 2014 Pali Rohár <pali.rohar@gmail.com>
+ * Copyright (c) 2014 Pali Rohár <pali@kernel.org>
* Copyright (c) 2017 Dell Inc.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/drivers/platform/x86/dell-smbios.h b/drivers/platform/x86/dell-smbios.h
index a7ff9803f41a..75fa8ea0476d 100644
--- a/drivers/platform/x86/dell-smbios.h
+++ b/drivers/platform/x86/dell-smbios.h
@@ -4,7 +4,7 @@
*
* Copyright (c) Red Hat <mjg@redhat.com>
* Copyright (c) 2014 Gabriele Mazzotta <gabriele.mzt@gmail.com>
- * Copyright (c) 2014 Pali Rohár <pali.rohar@gmail.com>
+ * Copyright (c) 2014 Pali Rohár <pali@kernel.org>
*
* Based on documentation in the libsmbios package:
* Copyright (C) 2005-2014 Dell Inc.
diff --git a/drivers/platform/x86/dell-smo8800.c b/drivers/platform/x86/dell-smo8800.c
index b531fe8ab7e0..5d9304a7de1b 100644
--- a/drivers/platform/x86/dell-smo8800.c
+++ b/drivers/platform/x86/dell-smo8800.c
@@ -3,7 +3,7 @@
* dell-smo8800.c - Dell Latitude ACPI SMO88XX freefall sensor driver
*
* Copyright (C) 2012 Sonal Santan <sonal.santan@gmail.com>
- * Copyright (C) 2014 Pali Rohár <pali.rohar@gmail.com>
+ * Copyright (C) 2014 Pali Rohár <pali@kernel.org>
*
* This is loosely based on lis3lv02d driver.
*/
diff --git a/drivers/platform/x86/dell-wmi.c b/drivers/platform/x86/dell-wmi.c
index 6669db2555fb..86e8dd6a8b33 100644
--- a/drivers/platform/x86/dell-wmi.c
+++ b/drivers/platform/x86/dell-wmi.c
@@ -3,7 +3,7 @@
* Dell WMI hotkeys
*
* Copyright (C) 2008 Red Hat <mjg@redhat.com>
- * Copyright (C) 2014-2015 Pali Rohár <pali.rohar@gmail.com>
+ * Copyright (C) 2014-2015 Pali Rohár <pali@kernel.org>
*
* Portions based on wistron_btns.c:
* Copyright (C) 2005 Miloslav Trmac <mitr@volny.cz>
@@ -29,7 +29,7 @@
#include "dell-wmi-descriptor.h"
MODULE_AUTHOR("Matthew Garrett <mjg@redhat.com>");
-MODULE_AUTHOR("Pali Rohár <pali.rohar@gmail.com>");
+MODULE_AUTHOR("Pali Rohár <pali@kernel.org>");
MODULE_DESCRIPTION("Dell laptop WMI hotkeys driver");
MODULE_LICENSE("GPL");
diff --git a/drivers/power/supply/Kconfig b/drivers/power/supply/Kconfig
index 195bc0462d3e..f3424fdce341 100644
--- a/drivers/power/supply/Kconfig
+++ b/drivers/power/supply/Kconfig
@@ -659,7 +659,7 @@ config CHARGER_RT9455
config CHARGER_CROS_USBPD
tristate "ChromeOS EC based USBPD charger"
- depends on CROS_EC
+ depends on CROS_USBPD_NOTIFY
default n
help
Say Y here to enable ChromeOS EC based USBPD charger
diff --git a/drivers/power/supply/bq2415x_charger.c b/drivers/power/supply/bq2415x_charger.c
index 532f6e4fcafb..a1f00ae1c180 100644
--- a/drivers/power/supply/bq2415x_charger.c
+++ b/drivers/power/supply/bq2415x_charger.c
@@ -2,7 +2,7 @@
/*
* bq2415x charger driver
*
- * Copyright (C) 2011-2013 Pali Rohár <pali.rohar@gmail.com>
+ * Copyright (C) 2011-2013 Pali Rohár <pali@kernel.org>
*
* Datasheets:
* http://www.ti.com/product/bq24150
@@ -1788,6 +1788,6 @@ static struct i2c_driver bq2415x_driver = {
};
module_i2c_driver(bq2415x_driver);
-MODULE_AUTHOR("Pali Rohár <pali.rohar@gmail.com>");
+MODULE_AUTHOR("Pali Rohár <pali@kernel.org>");
MODULE_DESCRIPTION("bq2415x charger driver");
MODULE_LICENSE("GPL");
diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c
index 664e50103eaa..942c92127b6d 100644
--- a/drivers/power/supply/bq27xxx_battery.c
+++ b/drivers/power/supply/bq27xxx_battery.c
@@ -4,7 +4,7 @@
* Copyright (C) 2008 Rodolfo Giometti <giometti@linux.it>
* Copyright (C) 2008 Eurotech S.p.A. <info@eurotech.it>
* Copyright (C) 2010-2011 Lars-Peter Clausen <lars@metafoo.de>
- * Copyright (C) 2011 Pali Rohár <pali.rohar@gmail.com>
+ * Copyright (C) 2011 Pali Rohár <pali@kernel.org>
* Copyright (C) 2017 Liam Breck <kernel@networkimprov.net>
*
* Based on a previous work by Copyright (C) 2008 Texas Instruments, Inc.
diff --git a/drivers/power/supply/cros_usbpd-charger.c b/drivers/power/supply/cros_usbpd-charger.c
index 30c3d37511c9..2a45e84447fe 100644
--- a/drivers/power/supply/cros_usbpd-charger.c
+++ b/drivers/power/supply/cros_usbpd-charger.c
@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <linux/platform_data/cros_ec_commands.h>
#include <linux/platform_data/cros_ec_proto.h>
+#include <linux/platform_data/cros_usbpd_notify.h>
#include <linux/platform_device.h>
#include <linux/power_supply.h>
#include <linux/slab.h>
@@ -517,32 +518,21 @@ static int cros_usbpd_charger_property_is_writeable(struct power_supply *psy,
}
static int cros_usbpd_charger_ec_event(struct notifier_block *nb,
- unsigned long queued_during_suspend,
+ unsigned long host_event,
void *_notify)
{
- struct cros_ec_device *ec_device;
- struct charger_data *charger;
- u32 host_event;
+ struct charger_data *charger = container_of(nb, struct charger_data,
+ notifier);
- charger = container_of(nb, struct charger_data, notifier);
- ec_device = charger->ec_device;
-
- host_event = cros_ec_get_host_event(ec_device);
- if (host_event & EC_HOST_EVENT_MASK(EC_HOST_EVENT_PD_MCU)) {
- cros_usbpd_charger_power_changed(charger->ports[0]->psy);
- return NOTIFY_OK;
- } else {
- return NOTIFY_DONE;
- }
+ cros_usbpd_charger_power_changed(charger->ports[0]->psy);
+ return NOTIFY_OK;
}
static void cros_usbpd_charger_unregister_notifier(void *data)
{
struct charger_data *charger = data;
- struct cros_ec_device *ec_device = charger->ec_device;
- blocking_notifier_chain_unregister(&ec_device->event_notifier,
- &charger->notifier);
+ cros_usbpd_unregister_notify(&charger->notifier);
}
static int cros_usbpd_charger_probe(struct platform_device *pd)
@@ -676,21 +666,17 @@ static int cros_usbpd_charger_probe(struct platform_device *pd)
goto fail;
}
- if (ec_device->mkbp_event_supported) {
- /* Get PD events from the EC */
- charger->notifier.notifier_call = cros_usbpd_charger_ec_event;
- ret = blocking_notifier_chain_register(
- &ec_device->event_notifier,
- &charger->notifier);
- if (ret < 0) {
- dev_warn(dev, "failed to register notifier\n");
- } else {
- ret = devm_add_action_or_reset(dev,
- cros_usbpd_charger_unregister_notifier,
- charger);
- if (ret < 0)
- goto fail;
- }
+ /* Get PD events from the EC */
+ charger->notifier.notifier_call = cros_usbpd_charger_ec_event;
+ ret = cros_usbpd_register_notify(&charger->notifier);
+ if (ret < 0) {
+ dev_warn(dev, "failed to register notifier\n");
+ } else {
+ ret = devm_add_action_or_reset(dev,
+ cros_usbpd_charger_unregister_notifier,
+ charger);
+ if (ret < 0)
+ goto fail;
}
return 0;
diff --git a/drivers/power/supply/isp1704_charger.c b/drivers/power/supply/isp1704_charger.c
index 4812ac1ff2df..b6efc454e4f0 100644
--- a/drivers/power/supply/isp1704_charger.c
+++ b/drivers/power/supply/isp1704_charger.c
@@ -3,7 +3,7 @@
* ISP1704 USB Charger Detection driver
*
* Copyright (C) 2010 Nokia Corporation
- * Copyright (C) 2012 - 2013 Pali Rohár <pali.rohar@gmail.com>
+ * Copyright (C) 2012 - 2013 Pali Rohár <pali@kernel.org>
*/
#include <linux/kernel.h>
diff --git a/drivers/power/supply/rx51_battery.c b/drivers/power/supply/rx51_battery.c
index 8548b639ff2f..6e488ecf4dcb 100644
--- a/drivers/power/supply/rx51_battery.c
+++ b/drivers/power/supply/rx51_battery.c
@@ -2,7 +2,7 @@
/*
* Nokia RX-51 battery driver
*
- * Copyright (C) 2012 Pali Rohár <pali.rohar@gmail.com>
+ * Copyright (C) 2012 Pali Rohár <pali@kernel.org>
*/
#include <linux/module.h>
@@ -278,6 +278,6 @@ static struct platform_driver rx51_battery_driver = {
module_platform_driver(rx51_battery_driver);
MODULE_ALIAS("platform:rx51-battery");
-MODULE_AUTHOR("Pali Rohár <pali.rohar@gmail.com>");
+MODULE_AUTHOR("Pali Rohár <pali@kernel.org>");
MODULE_DESCRIPTION("Nokia RX-51 battery driver");
MODULE_LICENSE("GPL");
diff --git a/drivers/ps3/sys-manager-core.c b/drivers/ps3/sys-manager-core.c
index 24709c572c0c..e061b7d0632b 100644
--- a/drivers/ps3/sys-manager-core.c
+++ b/drivers/ps3/sys-manager-core.c
@@ -31,7 +31,7 @@ void ps3_sys_manager_register_ops(const struct ps3_sys_manager_ops *ops)
{
BUG_ON(!ops);
BUG_ON(!ops->dev);
- ps3_sys_manager_ops = ops ? *ops : ps3_sys_manager_ops;
+ ps3_sys_manager_ops = *ops;
}
EXPORT_SYMBOL_GPL(ps3_sys_manager_register_ops);
diff --git a/drivers/pwm/Kconfig b/drivers/pwm/Kconfig
index 30190beeb6e9..eebbc917ac97 100644
--- a/drivers/pwm/Kconfig
+++ b/drivers/pwm/Kconfig
@@ -33,6 +33,15 @@ config PWM_SYSFS
bool
default y if SYSFS
+config PWM_DEBUG
+ bool "PWM lowlevel drivers additional checks and debug messages"
+ depends on DEBUG_KERNEL
+ help
+ This option enables some additional checks to help lowlevel driver
+ authors to get their callbacks implemented correctly.
+ It is expected to introduce some runtime overhead and diagnostic
+ output to the kernel log, so only enable while working on a driver.
+
config PWM_AB8500
tristate "AB8500 PWM support"
depends on AB8500_CORE && ARCH_U8500
@@ -44,7 +53,8 @@ config PWM_AB8500
config PWM_ATMEL
tristate "Atmel PWM support"
- depends on ARCH_AT91 && OF
+ depends on OF
+ depends on ARCH_AT91 || COMPILE_TEST
help
Generic PWM framework driver for Atmel SoC.
@@ -100,7 +110,7 @@ config PWM_BCM_KONA
config PWM_BCM2835
tristate "BCM2835 PWM support"
- depends on ARCH_BCM2835 || ARCH_BRCMSTB
+ depends on ARCH_BCM2835 || ARCH_BRCMSTB || COMPILE_TEST
help
PWM framework driver for BCM2835 controller (Raspberry Pi)
@@ -109,7 +119,7 @@ config PWM_BCM2835
config PWM_BERLIN
tristate "Marvell Berlin PWM support"
- depends on ARCH_BERLIN
+ depends on ARCH_BERLIN || COMPILE_TEST
help
PWM framework driver for Marvell Berlin SoCs.
@@ -118,7 +128,7 @@ config PWM_BERLIN
config PWM_BRCMSTB
tristate "Broadcom STB PWM support"
- depends on ARCH_BRCMSTB || BMIPS_GENERIC
+ depends on ARCH_BRCMSTB || BMIPS_GENERIC || COMPILE_TEST
help
Generic PWM framework driver for the Broadcom Set-top-Box
SoCs (BCM7xxx).
@@ -152,7 +162,7 @@ config PWM_CROS_EC
config PWM_EP93XX
tristate "Cirrus Logic EP93xx PWM support"
- depends on ARCH_EP93XX
+ depends on ARCH_EP93XX || COMPILE_TEST
help
Generic PWM framework driver for Cirrus Logic EP93xx.
@@ -195,7 +205,7 @@ config PWM_IMG
config PWM_IMX1
tristate "i.MX1 PWM support"
- depends on ARCH_MXC
+ depends on ARCH_MXC || COMPILE_TEST
help
Generic PWM framework driver for i.MX1 and i.MX21
@@ -204,7 +214,7 @@ config PWM_IMX1
config PWM_IMX27
tristate "i.MX27 PWM support"
- depends on ARCH_MXC
+ depends on ARCH_MXC || COMPILE_TEST
help
Generic PWM framework driver for i.MX27 and later i.MX SoCs.
@@ -225,6 +235,8 @@ config PWM_IMX_TPM
config PWM_JZ4740
tristate "Ingenic JZ47xx PWM support"
depends on MACH_INGENIC
+ depends on COMMON_CLK
+ select MFD_SYSCON
help
Generic PWM framework driver for Ingenic JZ47xx based
machines.
@@ -244,7 +256,7 @@ config PWM_LP3943
config PWM_LPC18XX_SCT
tristate "LPC18xx/43xx PWM/SCT support"
- depends on ARCH_LPC18XX
+ depends on ARCH_LPC18XX || COMPILE_TEST
help
Generic PWM framework driver for NXP LPC18xx PWM/SCT which
supports 16 channels.
@@ -256,7 +268,7 @@ config PWM_LPC18XX_SCT
config PWM_LPC32XX
tristate "LPC32XX PWM support"
- depends on ARCH_LPC32XX
+ depends on ARCH_LPC32XX || COMPILE_TEST
help
Generic PWM framework driver for LPC32XX. The LPC32XX SOC has two
PWM controllers.
@@ -289,7 +301,8 @@ config PWM_LPSS_PLATFORM
config PWM_MESON
tristate "Amlogic Meson PWM driver"
- depends on ARCH_MESON
+ depends on ARCH_MESON || COMPILE_TEST
+ depends on COMMON_CLK
help
The platform driver for Amlogic Meson PWM controller.
@@ -318,7 +331,8 @@ config PWM_MEDIATEK
config PWM_MXS
tristate "Freescale MXS PWM support"
- depends on ARCH_MXS && OF
+ depends on OF
+ depends on ARCH_MXS || COMPILE_TEST
select STMP_DEVICE
help
Generic PWM framework driver for Freescale MXS.
@@ -357,7 +371,7 @@ config PWM_PUV3
config PWM_PXA
tristate "PXA PWM support"
- depends on ARCH_PXA
+ depends on ARCH_PXA || COMPILE_TEST
help
Generic PWM framework driver for PXA.
@@ -388,14 +402,14 @@ config PWM_RENESAS_TPU
config PWM_ROCKCHIP
tristate "Rockchip PWM support"
- depends on ARCH_ROCKCHIP
+ depends on ARCH_ROCKCHIP || COMPILE_TEST
help
Generic PWM framework driver for the PWM controller found on
Rockchip SoCs.
config PWM_SAMSUNG
tristate "Samsung PWM support"
- depends on PLAT_SAMSUNG || ARCH_EXYNOS
+ depends on PLAT_SAMSUNG || ARCH_EXYNOS || COMPILE_TEST
help
Generic PWM framework driver for Samsung.
@@ -415,7 +429,7 @@ config PWM_SIFIVE
config PWM_SPEAR
tristate "STMicroelectronics SPEAr PWM support"
- depends on PLAT_SPEAR
+ depends on PLAT_SPEAR || COMPILE_TEST
depends on OF
help
Generic PWM framework driver for the PWM controller on ST
@@ -437,7 +451,7 @@ config PWM_SPRD
config PWM_STI
tristate "STiH4xx PWM support"
- depends on ARCH_STI
+ depends on ARCH_STI || COMPILE_TEST
depends on OF
help
Generic PWM framework driver for STiH4xx SoCs.
@@ -447,7 +461,7 @@ config PWM_STI
config PWM_STM32
tristate "STMicroelectronics STM32 PWM"
- depends on MFD_STM32_TIMERS
+ depends on MFD_STM32_TIMERS || COMPILE_TEST
help
Generic PWM framework driver for STM32 SoCs.
@@ -483,7 +497,7 @@ config PWM_SUN4I
config PWM_TEGRA
tristate "NVIDIA Tegra PWM support"
- depends on ARCH_TEGRA
+ depends on ARCH_TEGRA || COMPILE_TEST
help
Generic PWM framework driver for the PWFM controller found on NVIDIA
Tegra SoCs.
@@ -493,7 +507,7 @@ config PWM_TEGRA
config PWM_TIECAP
tristate "ECAP PWM support"
- depends on ARCH_OMAP2PLUS || ARCH_DAVINCI_DA8XX || ARCH_KEYSTONE || ARCH_K3
+ depends on ARCH_OMAP2PLUS || ARCH_DAVINCI_DA8XX || ARCH_KEYSTONE || ARCH_K3 || COMPILE_TEST
help
PWM driver support for the ECAP APWM controller found on TI SOCs
@@ -502,7 +516,7 @@ config PWM_TIECAP
config PWM_TIEHRPWM
tristate "EHRPWM PWM support"
- depends on ARCH_OMAP2PLUS || ARCH_DAVINCI_DA8XX || ARCH_K3
+ depends on ARCH_OMAP2PLUS || ARCH_DAVINCI_DA8XX || ARCH_K3 || COMPILE_TEST
help
PWM driver support for the EHRPWM controller found on TI SOCs
@@ -529,7 +543,7 @@ config PWM_TWL_LED
config PWM_VT8500
tristate "vt8500 PWM support"
- depends on ARCH_VT8500
+ depends on ARCH_VT8500 || COMPILE_TEST
help
Generic PWM framework driver for vt8500.
@@ -538,7 +552,7 @@ config PWM_VT8500
config PWM_ZX
tristate "ZTE ZX PWM support"
- depends on ARCH_ZX
+ depends on ARCH_ZX || COMPILE_TEST
help
Generic PWM framework driver for ZTE ZX family SoCs.
diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index 5a7f6598c05f..9973c442b455 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -120,6 +120,9 @@ static int pwm_device_request(struct pwm_device *pwm, const char *label)
if (pwm->chip->ops->get_state) {
pwm->chip->ops->get_state(pwm->chip, pwm, &pwm->state);
trace_pwm_get(pwm, &pwm->state);
+
+ if (IS_ENABLED(PWM_DEBUG))
+ pwm->last = pwm->state;
}
set_bit(PWMF_REQUESTED, &pwm->flags);
@@ -232,17 +235,28 @@ void *pwm_get_chip_data(struct pwm_device *pwm)
}
EXPORT_SYMBOL_GPL(pwm_get_chip_data);
-static bool pwm_ops_check(const struct pwm_ops *ops)
+static bool pwm_ops_check(const struct pwm_chip *chip)
{
+
+ const struct pwm_ops *ops = chip->ops;
+
/* driver supports legacy, non-atomic operation */
- if (ops->config && ops->enable && ops->disable)
- return true;
+ if (ops->config && ops->enable && ops->disable) {
+ if (IS_ENABLED(CONFIG_PWM_DEBUG))
+ dev_warn(chip->dev,
+ "Driver needs updating to atomic API\n");
- /* driver supports atomic operation */
- if (ops->apply)
return true;
+ }
- return false;
+ if (!ops->apply)
+ return false;
+
+ if (IS_ENABLED(CONFIG_PWM_DEBUG) && !ops->get_state)
+ dev_warn(chip->dev,
+ "Please implement the .get_state() callback\n");
+
+ return true;
}
/**
@@ -266,7 +280,7 @@ int pwmchip_add_with_polarity(struct pwm_chip *chip,
if (!chip || !chip->dev || !chip->ops || !chip->npwm)
return -EINVAL;
- if (!pwm_ops_check(chip->ops))
+ if (!pwm_ops_check(chip))
return -EINVAL;
mutex_lock(&pwm_lock);
@@ -450,6 +464,107 @@ void pwm_free(struct pwm_device *pwm)
}
EXPORT_SYMBOL_GPL(pwm_free);
+static void pwm_apply_state_debug(struct pwm_device *pwm,
+ const struct pwm_state *state)
+{
+ struct pwm_state *last = &pwm->last;
+ struct pwm_chip *chip = pwm->chip;
+ struct pwm_state s1, s2;
+ int err;
+
+ if (!IS_ENABLED(CONFIG_PWM_DEBUG))
+ return;
+
+ /* No reasonable diagnosis possible without .get_state() */
+ if (!chip->ops->get_state)
+ return;
+
+ /*
+ * *state was just applied. Read out the hardware state and do some
+ * checks.
+ */
+
+ chip->ops->get_state(chip, pwm, &s1);
+ trace_pwm_get(pwm, &s1);
+
+ /*
+ * The lowlevel driver either ignored .polarity (which is a bug) or as
+ * best effort inverted .polarity and fixed .duty_cycle respectively.
+ * Undo this inversion and fixup for further tests.
+ */
+ if (s1.enabled && s1.polarity != state->polarity) {
+ s2.polarity = state->polarity;
+ s2.duty_cycle = s1.period - s1.duty_cycle;
+ s2.period = s1.period;
+ s2.enabled = s1.enabled;
+ } else {
+ s2 = s1;
+ }
+
+ if (s2.polarity != state->polarity &&
+ state->duty_cycle < state->period)
+ dev_warn(chip->dev, ".apply ignored .polarity\n");
+
+ if (state->enabled &&
+ last->polarity == state->polarity &&
+ last->period > s2.period &&
+ last->period <= state->period)
+ dev_warn(chip->dev,
+ ".apply didn't pick the best available period (requested: %u, applied: %u, possible: %u)\n",
+ state->period, s2.period, last->period);
+
+ if (state->enabled && state->period < s2.period)
+ dev_warn(chip->dev,
+ ".apply is supposed to round down period (requested: %u, applied: %u)\n",
+ state->period, s2.period);
+
+ if (state->enabled &&
+ last->polarity == state->polarity &&
+ last->period == s2.period &&
+ last->duty_cycle > s2.duty_cycle &&
+ last->duty_cycle <= state->duty_cycle)
+ dev_warn(chip->dev,
+ ".apply didn't pick the best available duty cycle (requested: %u/%u, applied: %u/%u, possible: %u/%u)\n",
+ state->duty_cycle, state->period,
+ s2.duty_cycle, s2.period,
+ last->duty_cycle, last->period);
+
+ if (state->enabled && state->duty_cycle < s2.duty_cycle)
+ dev_warn(chip->dev,
+ ".apply is supposed to round down duty_cycle (requested: %u/%u, applied: %u/%u)\n",
+ state->duty_cycle, state->period,
+ s2.duty_cycle, s2.period);
+
+ if (!state->enabled && s2.enabled && s2.duty_cycle > 0)
+ dev_warn(chip->dev,
+ "requested disabled, but yielded enabled with duty > 0");
+
+ /* reapply the state that the driver reported being configured. */
+ err = chip->ops->apply(chip, pwm, &s1);
+ if (err) {
+ *last = s1;
+ dev_err(chip->dev, "failed to reapply current setting\n");
+ return;
+ }
+
+ trace_pwm_apply(pwm, &s1);
+
+ chip->ops->get_state(chip, pwm, last);
+ trace_pwm_get(pwm, last);
+
+ /* reapplication of the current state should give an exact match */
+ if (s1.enabled != last->enabled ||
+ s1.polarity != last->polarity ||
+ (s1.enabled && s1.period != last->period) ||
+ (s1.enabled && s1.duty_cycle != last->duty_cycle)) {
+ dev_err(chip->dev,
+ ".apply is not idempotent (ena=%d pol=%d %u/%u) -> (ena=%d pol=%d %u/%u)\n",
+ s1.enabled, s1.polarity, s1.duty_cycle, s1.period,
+ last->enabled, last->polarity, last->duty_cycle,
+ last->period);
+ }
+}
+
/**
* pwm_apply_state() - atomically apply a new state to a PWM device
* @pwm: PWM device
@@ -480,6 +595,12 @@ int pwm_apply_state(struct pwm_device *pwm, const struct pwm_state *state)
trace_pwm_apply(pwm, state);
pwm->state = *state;
+
+ /*
+ * only do this after pwm->state was applied as some
+ * implementations of .get_state depend on this
+ */
+ pwm_apply_state_debug(pwm, state);
} else {
/*
* FIXME: restore the initial state in case of error.
diff --git a/drivers/pwm/pwm-bcm2835.c b/drivers/pwm/pwm-bcm2835.c
index 91e24f01b54e..d78f86f8e462 100644
--- a/drivers/pwm/pwm-bcm2835.c
+++ b/drivers/pwm/pwm-bcm2835.c
@@ -166,6 +166,7 @@ static int bcm2835_pwm_probe(struct platform_device *pdev)
pc->chip.dev = &pdev->dev;
pc->chip.ops = &bcm2835_pwm_ops;
+ pc->chip.base = -1;
pc->chip.npwm = 2;
pc->chip.of_xlate = of_pwm_xlate_with_flags;
pc->chip.of_pwm_n_cells = 3;
diff --git a/drivers/pwm/pwm-imx-tpm.c b/drivers/pwm/pwm-imx-tpm.c
index 9145f6160649..5f3d7f7e6aef 100644
--- a/drivers/pwm/pwm-imx-tpm.c
+++ b/drivers/pwm/pwm-imx-tpm.c
@@ -18,10 +18,8 @@
#include <linux/clk.h>
#include <linux/err.h>
#include <linux/io.h>
-#include <linux/log2.h>
#include <linux/module.h>
#include <linux/of.h>
-#include <linux/of_address.h>
#include <linux/platform_device.h>
#include <linux/pwm.h>
#include <linux/slab.h>
diff --git a/drivers/pwm/pwm-imx27.c b/drivers/pwm/pwm-imx27.c
index 35a7ac42269c..a6e40d4c485f 100644
--- a/drivers/pwm/pwm-imx27.c
+++ b/drivers/pwm/pwm-imx27.c
@@ -18,7 +18,6 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/of.h>
-#include <linux/of_device.h>
#include <linux/platform_device.h>
#include <linux/pwm.h>
#include <linux/slab.h>
@@ -96,9 +95,8 @@ struct pwm_imx27_chip {
#define to_pwm_imx27_chip(chip) container_of(chip, struct pwm_imx27_chip, chip)
-static int pwm_imx27_clk_prepare_enable(struct pwm_chip *chip)
+static int pwm_imx27_clk_prepare_enable(struct pwm_imx27_chip *imx)
{
- struct pwm_imx27_chip *imx = to_pwm_imx27_chip(chip);
int ret;
ret = clk_prepare_enable(imx->clk_ipg);
@@ -114,10 +112,8 @@ static int pwm_imx27_clk_prepare_enable(struct pwm_chip *chip)
return 0;
}
-static void pwm_imx27_clk_disable_unprepare(struct pwm_chip *chip)
+static void pwm_imx27_clk_disable_unprepare(struct pwm_imx27_chip *imx)
{
- struct pwm_imx27_chip *imx = to_pwm_imx27_chip(chip);
-
clk_disable_unprepare(imx->clk_per);
clk_disable_unprepare(imx->clk_ipg);
}
@@ -130,7 +126,7 @@ static void pwm_imx27_get_state(struct pwm_chip *chip,
u64 tmp;
int ret;
- ret = pwm_imx27_clk_prepare_enable(chip);
+ ret = pwm_imx27_clk_prepare_enable(imx);
if (ret < 0)
return;
@@ -174,8 +170,7 @@ static void pwm_imx27_get_state(struct pwm_chip *chip,
tmp = NSEC_PER_SEC * (u64)(val);
state->duty_cycle = DIV_ROUND_CLOSEST_ULL(tmp, pwm_clk);
- if (!state->enabled)
- pwm_imx27_clk_disable_unprepare(chip);
+ pwm_imx27_clk_disable_unprepare(imx);
}
static void pwm_imx27_sw_reset(struct pwm_chip *chip)
@@ -259,7 +254,7 @@ static int pwm_imx27_apply(struct pwm_chip *chip, struct pwm_device *pwm,
if (cstate.enabled) {
pwm_imx27_wait_fifo_slot(chip, pwm);
} else {
- ret = pwm_imx27_clk_prepare_enable(chip);
+ ret = pwm_imx27_clk_prepare_enable(imx);
if (ret)
return ret;
@@ -289,8 +284,8 @@ static int pwm_imx27_apply(struct pwm_chip *chip, struct pwm_device *pwm,
writel(cr, imx->mmio_base + MX3_PWMCR);
- if (!state->enabled && cstate.enabled)
- pwm_imx27_clk_disable_unprepare(chip);
+ if (!state->enabled)
+ pwm_imx27_clk_disable_unprepare(imx);
return 0;
}
@@ -310,6 +305,8 @@ MODULE_DEVICE_TABLE(of, pwm_imx27_dt_ids);
static int pwm_imx27_probe(struct platform_device *pdev)
{
struct pwm_imx27_chip *imx;
+ int ret;
+ u32 pwmcr;
imx = devm_kzalloc(&pdev->dev, sizeof(*imx), GFP_KERNEL);
if (imx == NULL)
@@ -352,6 +349,15 @@ static int pwm_imx27_probe(struct platform_device *pdev)
if (IS_ERR(imx->mmio_base))
return PTR_ERR(imx->mmio_base);
+ ret = pwm_imx27_clk_prepare_enable(imx);
+ if (ret)
+ return ret;
+
+ /* keep clks on if pwm is running */
+ pwmcr = readl(imx->mmio_base + MX3_PWMCR);
+ if (!(pwmcr & MX3_PWMCR_EN))
+ pwm_imx27_clk_disable_unprepare(imx);
+
return pwmchip_add(&imx->chip);
}
@@ -361,8 +367,6 @@ static int pwm_imx27_remove(struct platform_device *pdev)
imx = platform_get_drvdata(pdev);
- pwm_imx27_clk_disable_unprepare(&imx->chip);
-
return pwmchip_remove(&imx->chip);
}
diff --git a/drivers/pwm/pwm-jz4740.c b/drivers/pwm/pwm-jz4740.c
index 9d78cc21cb12..3cd5c054ad9a 100644
--- a/drivers/pwm/pwm-jz4740.c
+++ b/drivers/pwm/pwm-jz4740.c
@@ -13,18 +13,19 @@
#include <linux/err.h>
#include <linux/gpio.h>
#include <linux/kernel.h>
+#include <linux/mfd/ingenic-tcu.h>
+#include <linux/mfd/syscon.h>
#include <linux/module.h>
#include <linux/of_device.h>
#include <linux/platform_device.h>
#include <linux/pwm.h>
-
-#include <asm/mach-jz4740/timer.h>
+#include <linux/regmap.h>
#define NUM_PWM 8
struct jz4740_pwm_chip {
struct pwm_chip chip;
- struct clk *clk;
+ struct regmap *map;
};
static inline struct jz4740_pwm_chip *to_jz4740(struct pwm_chip *chip)
@@ -32,82 +33,134 @@ static inline struct jz4740_pwm_chip *to_jz4740(struct pwm_chip *chip)
return container_of(chip, struct jz4740_pwm_chip, chip);
}
+static bool jz4740_pwm_can_use_chn(struct jz4740_pwm_chip *jz,
+ unsigned int channel)
+{
+ /* Enable all TCU channels for PWM use by default except channels 0/1 */
+ u32 pwm_channels_mask = GENMASK(NUM_PWM - 1, 2);
+
+ device_property_read_u32(jz->chip.dev->parent,
+ "ingenic,pwm-channels-mask",
+ &pwm_channels_mask);
+
+ return !!(pwm_channels_mask & BIT(channel));
+}
+
static int jz4740_pwm_request(struct pwm_chip *chip, struct pwm_device *pwm)
{
- /*
- * Timers 0 and 1 are used for system tasks, so they are unavailable
- * for use as PWMs.
- */
- if (pwm->hwpwm < 2)
+ struct jz4740_pwm_chip *jz = to_jz4740(chip);
+ struct clk *clk;
+ char name[16];
+ int err;
+
+ if (!jz4740_pwm_can_use_chn(jz, pwm->hwpwm))
return -EBUSY;
- jz4740_timer_start(pwm->hwpwm);
+ snprintf(name, sizeof(name), "timer%u", pwm->hwpwm);
+
+ clk = clk_get(chip->dev, name);
+ if (IS_ERR(clk)) {
+ if (PTR_ERR(clk) != -EPROBE_DEFER)
+ dev_err(chip->dev, "Failed to get clock: %pe", clk);
+
+ return PTR_ERR(clk);
+ }
+
+ err = clk_prepare_enable(clk);
+ if (err < 0) {
+ clk_put(clk);
+ return err;
+ }
+
+ pwm_set_chip_data(pwm, clk);
return 0;
}
static void jz4740_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm)
{
- jz4740_timer_set_ctrl(pwm->hwpwm, 0);
+ struct clk *clk = pwm_get_chip_data(pwm);
- jz4740_timer_stop(pwm->hwpwm);
+ clk_disable_unprepare(clk);
+ clk_put(clk);
}
static int jz4740_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm)
{
- uint32_t ctrl = jz4740_timer_get_ctrl(pwm->pwm);
+ struct jz4740_pwm_chip *jz = to_jz4740(chip);
+
+ /* Enable PWM output */
+ regmap_update_bits(jz->map, TCU_REG_TCSRc(pwm->hwpwm),
+ TCU_TCSR_PWM_EN, TCU_TCSR_PWM_EN);
- ctrl |= JZ_TIMER_CTRL_PWM_ENABLE;
- jz4740_timer_set_ctrl(pwm->hwpwm, ctrl);
- jz4740_timer_enable(pwm->hwpwm);
+ /* Start counter */
+ regmap_write(jz->map, TCU_REG_TESR, BIT(pwm->hwpwm));
return 0;
}
static void jz4740_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm)
{
- uint32_t ctrl = jz4740_timer_get_ctrl(pwm->hwpwm);
+ struct jz4740_pwm_chip *jz = to_jz4740(chip);
/*
* Set duty > period. This trick allows the TCU channels in TCU2 mode to
* properly return to their init level.
*/
- jz4740_timer_set_duty(pwm->hwpwm, 0xffff);
- jz4740_timer_set_period(pwm->hwpwm, 0x0);
+ regmap_write(jz->map, TCU_REG_TDHRc(pwm->hwpwm), 0xffff);
+ regmap_write(jz->map, TCU_REG_TDFRc(pwm->hwpwm), 0x0);
/*
* Disable PWM output.
* In TCU2 mode (channel 1/2 on JZ4750+), this must be done before the
* counter is stopped, while in TCU1 mode the order does not matter.
*/
- ctrl &= ~JZ_TIMER_CTRL_PWM_ENABLE;
- jz4740_timer_set_ctrl(pwm->hwpwm, ctrl);
+ regmap_update_bits(jz->map, TCU_REG_TCSRc(pwm->hwpwm),
+ TCU_TCSR_PWM_EN, 0);
/* Stop counter */
- jz4740_timer_disable(pwm->hwpwm);
+ regmap_write(jz->map, TCU_REG_TECR, BIT(pwm->hwpwm));
}
static int jz4740_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
const struct pwm_state *state)
{
struct jz4740_pwm_chip *jz4740 = to_jz4740(pwm->chip);
- unsigned long long tmp;
+ unsigned long long tmp = 0xffffull * NSEC_PER_SEC;
+ struct clk *clk = pwm_get_chip_data(pwm);
unsigned long period, duty;
- unsigned int prescaler = 0;
- uint16_t ctrl;
+ long rate;
+ int err;
- tmp = (unsigned long long)clk_get_rate(jz4740->clk) * state->period;
- do_div(tmp, 1000000000);
- period = tmp;
+ /*
+ * Limit the clock to a maximum rate that still gives us a period value
+ * which fits in 16 bits.
+ */
+ do_div(tmp, state->period);
- while (period > 0xffff && prescaler < 6) {
- period >>= 2;
- ++prescaler;
+ /*
+ * /!\ IMPORTANT NOTE:
+ * -------------------
+ * This code relies on the fact that clk_round_rate() will always round
+ * down, which is not a valid assumption given by the clk API, but only
+ * happens to be true with the clk drivers used for Ingenic SoCs.
+ *
+ * Right now, there is no alternative as the clk API does not have a
+ * round-down function (and won't have one for a while), but if it ever
+ * comes to light, a round-down function should be used instead.
+ */
+ rate = clk_round_rate(clk, tmp);
+ if (rate < 0) {
+ dev_err(chip->dev, "Unable to round rate: %ld", rate);
+ return rate;
}
- if (prescaler == 6)
- return -EINVAL;
+ /* Calculate period value */
+ tmp = (unsigned long long)rate * state->period;
+ do_div(tmp, NSEC_PER_SEC);
+ period = (unsigned long)tmp;
+ /* Calculate duty value */
tmp = (unsigned long long)period * state->duty_cycle;
do_div(tmp, state->period);
duty = period - tmp;
@@ -117,26 +170,38 @@ static int jz4740_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
jz4740_pwm_disable(chip, pwm);
- jz4740_timer_set_count(pwm->hwpwm, 0);
- jz4740_timer_set_duty(pwm->hwpwm, duty);
- jz4740_timer_set_period(pwm->hwpwm, period);
+ err = clk_set_rate(clk, rate);
+ if (err) {
+ dev_err(chip->dev, "Unable to set rate: %d", err);
+ return err;
+ }
+
+ /* Reset counter to 0 */
+ regmap_write(jz4740->map, TCU_REG_TCNTc(pwm->hwpwm), 0);
- ctrl = JZ_TIMER_CTRL_PRESCALER(prescaler) | JZ_TIMER_CTRL_SRC_EXT |
- JZ_TIMER_CTRL_PWM_ABBRUPT_SHUTDOWN;
+ /* Set duty */
+ regmap_write(jz4740->map, TCU_REG_TDHRc(pwm->hwpwm), duty);
- jz4740_timer_set_ctrl(pwm->hwpwm, ctrl);
+ /* Set period */
+ regmap_write(jz4740->map, TCU_REG_TDFRc(pwm->hwpwm), period);
+ /* Set abrupt shutdown */
+ regmap_update_bits(jz4740->map, TCU_REG_TCSRc(pwm->hwpwm),
+ TCU_TCSR_PWM_SD, TCU_TCSR_PWM_SD);
+
+ /* Set polarity */
switch (state->polarity) {
case PWM_POLARITY_NORMAL:
- ctrl &= ~JZ_TIMER_CTRL_PWM_ACTIVE_LOW;
+ regmap_update_bits(jz4740->map, TCU_REG_TCSRc(pwm->hwpwm),
+ TCU_TCSR_PWM_INITL_HIGH, 0);
break;
case PWM_POLARITY_INVERSED:
- ctrl |= JZ_TIMER_CTRL_PWM_ACTIVE_LOW;
+ regmap_update_bits(jz4740->map, TCU_REG_TCSRc(pwm->hwpwm),
+ TCU_TCSR_PWM_INITL_HIGH,
+ TCU_TCSR_PWM_INITL_HIGH);
break;
}
- jz4740_timer_set_ctrl(pwm->hwpwm, ctrl);
-
if (state->enabled)
jz4740_pwm_enable(chip, pwm);
@@ -152,17 +217,20 @@ static const struct pwm_ops jz4740_pwm_ops = {
static int jz4740_pwm_probe(struct platform_device *pdev)
{
+ struct device *dev = &pdev->dev;
struct jz4740_pwm_chip *jz4740;
- jz4740 = devm_kzalloc(&pdev->dev, sizeof(*jz4740), GFP_KERNEL);
+ jz4740 = devm_kzalloc(dev, sizeof(*jz4740), GFP_KERNEL);
if (!jz4740)
return -ENOMEM;
- jz4740->clk = devm_clk_get(&pdev->dev, "ext");
- if (IS_ERR(jz4740->clk))
- return PTR_ERR(jz4740->clk);
+ jz4740->map = device_node_to_regmap(dev->parent->of_node);
+ if (IS_ERR(jz4740->map)) {
+ dev_err(dev, "regmap not found: %ld\n", PTR_ERR(jz4740->map));
+ return PTR_ERR(jz4740->map);
+ }
- jz4740->chip.dev = &pdev->dev;
+ jz4740->chip.dev = dev;
jz4740->chip.ops = &jz4740_pwm_ops;
jz4740->chip.npwm = NUM_PWM;
jz4740->chip.base = -1;
diff --git a/drivers/pwm/pwm-meson.c b/drivers/pwm/pwm-meson.c
index 6245bbdb6e6c..bd0d7336b898 100644
--- a/drivers/pwm/pwm-meson.c
+++ b/drivers/pwm/pwm-meson.c
@@ -136,7 +136,7 @@ static int meson_pwm_request(struct pwm_chip *chip, struct pwm_device *pwm)
dev_err(dev, "failed to set parent %s for %s: %d\n",
__clk_get_name(channel->clk_parent),
__clk_get_name(channel->clk), err);
- return err;
+ return err;
}
}
@@ -163,7 +163,7 @@ static int meson_pwm_calc(struct meson_pwm *meson, struct pwm_device *pwm,
{
struct meson_pwm_channel *channel = pwm_get_chip_data(pwm);
unsigned int duty, period, pre_div, cnt, duty_cnt;
- unsigned long fin_freq = -1;
+ unsigned long fin_freq;
duty = state->duty_cycle;
period = state->period;
diff --git a/drivers/pwm/pwm-mxs.c b/drivers/pwm/pwm-mxs.c
index f2e57fcf8f8b..7ce616923c52 100644
--- a/drivers/pwm/pwm-mxs.c
+++ b/drivers/pwm/pwm-mxs.c
@@ -9,7 +9,6 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/of.h>
-#include <linux/of_address.h>
#include <linux/platform_device.h>
#include <linux/pwm.h>
#include <linux/slab.h>
diff --git a/drivers/pwm/pwm-omap-dmtimer.c b/drivers/pwm/pwm-omap-dmtimer.c
index 9e4378dc6897..0d31833db2e2 100644
--- a/drivers/pwm/pwm-omap-dmtimer.c
+++ b/drivers/pwm/pwm-omap-dmtimer.c
@@ -10,7 +10,27 @@
*
* Description:
* This file is the core OMAP support for the generic, Linux
- * PWM driver / controller, using the OMAP's dual-mode timers.
+ * PWM driver / controller, using the OMAP's dual-mode timers
+ * with a timer counter that goes up. When it overflows it gets
+ * reloaded with the load value and the pwm output goes up.
+ * When counter matches with match register, the output goes down.
+ * Reference Manual: http://www.ti.com/lit/ug/spruh73q/spruh73q.pdf
+ *
+ * Limitations:
+ * - When PWM is stopped, timer counter gets stopped immediately. This
+ * doesn't allow the current PWM period to complete and stops abruptly.
+ * - When PWM is running and changing both duty cycle and period,
+ * we cannot prevent in software that the output might produce
+ * a period with mixed settings. Especially when period/duty_cyle
+ * is updated while the pwm pin is high, current pwm period/duty_cycle
+ * can get updated as below based on the current timer counter:
+ * - period for current cycle = current_period + new period
+ * - duty_cycle for current period = current period + new duty_cycle.
+ * - PWM OMAP DM timer cannot change the polarity when pwm is active. When
+ * user requests a change in polarity when in active state:
+ * - PWM is stopped abruptly(without completing the current cycle)
+ * - Polarity is changed
+ * - A fresh cycle is started.
*/
#include <linux/clk.h>
@@ -20,8 +40,8 @@
#include <linux/mutex.h>
#include <linux/of.h>
#include <linux/of_platform.h>
+#include <clocksource/timer-ti-dm.h>
#include <linux/platform_data/dmtimer-omap.h>
-#include <linux/platform_data/pwm_omap_dmtimer.h>
#include <linux/platform_device.h>
#include <linux/pm_runtime.h>
#include <linux/pwm.h>
@@ -31,10 +51,20 @@
#define DM_TIMER_LOAD_MIN 0xfffffffe
#define DM_TIMER_MAX 0xffffffff
+/**
+ * struct pwm_omap_dmtimer_chip - Structure representing a pwm chip
+ * corresponding to omap dmtimer.
+ * @chip: PWM chip structure representing PWM controller
+ * @mutex: Mutex to protect pwm apply state
+ * @dm_timer: Pointer to omap dm timer.
+ * @pdata: Pointer to omap dm timer ops.
+ * dm_timer_pdev: Pointer to omap dm timer platform device
+ */
struct pwm_omap_dmtimer_chip {
struct pwm_chip chip;
+ /* Mutex to protect pwm apply state */
struct mutex mutex;
- pwm_omap_dmtimer *dm_timer;
+ struct omap_dm_timer *dm_timer;
const struct omap_dm_timer_ops *pdata;
struct platform_device *dm_timer_pdev;
};
@@ -45,11 +75,22 @@ to_pwm_omap_dmtimer_chip(struct pwm_chip *chip)
return container_of(chip, struct pwm_omap_dmtimer_chip, chip);
}
+/**
+ * pwm_omap_dmtimer_get_clock_cycles() - Get clock cycles in a time frame
+ * @clk_rate: pwm timer clock rate
+ * @ns: time frame in nano seconds.
+ *
+ * Return number of clock cycles in a given period(ins ns).
+ */
static u32 pwm_omap_dmtimer_get_clock_cycles(unsigned long clk_rate, int ns)
{
return DIV_ROUND_CLOSEST_ULL((u64)clk_rate * ns, NSEC_PER_SEC);
}
+/**
+ * pwm_omap_dmtimer_start() - Start the pwm omap dm timer in pwm mode
+ * @omap: Pointer to pwm omap dm timer chip
+ */
static void pwm_omap_dmtimer_start(struct pwm_omap_dmtimer_chip *omap)
{
/*
@@ -67,28 +108,46 @@ static void pwm_omap_dmtimer_start(struct pwm_omap_dmtimer_chip *omap)
omap->pdata->start(omap->dm_timer);
}
-static int pwm_omap_dmtimer_enable(struct pwm_chip *chip,
- struct pwm_device *pwm)
+/**
+ * pwm_omap_dmtimer_is_enabled() - Detect if the pwm is enabled.
+ * @omap: Pointer to pwm omap dm timer chip
+ *
+ * Return true if pwm is enabled else false.
+ */
+static bool pwm_omap_dmtimer_is_enabled(struct pwm_omap_dmtimer_chip *omap)
{
- struct pwm_omap_dmtimer_chip *omap = to_pwm_omap_dmtimer_chip(chip);
+ u32 status;
- mutex_lock(&omap->mutex);
- pwm_omap_dmtimer_start(omap);
- mutex_unlock(&omap->mutex);
+ status = omap->pdata->get_pwm_status(omap->dm_timer);
- return 0;
+ return !!(status & OMAP_TIMER_CTRL_ST);
}
-static void pwm_omap_dmtimer_disable(struct pwm_chip *chip,
- struct pwm_device *pwm)
+/**
+ * pwm_omap_dmtimer_polarity() - Detect the polarity of pwm.
+ * @omap: Pointer to pwm omap dm timer chip
+ *
+ * Return the polarity of pwm.
+ */
+static int pwm_omap_dmtimer_polarity(struct pwm_omap_dmtimer_chip *omap)
{
- struct pwm_omap_dmtimer_chip *omap = to_pwm_omap_dmtimer_chip(chip);
+ u32 status;
- mutex_lock(&omap->mutex);
- omap->pdata->stop(omap->dm_timer);
- mutex_unlock(&omap->mutex);
+ status = omap->pdata->get_pwm_status(omap->dm_timer);
+
+ return !!(status & OMAP_TIMER_CTRL_SCPWM);
}
+/**
+ * pwm_omap_dmtimer_config() - Update the configuration of pwm omap dm timer
+ * @chip: Pointer to PWM controller
+ * @pwm: Pointer to PWM channel
+ * @duty_ns: New duty cycle in nano seconds
+ * @period_ns: New period in nano seconds
+ *
+ * Return 0 if successfully changed the period/duty_cycle else appropriate
+ * error.
+ */
static int pwm_omap_dmtimer_config(struct pwm_chip *chip,
struct pwm_device *pwm,
int duty_ns, int period_ns)
@@ -96,31 +155,26 @@ static int pwm_omap_dmtimer_config(struct pwm_chip *chip,
struct pwm_omap_dmtimer_chip *omap = to_pwm_omap_dmtimer_chip(chip);
u32 period_cycles, duty_cycles;
u32 load_value, match_value;
- struct clk *fclk;
unsigned long clk_rate;
- bool timer_active;
+ struct clk *fclk;
dev_dbg(chip->dev, "requested duty cycle: %d ns, period: %d ns\n",
duty_ns, period_ns);
- mutex_lock(&omap->mutex);
if (duty_ns == pwm_get_duty_cycle(pwm) &&
- period_ns == pwm_get_period(pwm)) {
- /* No change - don't cause any transients. */
- mutex_unlock(&omap->mutex);
+ period_ns == pwm_get_period(pwm))
return 0;
- }
fclk = omap->pdata->get_fclk(omap->dm_timer);
if (!fclk) {
dev_err(chip->dev, "invalid pmtimer fclk\n");
- goto err_einval;
+ return -EINVAL;
}
clk_rate = clk_get_rate(fclk);
if (!clk_rate) {
dev_err(chip->dev, "invalid pmtimer fclk rate\n");
- goto err_einval;
+ return -EINVAL;
}
dev_dbg(chip->dev, "clk rate: %luHz\n", clk_rate);
@@ -148,7 +202,7 @@ static int pwm_omap_dmtimer_config(struct pwm_chip *chip,
dev_info(chip->dev,
"period %d ns too short for clock rate %lu Hz\n",
period_ns, clk_rate);
- goto err_einval;
+ return -EINVAL;
}
if (duty_cycles < 1) {
@@ -174,81 +228,103 @@ static int pwm_omap_dmtimer_config(struct pwm_chip *chip,
load_value = (DM_TIMER_MAX - period_cycles) + 1;
match_value = load_value + duty_cycles - 1;
- /*
- * We MUST stop the associated dual-mode timer before attempting to
- * write its registers, but calls to omap_dm_timer_start/stop must
- * be balanced so check if timer is active before calling timer_stop.
- */
- timer_active = pm_runtime_active(&omap->dm_timer_pdev->dev);
- if (timer_active)
- omap->pdata->stop(omap->dm_timer);
-
omap->pdata->set_load(omap->dm_timer, load_value);
omap->pdata->set_match(omap->dm_timer, true, match_value);
dev_dbg(chip->dev, "load value: %#08x (%d), match value: %#08x (%d)\n",
load_value, load_value, match_value, match_value);
- omap->pdata->set_pwm(omap->dm_timer,
- pwm_get_polarity(pwm) == PWM_POLARITY_INVERSED,
- true,
- PWM_OMAP_DMTIMER_TRIGGER_OVERFLOW_AND_COMPARE,
- true);
-
- /* If config was called while timer was running it must be reenabled. */
- if (timer_active)
- pwm_omap_dmtimer_start(omap);
+ return 0;
+}
- mutex_unlock(&omap->mutex);
+/**
+ * pwm_omap_dmtimer_set_polarity() - Changes the polarity of the pwm dm timer.
+ * @chip: Pointer to PWM controller
+ * @pwm: Pointer to PWM channel
+ * @polarity: New pwm polarity to be set
+ */
+static void pwm_omap_dmtimer_set_polarity(struct pwm_chip *chip,
+ struct pwm_device *pwm,
+ enum pwm_polarity polarity)
+{
+ struct pwm_omap_dmtimer_chip *omap = to_pwm_omap_dmtimer_chip(chip);
+ bool enabled;
- return 0;
+ /* Disable the PWM before changing the polarity. */
+ enabled = pwm_omap_dmtimer_is_enabled(omap);
+ if (enabled)
+ omap->pdata->stop(omap->dm_timer);
-err_einval:
- mutex_unlock(&omap->mutex);
+ omap->pdata->set_pwm(omap->dm_timer,
+ polarity == PWM_POLARITY_INVERSED,
+ true, OMAP_TIMER_TRIGGER_OVERFLOW_AND_COMPARE,
+ true);
- return -EINVAL;
+ if (enabled)
+ pwm_omap_dmtimer_start(omap);
}
-static int pwm_omap_dmtimer_set_polarity(struct pwm_chip *chip,
- struct pwm_device *pwm,
- enum pwm_polarity polarity)
+/**
+ * pwm_omap_dmtimer_apply() - Changes the state of the pwm omap dm timer.
+ * @chip: Pointer to PWM controller
+ * @pwm: Pointer to PWM channel
+ * @state: New state to apply
+ *
+ * Return 0 if successfully changed the state else appropriate error.
+ */
+static int pwm_omap_dmtimer_apply(struct pwm_chip *chip,
+ struct pwm_device *pwm,
+ const struct pwm_state *state)
{
struct pwm_omap_dmtimer_chip *omap = to_pwm_omap_dmtimer_chip(chip);
+ int ret = 0;
- /*
- * PWM core will not call set_polarity while PWM is enabled so it's
- * safe to reconfigure the timer here without stopping it first.
- */
mutex_lock(&omap->mutex);
- omap->pdata->set_pwm(omap->dm_timer,
- polarity == PWM_POLARITY_INVERSED,
- true,
- PWM_OMAP_DMTIMER_TRIGGER_OVERFLOW_AND_COMPARE,
- true);
+
+ if (pwm_omap_dmtimer_is_enabled(omap) && !state->enabled) {
+ omap->pdata->stop(omap->dm_timer);
+ goto unlock_mutex;
+ }
+
+ if (pwm_omap_dmtimer_polarity(omap) != state->polarity)
+ pwm_omap_dmtimer_set_polarity(chip, pwm, state->polarity);
+
+ ret = pwm_omap_dmtimer_config(chip, pwm, state->duty_cycle,
+ state->period);
+ if (ret)
+ goto unlock_mutex;
+
+ if (!pwm_omap_dmtimer_is_enabled(omap) && state->enabled) {
+ omap->pdata->set_pwm(omap->dm_timer,
+ state->polarity == PWM_POLARITY_INVERSED,
+ true,
+ OMAP_TIMER_TRIGGER_OVERFLOW_AND_COMPARE,
+ true);
+ pwm_omap_dmtimer_start(omap);
+ }
+
+unlock_mutex:
mutex_unlock(&omap->mutex);
- return 0;
+ return ret;
}
static const struct pwm_ops pwm_omap_dmtimer_ops = {
- .enable = pwm_omap_dmtimer_enable,
- .disable = pwm_omap_dmtimer_disable,
- .config = pwm_omap_dmtimer_config,
- .set_polarity = pwm_omap_dmtimer_set_polarity,
+ .apply = pwm_omap_dmtimer_apply,
.owner = THIS_MODULE,
};
static int pwm_omap_dmtimer_probe(struct platform_device *pdev)
{
struct device_node *np = pdev->dev.of_node;
- struct device_node *timer;
- struct platform_device *timer_pdev;
- struct pwm_omap_dmtimer_chip *omap;
struct dmtimer_platform_data *timer_pdata;
const struct omap_dm_timer_ops *pdata;
- pwm_omap_dmtimer *dm_timer;
- u32 v;
+ struct platform_device *timer_pdev;
+ struct pwm_omap_dmtimer_chip *omap;
+ struct omap_dm_timer *dm_timer;
+ struct device_node *timer;
int ret = 0;
+ u32 v;
timer = of_parse_phandle(np, "ti,timers", 0);
if (!timer)
@@ -281,6 +357,7 @@ static int pwm_omap_dmtimer_probe(struct platform_device *pdev)
!pdata->set_load ||
!pdata->set_match ||
!pdata->set_pwm ||
+ !pdata->get_pwm_status ||
!pdata->set_prescaler ||
!pdata->write_counter) {
dev_err(&pdev->dev, "Incomplete dmtimer pdata structure\n");
diff --git a/drivers/pwm/pwm-pca9685.c b/drivers/pwm/pwm-pca9685.c
index b07bdca3d510..76cd22bd6614 100644
--- a/drivers/pwm/pwm-pca9685.c
+++ b/drivers/pwm/pwm-pca9685.c
@@ -20,6 +20,7 @@
#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/pm_runtime.h>
+#include <linux/bitmap.h>
/*
* Because the PCA9685 has only one prescaler per chip, changing the period of
@@ -69,11 +70,11 @@
struct pca9685 {
struct pwm_chip chip;
struct regmap *regmap;
- int duty_ns;
int period_ns;
#if IS_ENABLED(CONFIG_GPIOLIB)
struct mutex lock;
struct gpio_chip gpio;
+ DECLARE_BITMAP(pwms_inuse, PCA9685_MAXCHAN + 1);
#endif
};
@@ -83,51 +84,51 @@ static inline struct pca9685 *to_pca(struct pwm_chip *chip)
}
#if IS_ENABLED(CONFIG_GPIOLIB)
-static int pca9685_pwm_gpio_request(struct gpio_chip *gpio, unsigned int offset)
+static bool pca9685_pwm_test_and_set_inuse(struct pca9685 *pca, int pwm_idx)
{
- struct pca9685 *pca = gpiochip_get_data(gpio);
- struct pwm_device *pwm;
+ bool is_inuse;
mutex_lock(&pca->lock);
-
- pwm = &pca->chip.pwms[offset];
-
- if (pwm->flags & (PWMF_REQUESTED | PWMF_EXPORTED)) {
- mutex_unlock(&pca->lock);
- return -EBUSY;
+ if (pwm_idx >= PCA9685_MAXCHAN) {
+ /*
+ * "all LEDs" channel:
+ * pretend already in use if any of the PWMs are requested
+ */
+ if (!bitmap_empty(pca->pwms_inuse, PCA9685_MAXCHAN)) {
+ is_inuse = true;
+ goto out;
+ }
+ } else {
+ /*
+ * regular channel:
+ * pretend already in use if the "all LEDs" channel is requested
+ */
+ if (test_bit(PCA9685_MAXCHAN, pca->pwms_inuse)) {
+ is_inuse = true;
+ goto out;
+ }
}
-
- pwm_set_chip_data(pwm, (void *)1);
-
+ is_inuse = test_and_set_bit(pwm_idx, pca->pwms_inuse);
+out:
mutex_unlock(&pca->lock);
- pm_runtime_get_sync(pca->chip.dev);
- return 0;
+ return is_inuse;
}
-static bool pca9685_pwm_is_gpio(struct pca9685 *pca, struct pwm_device *pwm)
+static void pca9685_pwm_clear_inuse(struct pca9685 *pca, int pwm_idx)
{
- bool is_gpio = false;
-
mutex_lock(&pca->lock);
+ clear_bit(pwm_idx, pca->pwms_inuse);
+ mutex_unlock(&pca->lock);
+}
- if (pwm->hwpwm >= PCA9685_MAXCHAN) {
- unsigned int i;
-
- /*
- * Check if any of the GPIOs are requested and in that case
- * prevent using the "all LEDs" channel.
- */
- for (i = 0; i < pca->gpio.ngpio; i++)
- if (gpiochip_is_requested(&pca->gpio, i)) {
- is_gpio = true;
- break;
- }
- } else if (pwm_get_chip_data(pwm)) {
- is_gpio = true;
- }
+static int pca9685_pwm_gpio_request(struct gpio_chip *gpio, unsigned int offset)
+{
+ struct pca9685 *pca = gpiochip_get_data(gpio);
- mutex_unlock(&pca->lock);
- return is_gpio;
+ if (pca9685_pwm_test_and_set_inuse(pca, offset))
+ return -EBUSY;
+ pm_runtime_get_sync(pca->chip.dev);
+ return 0;
}
static int pca9685_pwm_gpio_get(struct gpio_chip *gpio, unsigned int offset)
@@ -162,13 +163,14 @@ static void pca9685_pwm_gpio_free(struct gpio_chip *gpio, unsigned int offset)
pca9685_pwm_gpio_set(gpio, offset, 0);
pm_runtime_put(pca->chip.dev);
+ pca9685_pwm_clear_inuse(pca, offset);
}
static int pca9685_pwm_gpio_get_direction(struct gpio_chip *chip,
unsigned int offset)
{
/* Always out */
- return 0;
+ return GPIO_LINE_DIRECTION_OUT;
}
static int pca9685_pwm_gpio_direction_input(struct gpio_chip *gpio,
@@ -213,12 +215,17 @@ static int pca9685_pwm_gpio_probe(struct pca9685 *pca)
return devm_gpiochip_add_data(dev, &pca->gpio, pca);
}
#else
-static inline bool pca9685_pwm_is_gpio(struct pca9685 *pca,
- struct pwm_device *pwm)
+static inline bool pca9685_pwm_test_and_set_inuse(struct pca9685 *pca,
+ int pwm_idx)
{
return false;
}
+static inline void
+pca9685_pwm_clear_inuse(struct pca9685 *pca, int pwm_idx)
+{
+}
+
static inline int pca9685_pwm_gpio_probe(struct pca9685 *pca)
{
return 0;
@@ -272,8 +279,6 @@ static int pca9685_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
}
}
- pca->duty_ns = duty_ns;
-
if (duty_ns < 1) {
if (pwm->hwpwm >= PCA9685_MAXCHAN)
reg = PCA9685_ALL_LED_OFF_H;
@@ -402,7 +407,7 @@ static int pca9685_pwm_request(struct pwm_chip *chip, struct pwm_device *pwm)
{
struct pca9685 *pca = to_pca(chip);
- if (pca9685_pwm_is_gpio(pca, pwm))
+ if (pca9685_pwm_test_and_set_inuse(pca, pwm->hwpwm))
return -EBUSY;
pm_runtime_get_sync(chip->dev);
@@ -411,8 +416,11 @@ static int pca9685_pwm_request(struct pwm_chip *chip, struct pwm_device *pwm)
static void pca9685_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm)
{
+ struct pca9685 *pca = to_pca(chip);
+
pca9685_pwm_disable(chip, pwm);
pm_runtime_put(chip->dev);
+ pca9685_pwm_clear_inuse(pca, pwm->hwpwm);
}
static const struct pwm_ops pca9685_pwm_ops = {
@@ -449,7 +457,6 @@ static int pca9685_pwm_probe(struct i2c_client *client,
ret);
return ret;
}
- pca->duty_ns = 0;
pca->period_ns = PCA9685_DEFAULT_PERIOD;
i2c_set_clientdata(client, pca);
@@ -512,8 +519,7 @@ static int pca9685_pwm_remove(struct i2c_client *client)
return 0;
}
-#ifdef CONFIG_PM
-static int pca9685_pwm_runtime_suspend(struct device *dev)
+static int __maybe_unused pca9685_pwm_runtime_suspend(struct device *dev)
{
struct i2c_client *client = to_i2c_client(dev);
struct pca9685 *pca = i2c_get_clientdata(client);
@@ -522,7 +528,7 @@ static int pca9685_pwm_runtime_suspend(struct device *dev)
return 0;
}
-static int pca9685_pwm_runtime_resume(struct device *dev)
+static int __maybe_unused pca9685_pwm_runtime_resume(struct device *dev)
{
struct i2c_client *client = to_i2c_client(dev);
struct pca9685 *pca = i2c_get_clientdata(client);
@@ -530,7 +536,6 @@ static int pca9685_pwm_runtime_resume(struct device *dev)
pca9685_set_sleep_mode(pca, false);
return 0;
}
-#endif
static const struct i2c_device_id pca9685_id[] = {
{ "pca9685", 0 },
diff --git a/drivers/pwm/pwm-rcar.c b/drivers/pwm/pwm-rcar.c
index 2685577b6dd4..7ab9eb6616d9 100644
--- a/drivers/pwm/pwm-rcar.c
+++ b/drivers/pwm/pwm-rcar.c
@@ -229,24 +229,28 @@ static int rcar_pwm_probe(struct platform_device *pdev)
rcar_pwm->chip.base = -1;
rcar_pwm->chip.npwm = 1;
+ pm_runtime_enable(&pdev->dev);
+
ret = pwmchip_add(&rcar_pwm->chip);
if (ret < 0) {
dev_err(&pdev->dev, "failed to register PWM chip: %d\n", ret);
+ pm_runtime_disable(&pdev->dev);
return ret;
}
- pm_runtime_enable(&pdev->dev);
-
return 0;
}
static int rcar_pwm_remove(struct platform_device *pdev)
{
struct rcar_pwm_chip *rcar_pwm = platform_get_drvdata(pdev);
+ int ret;
+
+ ret = pwmchip_remove(&rcar_pwm->chip);
pm_runtime_disable(&pdev->dev);
- return pwmchip_remove(&rcar_pwm->chip);
+ return ret;
}
static const struct of_device_id rcar_pwm_of_table[] = {
diff --git a/drivers/pwm/pwm-renesas-tpu.c b/drivers/pwm/pwm-renesas-tpu.c
index 4a855a21b782..81ad5a551455 100644
--- a/drivers/pwm/pwm-renesas-tpu.c
+++ b/drivers/pwm/pwm-renesas-tpu.c
@@ -415,16 +415,15 @@ static int tpu_probe(struct platform_device *pdev)
tpu->chip.base = -1;
tpu->chip.npwm = TPU_CHANNEL_MAX;
+ pm_runtime_enable(&pdev->dev);
+
ret = pwmchip_add(&tpu->chip);
if (ret < 0) {
dev_err(&pdev->dev, "failed to register PWM chip\n");
+ pm_runtime_disable(&pdev->dev);
return ret;
}
- dev_info(&pdev->dev, "TPU PWM %d registered\n", tpu->pdev->id);
-
- pm_runtime_enable(&pdev->dev);
-
return 0;
}
@@ -434,12 +433,10 @@ static int tpu_remove(struct platform_device *pdev)
int ret;
ret = pwmchip_remove(&tpu->chip);
- if (ret)
- return ret;
pm_runtime_disable(&pdev->dev);
- return 0;
+ return ret;
}
#ifdef CONFIG_OF
diff --git a/drivers/pwm/pwm-sun4i.c b/drivers/pwm/pwm-sun4i.c
index 3e3efa6c768f..5c677c563349 100644
--- a/drivers/pwm/pwm-sun4i.c
+++ b/drivers/pwm/pwm-sun4i.c
@@ -90,7 +90,6 @@ struct sun4i_pwm_chip {
spinlock_t ctrl_lock;
const struct sun4i_pwm_data *data;
unsigned long next_period[2];
- bool needs_delay[2];
};
static inline struct sun4i_pwm_chip *to_sun4i_pwm_chip(struct pwm_chip *chip)
@@ -287,7 +286,6 @@ static int sun4i_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
sun4i_pwm_writel(sun4i_pwm, val, PWM_CH_PRD(pwm->hwpwm));
sun4i_pwm->next_period[pwm->hwpwm] = jiffies +
usecs_to_jiffies(cstate.period / 1000 + 1);
- sun4i_pwm->needs_delay[pwm->hwpwm] = true;
if (state->polarity != PWM_POLARITY_NORMAL)
ctrl &= ~BIT_CH(PWM_ACT_STATE, pwm->hwpwm);
@@ -298,7 +296,7 @@ static int sun4i_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
if (state->enabled) {
ctrl |= BIT_CH(PWM_EN, pwm->hwpwm);
- } else if (!sun4i_pwm->needs_delay[pwm->hwpwm]) {
+ } else {
ctrl &= ~BIT_CH(PWM_EN, pwm->hwpwm);
ctrl &= ~BIT_CH(PWM_CLK_GATING, pwm->hwpwm);
}
@@ -310,15 +308,9 @@ static int sun4i_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
if (state->enabled)
return 0;
- if (!sun4i_pwm->needs_delay[pwm->hwpwm]) {
- clk_disable_unprepare(sun4i_pwm->clk);
- return 0;
- }
-
/* We need a full period to elapse before disabling the channel. */
now = jiffies;
- if (sun4i_pwm->needs_delay[pwm->hwpwm] &&
- time_before(now, sun4i_pwm->next_period[pwm->hwpwm])) {
+ if (time_before(now, sun4i_pwm->next_period[pwm->hwpwm])) {
delay_us = jiffies_to_usecs(sun4i_pwm->next_period[pwm->hwpwm] -
now);
if ((delay_us / 500) > MAX_UDELAY_MS)
@@ -326,7 +318,6 @@ static int sun4i_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
else
usleep_range(delay_us, delay_us * 2);
}
- sun4i_pwm->needs_delay[pwm->hwpwm] = false;
spin_lock(&sun4i_pwm->ctrl_lock);
ctrl = sun4i_pwm_readl(sun4i_pwm, PWM_CTRL_REG);
diff --git a/drivers/pwm/pwm-tegra.c b/drivers/pwm/pwm-tegra.c
index aa12fb3ed92e..d26ed8f579ff 100644
--- a/drivers/pwm/pwm-tegra.c
+++ b/drivers/pwm/pwm-tegra.c
@@ -282,9 +282,15 @@ static const struct tegra_pwm_soc tegra186_pwm_soc = {
.max_frequency = 102000000UL,
};
+static const struct tegra_pwm_soc tegra194_pwm_soc = {
+ .num_channels = 1,
+ .max_frequency = 408000000UL,
+};
+
static const struct of_device_id tegra_pwm_of_match[] = {
{ .compatible = "nvidia,tegra20-pwm", .data = &tegra20_pwm_soc },
{ .compatible = "nvidia,tegra186-pwm", .data = &tegra186_pwm_soc },
+ { .compatible = "nvidia,tegra194-pwm", .data = &tegra194_pwm_soc },
{ }
};
MODULE_DEVICE_TABLE(of, tegra_pwm_of_match);
diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index f942a3302cdc..ec873f09c763 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -590,6 +590,16 @@ config RTC_DRV_RC5T583
This driver can also be built as a module. If so, the module
will be called rtc-rc5t583.
+config RTC_DRV_RC5T619
+ tristate "RICOH RC5T619 RTC driver"
+ depends on MFD_RN5T618
+ help
+ If you say yes here you get support for the RTC on the
+ RICOH RC5T619 chips.
+
+ This driver can also be built as a module. If so, the module
+ will be called rtc-rc5t619.
+
config RTC_DRV_S35390A
tristate "Seiko Instruments S-35390A"
select BITREVERSE
diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile
index 3b66ee99063f..0721752c6ed4 100644
--- a/drivers/rtc/Makefile
+++ b/drivers/rtc/Makefile
@@ -133,6 +133,7 @@ obj-$(CONFIG_RTC_DRV_PXA) += rtc-pxa.o
obj-$(CONFIG_RTC_DRV_R7301) += rtc-r7301.o
obj-$(CONFIG_RTC_DRV_R9701) += rtc-r9701.o
obj-$(CONFIG_RTC_DRV_RC5T583) += rtc-rc5t583.o
+obj-$(CONFIG_RTC_DRV_RC5T619) += rtc-rc5t619.o
obj-$(CONFIG_RTC_DRV_RK808) += rtc-rk808.o
obj-$(CONFIG_RTC_DRV_RP5C01) += rtc-rp5c01.o
obj-$(CONFIG_RTC_DRV_RS5C313) += rtc-rs5c313.o
diff --git a/drivers/rtc/rtc-rc5t619.c b/drivers/rtc/rtc-rc5t619.c
new file mode 100644
index 000000000000..24e386ecbc7e
--- /dev/null
+++ b/drivers/rtc/rtc-rc5t619.c
@@ -0,0 +1,444 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * drivers/rtc/rtc-rc5t619.c
+ *
+ * Real time clock driver for RICOH RC5T619 power management chip.
+ *
+ * Copyright (C) 2019 Andreas Kemnade
+ */
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mfd/rn5t618.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include <linux/bcd.h>
+#include <linux/rtc.h>
+#include <linux/slab.h>
+#include <linux/irqdomain.h>
+
+struct rc5t619_rtc {
+ int irq;
+ struct rtc_device *rtc;
+ struct rn5t618 *rn5t618;
+};
+
+#define CTRL1_ALARM_ENABLED 0x40
+#define CTRL1_24HR 0x20
+#define CTRL1_PERIODIC_MASK 0xf
+
+#define CTRL2_PON 0x10
+#define CTRL2_ALARM_STATUS 0x80
+#define CTRL2_CTFG 0x4
+#define CTRL2_CTC 0x1
+
+#define MONTH_CENTFLAG 0x80
+#define HOUR_PMFLAG 0x20
+#define MDAY_DAL_EXT 0x80
+
+static uint8_t rtc5t619_12hour_bcd2bin(uint8_t hour)
+{
+ if (hour & HOUR_PMFLAG) {
+ hour = bcd2bin(hour & ~HOUR_PMFLAG);
+ return hour == 12 ? 12 : 12 + hour;
+ }
+
+ hour = bcd2bin(hour);
+ return hour == 12 ? 0 : hour;
+}
+
+static uint8_t rtc5t619_12hour_bin2bcd(uint8_t hour)
+{
+ if (!hour)
+ return 0x12;
+
+ if (hour < 12)
+ return bin2bcd(hour);
+
+ if (hour == 12)
+ return 0x12 | HOUR_PMFLAG;
+
+ return bin2bcd(hour - 12) | HOUR_PMFLAG;
+}
+
+static int rc5t619_rtc_periodic_disable(struct device *dev)
+{
+ struct rc5t619_rtc *rtc = dev_get_drvdata(dev);
+ int err;
+
+ /* disable function */
+ err = regmap_update_bits(rtc->rn5t618->regmap,
+ RN5T618_RTC_CTRL1, CTRL1_PERIODIC_MASK, 0);
+ if (err < 0)
+ return err;
+
+ /* clear alarm flag and CTFG */
+ err = regmap_update_bits(rtc->rn5t618->regmap, RN5T618_RTC_CTRL2,
+ CTRL2_ALARM_STATUS | CTRL2_CTFG | CTRL2_CTC,
+ 0);
+ if (err < 0)
+ return err;
+
+ return 0;
+}
+
+/* things to be done once after power on */
+static int rc5t619_rtc_pon_setup(struct device *dev)
+{
+ struct rc5t619_rtc *rtc = dev_get_drvdata(dev);
+ int err;
+ unsigned int reg_data;
+
+ err = regmap_read(rtc->rn5t618->regmap, RN5T618_RTC_CTRL2, &reg_data);
+ if (err < 0)
+ return err;
+
+ /* clear VDET PON */
+ reg_data &= ~(CTRL2_PON | CTRL2_CTC | 0x4a); /* 0101-1011 */
+ reg_data |= 0x20; /* 0010-0000 */
+ err = regmap_write(rtc->rn5t618->regmap, RN5T618_RTC_CTRL2, reg_data);
+ if (err < 0)
+ return err;
+
+ /* clearing RTC Adjust register */
+ err = regmap_write(rtc->rn5t618->regmap, RN5T618_RTC_ADJUST, 0);
+ if (err)
+ return err;
+
+ return regmap_update_bits(rtc->rn5t618->regmap,
+ RN5T618_RTC_CTRL1,
+ CTRL1_24HR, CTRL1_24HR);
+}
+
+static int rc5t619_rtc_read_time(struct device *dev, struct rtc_time *tm)
+{
+ struct rc5t619_rtc *rtc = dev_get_drvdata(dev);
+ u8 buff[7];
+ int err;
+ int cent_flag;
+ unsigned int ctrl1;
+ unsigned int ctrl2;
+
+ err = regmap_read(rtc->rn5t618->regmap, RN5T618_RTC_CTRL2, &ctrl2);
+ if (err < 0)
+ return err;
+
+ if (ctrl2 & CTRL2_PON)
+ return -EINVAL;
+
+ err = regmap_read(rtc->rn5t618->regmap, RN5T618_RTC_CTRL1, &ctrl1);
+ if (err < 0)
+ return err;
+
+ err = regmap_bulk_read(rtc->rn5t618->regmap, RN5T618_RTC_SECONDS,
+ buff, sizeof(buff));
+ if (err < 0)
+ return err;
+
+ if (buff[5] & MONTH_CENTFLAG)
+ cent_flag = 1;
+ else
+ cent_flag = 0;
+
+ tm->tm_sec = bcd2bin(buff[0]);
+ tm->tm_min = bcd2bin(buff[1]);
+
+ if (ctrl1 & CTRL1_24HR)
+ tm->tm_hour = bcd2bin(buff[2]);
+ else
+ tm->tm_hour = rtc5t619_12hour_bcd2bin(buff[2]);
+
+ tm->tm_wday = bcd2bin(buff[3]);
+ tm->tm_mday = bcd2bin(buff[4]);
+ tm->tm_mon = bcd2bin(buff[5] & 0x1f) - 1; /* back to system 0-11 */
+ tm->tm_year = bcd2bin(buff[6]) + 100 * cent_flag;
+
+ return 0;
+}
+
+static int rc5t619_rtc_set_time(struct device *dev, struct rtc_time *tm)
+{
+ struct rc5t619_rtc *rtc = dev_get_drvdata(dev);
+ u8 buff[7];
+ int err;
+ int cent_flag;
+ unsigned int ctrl1;
+ unsigned int ctrl2;
+
+ err = regmap_read(rtc->rn5t618->regmap, RN5T618_RTC_CTRL2, &ctrl2);
+ if (err < 0)
+ return err;
+
+ if (ctrl2 & CTRL2_PON)
+ rc5t619_rtc_pon_setup(dev);
+
+ err = regmap_read(rtc->rn5t618->regmap, RN5T618_RTC_CTRL1, &ctrl1);
+ if (err < 0)
+ return err;
+
+ if (tm->tm_year >= 100)
+ cent_flag = 1;
+ else
+ cent_flag = 0;
+
+ buff[0] = bin2bcd(tm->tm_sec);
+ buff[1] = bin2bcd(tm->tm_min);
+
+ if (ctrl1 & CTRL1_24HR)
+ buff[2] = bin2bcd(tm->tm_hour);
+ else
+ buff[2] = rtc5t619_12hour_bin2bcd(tm->tm_hour);
+
+ buff[3] = bin2bcd(tm->tm_wday);
+ buff[4] = bin2bcd(tm->tm_mday);
+ buff[5] = bin2bcd(tm->tm_mon + 1); /* system set 0-11 */
+ buff[6] = bin2bcd(tm->tm_year - cent_flag * 100);
+
+ if (cent_flag)
+ buff[5] |= MONTH_CENTFLAG;
+
+ err = regmap_bulk_write(rtc->rn5t618->regmap, RN5T618_RTC_SECONDS,
+ buff, sizeof(buff));
+ if (err < 0) {
+ dev_err(dev, "failed to program new time: %d\n", err);
+ return err;
+ }
+
+ return 0;
+}
+
+/* 0-disable, 1-enable */
+static int rc5t619_rtc_alarm_enable(struct device *dev, unsigned int enabled)
+{
+ struct rc5t619_rtc *rtc = dev_get_drvdata(dev);
+
+ return regmap_update_bits(rtc->rn5t618->regmap,
+ RN5T618_RTC_CTRL1,
+ CTRL1_ALARM_ENABLED,
+ enabled ? CTRL1_ALARM_ENABLED : 0);
+}
+
+static int rc5t619_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
+{
+ struct rc5t619_rtc *rtc = dev_get_drvdata(dev);
+ u8 buff[6];
+ unsigned int buff_cent;
+ int err;
+ int cent_flag;
+ unsigned int ctrl1;
+
+ err = regmap_read(rtc->rn5t618->regmap, RN5T618_RTC_CTRL1, &ctrl1);
+ if (err)
+ return err;
+
+ err = regmap_read(rtc->rn5t618->regmap, RN5T618_RTC_MONTH, &buff_cent);
+ if (err < 0) {
+ dev_err(dev, "failed to read time: %d\n", err);
+ return err;
+ }
+
+ if (buff_cent & MONTH_CENTFLAG)
+ cent_flag = 1;
+ else
+ cent_flag = 0;
+
+ err = regmap_bulk_read(rtc->rn5t618->regmap, RN5T618_RTC_ALARM_Y_SEC,
+ buff, sizeof(buff));
+ if (err)
+ return err;
+
+ buff[3] = buff[3] & 0x3f;
+
+ alrm->time.tm_sec = bcd2bin(buff[0]);
+ alrm->time.tm_min = bcd2bin(buff[1]);
+
+ if (ctrl1 & CTRL1_24HR)
+ alrm->time.tm_hour = bcd2bin(buff[2]);
+ else
+ alrm->time.tm_hour = rtc5t619_12hour_bcd2bin(buff[2]);
+
+ alrm->time.tm_mday = bcd2bin(buff[3]);
+ alrm->time.tm_mon = bcd2bin(buff[4]) - 1;
+ alrm->time.tm_year = bcd2bin(buff[5]) + 100 * cent_flag;
+ alrm->enabled = !!(ctrl1 & CTRL1_ALARM_ENABLED);
+ dev_dbg(dev, "read alarm: %ptR\n", &alrm->time);
+
+ return 0;
+}
+
+static int rc5t619_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
+{
+ struct rc5t619_rtc *rtc = dev_get_drvdata(dev);
+ u8 buff[6];
+ int err;
+ int cent_flag;
+ unsigned int ctrl1;
+
+ err = regmap_read(rtc->rn5t618->regmap, RN5T618_RTC_CTRL1, &ctrl1);
+ if (err)
+ return err;
+
+ err = rc5t619_rtc_alarm_enable(dev, 0);
+ if (err < 0)
+ return err;
+
+ if (rtc->irq == -1)
+ return -EINVAL;
+
+ if (alrm->enabled == 0)
+ return 0;
+
+ if (alrm->time.tm_year >= 100)
+ cent_flag = 1;
+ else
+ cent_flag = 0;
+
+ alrm->time.tm_mon += 1;
+ buff[0] = bin2bcd(alrm->time.tm_sec);
+ buff[1] = bin2bcd(alrm->time.tm_min);
+
+ if (ctrl1 & CTRL1_24HR)
+ buff[2] = bin2bcd(alrm->time.tm_hour);
+ else
+ buff[2] = rtc5t619_12hour_bin2bcd(alrm->time.tm_hour);
+
+ buff[3] = bin2bcd(alrm->time.tm_mday);
+ buff[4] = bin2bcd(alrm->time.tm_mon);
+ buff[5] = bin2bcd(alrm->time.tm_year - 100 * cent_flag);
+ buff[3] |= MDAY_DAL_EXT;
+
+ err = regmap_bulk_write(rtc->rn5t618->regmap, RN5T618_RTC_ALARM_Y_SEC,
+ buff, sizeof(buff));
+ if (err < 0)
+ return err;
+
+ return rc5t619_rtc_alarm_enable(dev, alrm->enabled);
+}
+
+static const struct rtc_class_ops rc5t619_rtc_ops = {
+ .read_time = rc5t619_rtc_read_time,
+ .set_time = rc5t619_rtc_set_time,
+ .set_alarm = rc5t619_rtc_set_alarm,
+ .read_alarm = rc5t619_rtc_read_alarm,
+ .alarm_irq_enable = rc5t619_rtc_alarm_enable,
+};
+
+static int rc5t619_rtc_alarm_flag_clr(struct device *dev)
+{
+ struct rc5t619_rtc *rtc = dev_get_drvdata(dev);
+
+ /* clear alarm-D status bits.*/
+ return regmap_update_bits(rtc->rn5t618->regmap,
+ RN5T618_RTC_CTRL2,
+ CTRL2_ALARM_STATUS | CTRL2_CTC, 0);
+}
+
+static irqreturn_t rc5t619_rtc_irq(int irq, void *data)
+{
+ struct device *dev = data;
+ struct rc5t619_rtc *rtc = dev_get_drvdata(dev);
+
+ rc5t619_rtc_alarm_flag_clr(dev);
+
+ rtc_update_irq(rtc->rtc, 1, RTC_IRQF | RTC_AF);
+ return IRQ_HANDLED;
+}
+
+static int rc5t619_rtc_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct rn5t618 *rn5t618 = dev_get_drvdata(pdev->dev.parent);
+ struct rc5t619_rtc *rtc;
+ unsigned int ctrl2;
+ int err;
+
+ rtc = devm_kzalloc(dev, sizeof(*rtc), GFP_KERNEL);
+ if (IS_ERR(rtc)) {
+ err = PTR_ERR(rtc);
+ return -ENOMEM;
+ }
+
+ rtc->rn5t618 = rn5t618;
+
+ dev_set_drvdata(dev, rtc);
+ rtc->irq = -1;
+
+ if (rn5t618->irq_data)
+ rtc->irq = regmap_irq_get_virq(rn5t618->irq_data,
+ RN5T618_IRQ_RTC);
+
+ if (rtc->irq < 0)
+ rtc->irq = -1;
+
+ err = regmap_read(rtc->rn5t618->regmap, RN5T618_RTC_CTRL2, &ctrl2);
+ if (err < 0)
+ return err;
+
+ /* disable rtc periodic function */
+ err = rc5t619_rtc_periodic_disable(&pdev->dev);
+ if (err)
+ return err;
+
+ if (ctrl2 & CTRL2_PON) {
+ err = rc5t619_rtc_alarm_flag_clr(&pdev->dev);
+ if (err)
+ return err;
+ }
+
+ rtc->rtc = devm_rtc_allocate_device(&pdev->dev);
+ if (IS_ERR(rtc->rtc)) {
+ err = PTR_ERR(rtc->rtc);
+ dev_err(dev, "RTC device register: err %d\n", err);
+ return err;
+ }
+
+ rtc->rtc->ops = &rc5t619_rtc_ops;
+ rtc->rtc->range_min = RTC_TIMESTAMP_BEGIN_1900;
+ rtc->rtc->range_max = RTC_TIMESTAMP_END_2099;
+
+ /* set interrupt and enable it */
+ if (rtc->irq != -1) {
+ err = devm_request_threaded_irq(&pdev->dev, rtc->irq, NULL,
+ rc5t619_rtc_irq,
+ IRQF_ONESHOT,
+ "rtc-rc5t619",
+ &pdev->dev);
+ if (err < 0) {
+ dev_err(&pdev->dev, "request IRQ:%d fail\n", rtc->irq);
+ rtc->irq = -1;
+
+ err = rc5t619_rtc_alarm_enable(&pdev->dev, 0);
+ if (err)
+ return err;
+
+ } else {
+ /* enable wake */
+ device_init_wakeup(&pdev->dev, 1);
+ enable_irq_wake(rtc->irq);
+ }
+ } else {
+ /* system don't want to using alarm interrupt, so close it */
+ err = rc5t619_rtc_alarm_enable(&pdev->dev, 0);
+ if (err)
+ return err;
+
+ dev_warn(&pdev->dev, "rc5t619 interrupt is disabled\n");
+ }
+
+ return rtc_register_device(rtc->rtc);
+}
+
+static struct platform_driver rc5t619_rtc_driver = {
+ .driver = {
+ .name = "rc5t619-rtc",
+ },
+ .probe = rc5t619_rtc_probe,
+};
+
+module_platform_driver(rc5t619_rtc_driver);
+MODULE_ALIAS("platform:rc5t619-rtc");
+MODULE_DESCRIPTION("RICOH RC5T619 RTC driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 80d22290f268..384edffe5cb4 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -57,11 +57,26 @@ static size_t dcssblk_dax_copy_to_iter(struct dax_device *dax_dev,
return copy_to_iter(addr, bytes, i);
}
+static int dcssblk_dax_zero_page_range(struct dax_device *dax_dev,
+ pgoff_t pgoff, size_t nr_pages)
+{
+ long rc;
+ void *kaddr;
+
+ rc = dax_direct_access(dax_dev, pgoff, nr_pages, &kaddr, NULL);
+ if (rc < 0)
+ return rc;
+ memset(kaddr, 0, nr_pages << PAGE_SHIFT);
+ dax_flush(dax_dev, kaddr, nr_pages << PAGE_SHIFT);
+ return 0;
+}
+
static const struct dax_operations dcssblk_dax_ops = {
.direct_access = dcssblk_dax_direct_access,
.dax_supported = generic_fsdax_supported,
.copy_from_iter = dcssblk_dax_copy_from_iter,
.copy_to_iter = dcssblk_dax_copy_to_iter,
+ .zero_page_range = dcssblk_dax_zero_page_range,
};
struct dcssblk_dev_info {
@@ -680,8 +695,9 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
dev_info->dax_dev = alloc_dax(dev_info, dev_info->gd->disk_name,
&dcssblk_dax_ops, DAXDEV_F_SYNC);
- if (!dev_info->dax_dev) {
- rc = -ENOMEM;
+ if (IS_ERR(dev_info->dax_dev)) {
+ rc = PTR_ERR(dev_info->dax_dev);
+ dev_info->dax_dev = NULL;
goto put_dev;
}
diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c
index 50007cb9be5b..b29fe8d50baf 100644
--- a/drivers/s390/cio/device.c
+++ b/drivers/s390/cio/device.c
@@ -849,8 +849,10 @@ static void io_subchannel_register(struct ccw_device *cdev)
* Now we know this subchannel will stay, we can throw
* our delayed uevent.
*/
- dev_set_uevent_suppress(&sch->dev, 0);
- kobject_uevent(&sch->dev.kobj, KOBJ_ADD);
+ if (dev_get_uevent_suppress(&sch->dev)) {
+ dev_set_uevent_suppress(&sch->dev, 0);
+ kobject_uevent(&sch->dev.kobj, KOBJ_ADD);
+ }
/* make it known to the system */
ret = ccw_device_add(cdev);
if (ret) {
@@ -1058,8 +1060,11 @@ static int io_subchannel_probe(struct subchannel *sch)
* Throw the delayed uevent for the subchannel, register
* the ccw_device and exit.
*/
- dev_set_uevent_suppress(&sch->dev, 0);
- kobject_uevent(&sch->dev.kobj, KOBJ_ADD);
+ if (dev_get_uevent_suppress(&sch->dev)) {
+ /* should always be the case for the console */
+ dev_set_uevent_suppress(&sch->dev, 0);
+ kobject_uevent(&sch->dev.kobj, KOBJ_ADD);
+ }
cdev = sch_get_cdev(sch);
rc = ccw_device_add(cdev);
if (rc) {
diff --git a/drivers/s390/cio/qdio.h b/drivers/s390/cio/qdio.h
index b0beafc43d37..b8453b594679 100644
--- a/drivers/s390/cio/qdio.h
+++ b/drivers/s390/cio/qdio.h
@@ -374,7 +374,6 @@ int tiqdio_allocate_memory(void);
void tiqdio_free_memory(void);
int tiqdio_register_thinints(void);
void tiqdio_unregister_thinints(void);
-void clear_nonshared_ind(struct qdio_irq *);
int test_nonshared_ind(struct qdio_irq *);
/* prototypes for setup */
diff --git a/drivers/s390/cio/qdio_debug.c b/drivers/s390/cio/qdio_debug.c
index 5a3d9ee90a7f..286b044fb027 100644
--- a/drivers/s390/cio/qdio_debug.c
+++ b/drivers/s390/cio/qdio_debug.c
@@ -58,25 +58,11 @@ static void qdio_clear_dbf_list(void)
mutex_unlock(&qdio_dbf_list_mutex);
}
-int qdio_allocate_dbf(struct qdio_initialize *init_data,
- struct qdio_irq *irq_ptr)
+int qdio_allocate_dbf(struct qdio_irq *irq_ptr)
{
char text[QDIO_DBF_NAME_LEN];
struct qdio_dbf_entry *new_entry;
- DBF_EVENT("qfmt:%1d", init_data->q_format);
- DBF_HEX(init_data->adapter_name, 8);
- DBF_EVENT("qpff%4x", init_data->qib_param_field_format);
- DBF_HEX(&init_data->qib_param_field, sizeof(void *));
- DBF_HEX(&init_data->input_slib_elements, sizeof(void *));
- DBF_HEX(&init_data->output_slib_elements, sizeof(void *));
- DBF_EVENT("niq:%1d noq:%1d", init_data->no_input_qs,
- init_data->no_output_qs);
- DBF_HEX(&init_data->input_handler, sizeof(void *));
- DBF_HEX(&init_data->output_handler, sizeof(void *));
- DBF_HEX(&init_data->int_parm, sizeof(long));
- DBF_HEX(&init_data->input_sbal_addr_array, sizeof(void *));
- DBF_HEX(&init_data->output_sbal_addr_array, sizeof(void *));
DBF_EVENT("irq:%8lx", (unsigned long)irq_ptr);
/* allocate trace view for the interface */
diff --git a/drivers/s390/cio/qdio_debug.h b/drivers/s390/cio/qdio_debug.h
index 122450ba6b90..0dfba085f360 100644
--- a/drivers/s390/cio/qdio_debug.h
+++ b/drivers/s390/cio/qdio_debug.h
@@ -64,8 +64,7 @@ static inline void DBF_DEV_HEX(struct qdio_irq *dev, void *addr,
debug_event(dev->debug_area, level, addr, len);
}
-int qdio_allocate_dbf(struct qdio_initialize *init_data,
- struct qdio_irq *irq_ptr);
+int qdio_allocate_dbf(struct qdio_irq *irq_ptr);
void qdio_setup_debug_entries(struct qdio_irq *irq_ptr);
void qdio_shutdown_debug_entries(struct qdio_irq *irq_ptr);
int qdio_debug_init(void);
diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c
index c890848064fe..bcc3ab14e72d 100644
--- a/drivers/s390/cio/qdio_main.c
+++ b/drivers/s390/cio/qdio_main.c
@@ -1220,27 +1220,21 @@ EXPORT_SYMBOL_GPL(qdio_free);
/**
* qdio_allocate - allocate qdio queues and associated data
- * @init_data: initialization data
+ * @cdev: associated ccw device
+ * @no_input_qs: allocate this number of Input Queues
+ * @no_output_qs: allocate this number of Output Queues
*/
-int qdio_allocate(struct qdio_initialize *init_data)
+int qdio_allocate(struct ccw_device *cdev, unsigned int no_input_qs,
+ unsigned int no_output_qs)
{
- struct ccw_device *cdev = init_data->cdev;
struct subchannel_id schid;
struct qdio_irq *irq_ptr;
ccw_device_get_schid(cdev, &schid);
DBF_EVENT("qallocate:%4x", schid.sch_no);
- if ((init_data->no_input_qs && !init_data->input_handler) ||
- (init_data->no_output_qs && !init_data->output_handler))
- return -EINVAL;
-
- if ((init_data->no_input_qs > QDIO_MAX_QUEUES_PER_IRQ) ||
- (init_data->no_output_qs > QDIO_MAX_QUEUES_PER_IRQ))
- return -EINVAL;
-
- if ((!init_data->input_sbal_addr_array) ||
- (!init_data->output_sbal_addr_array))
+ if (no_input_qs > QDIO_MAX_QUEUES_PER_IRQ ||
+ no_output_qs > QDIO_MAX_QUEUES_PER_IRQ)
return -EINVAL;
/* irq_ptr must be in GFP_DMA since it contains ccw1.cda */
@@ -1250,9 +1244,12 @@ int qdio_allocate(struct qdio_initialize *init_data)
irq_ptr->cdev = cdev;
mutex_init(&irq_ptr->setup_mutex);
- if (qdio_allocate_dbf(init_data, irq_ptr))
+ if (qdio_allocate_dbf(irq_ptr))
goto out_rel;
+ DBF_DEV_EVENT(DBF_ERR, irq_ptr, "alloc niq:%1u noq:%1u", no_input_qs,
+ no_output_qs);
+
/*
* Allocate a page for the chsc calls in qdio_establish.
* Must be pre-allocated since a zfcp recovery will call
@@ -1268,8 +1265,7 @@ int qdio_allocate(struct qdio_initialize *init_data)
if (!irq_ptr->qdr)
goto out_rel;
- if (qdio_allocate_qs(irq_ptr, init_data->no_input_qs,
- init_data->no_output_qs))
+ if (qdio_allocate_qs(irq_ptr, no_input_qs, no_output_qs))
goto out_rel;
INIT_LIST_HEAD(&irq_ptr->entry);
@@ -1305,13 +1301,33 @@ static void qdio_detect_hsicq(struct qdio_irq *irq_ptr)
DBF_EVENT("use_cq:%d", use_cq);
}
+static void qdio_trace_init_data(struct qdio_irq *irq,
+ struct qdio_initialize *data)
+{
+ DBF_DEV_EVENT(DBF_ERR, irq, "qfmt:%1u", data->q_format);
+ DBF_DEV_HEX(irq, data->adapter_name, 8, DBF_ERR);
+ DBF_DEV_EVENT(DBF_ERR, irq, "qpff%4x", data->qib_param_field_format);
+ DBF_DEV_HEX(irq, &data->qib_param_field, sizeof(void *), DBF_ERR);
+ DBF_DEV_HEX(irq, &data->input_slib_elements, sizeof(void *), DBF_ERR);
+ DBF_DEV_HEX(irq, &data->output_slib_elements, sizeof(void *), DBF_ERR);
+ DBF_DEV_EVENT(DBF_ERR, irq, "niq:%1u noq:%1u", data->no_input_qs,
+ data->no_output_qs);
+ DBF_DEV_HEX(irq, &data->input_handler, sizeof(void *), DBF_ERR);
+ DBF_DEV_HEX(irq, &data->output_handler, sizeof(void *), DBF_ERR);
+ DBF_DEV_HEX(irq, &data->int_parm, sizeof(long), DBF_ERR);
+ DBF_DEV_HEX(irq, &data->input_sbal_addr_array, sizeof(void *), DBF_ERR);
+ DBF_DEV_HEX(irq, &data->output_sbal_addr_array, sizeof(void *),
+ DBF_ERR);
+}
+
/**
* qdio_establish - establish queues on a qdio subchannel
+ * @cdev: associated ccw device
* @init_data: initialization data
*/
-int qdio_establish(struct qdio_initialize *init_data)
+int qdio_establish(struct ccw_device *cdev,
+ struct qdio_initialize *init_data)
{
- struct ccw_device *cdev = init_data->cdev;
struct qdio_irq *irq_ptr = cdev->private->qdio_data;
struct subchannel_id schid;
int rc;
@@ -1322,7 +1338,16 @@ int qdio_establish(struct qdio_initialize *init_data)
if (!irq_ptr)
return -ENODEV;
+ if ((init_data->no_input_qs && !init_data->input_handler) ||
+ (init_data->no_output_qs && !init_data->output_handler))
+ return -EINVAL;
+
+ if (!init_data->input_sbal_addr_array ||
+ !init_data->output_sbal_addr_array)
+ return -EINVAL;
+
mutex_lock(&irq_ptr->setup_mutex);
+ qdio_trace_init_data(irq_ptr, init_data);
qdio_setup_irq(irq_ptr, init_data);
rc = qdio_establish_thinint(irq_ptr);
@@ -1618,8 +1643,6 @@ int qdio_start_irq(struct ccw_device *cdev)
if (!irq_ptr)
return -ENODEV;
- clear_nonshared_ind(irq_ptr);
-
for_each_input_queue(irq_ptr, q, i)
qdio_stop_polling(q);
diff --git a/drivers/s390/cio/qdio_setup.c b/drivers/s390/cio/qdio_setup.c
index bbbefc9f9e04..3083edd61f0c 100644
--- a/drivers/s390/cio/qdio_setup.c
+++ b/drivers/s390/cio/qdio_setup.c
@@ -213,8 +213,6 @@ static void setup_queues(struct qdio_irq *irq_ptr,
struct qdio_initialize *qdio_init)
{
struct qdio_q *q;
- struct qdio_buffer **input_sbal_array = qdio_init->input_sbal_addr_array;
- struct qdio_buffer **output_sbal_array = qdio_init->output_sbal_addr_array;
struct qdio_outbuf_state *output_sbal_state_array =
qdio_init->output_sbal_state_array;
int i;
@@ -225,8 +223,8 @@ static void setup_queues(struct qdio_irq *irq_ptr,
q->is_input_q = 1;
- setup_storage_lists(q, irq_ptr, input_sbal_array, i);
- input_sbal_array += QDIO_MAX_BUFFERS_PER_Q;
+ setup_storage_lists(q, irq_ptr,
+ qdio_init->input_sbal_addr_array[i], i);
if (is_thinint_irq(irq_ptr)) {
tasklet_init(&q->tasklet, tiqdio_inbound_processing,
@@ -245,8 +243,8 @@ static void setup_queues(struct qdio_irq *irq_ptr,
output_sbal_state_array += QDIO_MAX_BUFFERS_PER_Q;
q->is_input_q = 0;
- setup_storage_lists(q, irq_ptr, output_sbal_array, i);
- output_sbal_array += QDIO_MAX_BUFFERS_PER_Q;
+ setup_storage_lists(q, irq_ptr,
+ qdio_init->output_sbal_addr_array[i], i);
tasklet_init(&q->tasklet, qdio_outbound_processing,
(unsigned long) q);
diff --git a/drivers/s390/cio/qdio_thinint.c b/drivers/s390/cio/qdio_thinint.c
index ea09df7209f0..ae50373617cd 100644
--- a/drivers/s390/cio/qdio_thinint.c
+++ b/drivers/s390/cio/qdio_thinint.c
@@ -82,36 +82,16 @@ void tiqdio_remove_device(struct qdio_irq *irq_ptr)
INIT_LIST_HEAD(&irq_ptr->entry);
}
-static inline int has_multiple_inq_on_dsci(struct qdio_irq *irq_ptr)
-{
- return irq_ptr->nr_input_qs > 1;
-}
-
static inline int references_shared_dsci(struct qdio_irq *irq_ptr)
{
return irq_ptr->dsci == &q_indicators[TIQDIO_SHARED_IND].ind;
}
-static inline int shared_ind(struct qdio_irq *irq_ptr)
-{
- return references_shared_dsci(irq_ptr) ||
- has_multiple_inq_on_dsci(irq_ptr);
-}
-
-void clear_nonshared_ind(struct qdio_irq *irq_ptr)
-{
- if (!is_thinint_irq(irq_ptr))
- return;
- if (shared_ind(irq_ptr))
- return;
- xchg(irq_ptr->dsci, 0);
-}
-
int test_nonshared_ind(struct qdio_irq *irq_ptr)
{
if (!is_thinint_irq(irq_ptr))
return 0;
- if (shared_ind(irq_ptr))
+ if (references_shared_dsci(irq_ptr))
return 0;
if (*irq_ptr->dsci)
return 1;
@@ -131,8 +111,7 @@ static inline void tiqdio_call_inq_handlers(struct qdio_irq *irq)
struct qdio_q *q;
int i;
- if (!references_shared_dsci(irq) &&
- has_multiple_inq_on_dsci(irq))
+ if (!references_shared_dsci(irq))
xchg(irq->dsci, 0);
if (irq->irq_poll) {
@@ -145,9 +124,6 @@ static inline void tiqdio_call_inq_handlers(struct qdio_irq *irq)
}
for_each_input_queue(irq, q, i) {
- if (!shared_ind(irq))
- xchg(irq->dsci, 0);
-
/*
* Call inbound processing but not directly
* since that could starve other thinint queues.
diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c
index e401a3d0aa57..339a6bc0339b 100644
--- a/drivers/s390/cio/vfio_ccw_drv.c
+++ b/drivers/s390/cio/vfio_ccw_drv.c
@@ -167,6 +167,11 @@ static int vfio_ccw_sch_probe(struct subchannel *sch)
if (ret)
goto out_disable;
+ if (dev_get_uevent_suppress(&sch->dev)) {
+ dev_set_uevent_suppress(&sch->dev, 0);
+ kobject_uevent(&sch->dev.kobj, KOBJ_ADD);
+ }
+
VFIO_CCW_MSG_EVENT(4, "bound to subchannel %x.%x.%04x\n",
sch->schid.cssid, sch->schid.ssid,
sch->schid.sch_no);
diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h
index acda230323d5..e0b26310ecab 100644
--- a/drivers/s390/net/qeth_core.h
+++ b/drivers/s390/net/qeth_core.h
@@ -181,11 +181,12 @@ struct qeth_vnicc_info {
/*****************************************************************************/
/* QDIO queue and buffer handling */
/*****************************************************************************/
-#define QETH_MAX_QUEUES 4
+#define QETH_MAX_OUT_QUEUES 4
#define QETH_IQD_MIN_TXQ 2 /* One for ucast, one for mcast. */
#define QETH_IQD_MCAST_TXQ 0
#define QETH_IQD_MIN_UCAST_TXQ 1
+#define QETH_MAX_IN_QUEUES 2
#define QETH_RX_COPYBREAK (PAGE_SIZE >> 1)
#define QETH_IN_BUF_SIZE_DEFAULT 65536
#define QETH_IN_BUF_COUNT_DEFAULT 64
@@ -539,7 +540,7 @@ struct qeth_qdio_info {
/* output */
int no_out_queues;
- struct qeth_qdio_out_q *out_qs[QETH_MAX_QUEUES];
+ struct qeth_qdio_out_q *out_qs[QETH_MAX_OUT_QUEUES];
struct qdio_outbuf_state *out_bufstates;
/* priority queueing */
diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c
index 24fd17b347fe..f7689461c242 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -4812,28 +4812,13 @@ out:
return;
}
-static void qeth_qdio_establish_cq(struct qeth_card *card,
- struct qdio_buffer **in_sbal_ptrs)
-{
- int i;
-
- if (card->options.cq == QETH_CQ_ENABLED) {
- int offset = QDIO_MAX_BUFFERS_PER_Q *
- (card->qdio.no_in_queues - 1);
-
- for (i = 0; i < QDIO_MAX_BUFFERS_PER_Q; i++)
- in_sbal_ptrs[offset + i] =
- card->qdio.c_q->bufs[i].buffer;
- }
-}
-
static int qeth_qdio_establish(struct qeth_card *card)
{
+ struct qdio_buffer **out_sbal_ptrs[QETH_MAX_OUT_QUEUES];
+ struct qdio_buffer **in_sbal_ptrs[QETH_MAX_IN_QUEUES];
struct qdio_initialize init_data;
char *qib_param_field;
- struct qdio_buffer **in_sbal_ptrs;
- struct qdio_buffer **out_sbal_ptrs;
- int i, j, k;
+ unsigned int i;
int rc = 0;
QETH_CARD_TEXT(card, 2, "qdioest");
@@ -4847,35 +4832,14 @@ static int qeth_qdio_establish(struct qeth_card *card)
qeth_create_qib_param_field(card, qib_param_field);
qeth_create_qib_param_field_blkt(card, qib_param_field);
- in_sbal_ptrs = kcalloc(card->qdio.no_in_queues * QDIO_MAX_BUFFERS_PER_Q,
- sizeof(void *),
- GFP_KERNEL);
- if (!in_sbal_ptrs) {
- rc = -ENOMEM;
- goto out_free_qib_param;
- }
-
- for (i = 0; i < QDIO_MAX_BUFFERS_PER_Q; i++)
- in_sbal_ptrs[i] = card->qdio.in_q->bufs[i].buffer;
-
- qeth_qdio_establish_cq(card, in_sbal_ptrs);
-
- out_sbal_ptrs =
- kcalloc(card->qdio.no_out_queues * QDIO_MAX_BUFFERS_PER_Q,
- sizeof(void *),
- GFP_KERNEL);
- if (!out_sbal_ptrs) {
- rc = -ENOMEM;
- goto out_free_in_sbals;
- }
+ in_sbal_ptrs[0] = card->qdio.in_q->qdio_bufs;
+ if (card->options.cq == QETH_CQ_ENABLED)
+ in_sbal_ptrs[1] = card->qdio.c_q->qdio_bufs;
- for (i = 0, k = 0; i < card->qdio.no_out_queues; ++i)
- for (j = 0; j < QDIO_MAX_BUFFERS_PER_Q; j++, k++)
- out_sbal_ptrs[k] =
- card->qdio.out_qs[i]->bufs[j]->buffer;
+ for (i = 0; i < card->qdio.no_out_queues; i++)
+ out_sbal_ptrs[i] = card->qdio.out_qs[i]->qdio_bufs;
memset(&init_data, 0, sizeof(struct qdio_initialize));
- init_data.cdev = CARD_DDEV(card);
init_data.q_format = IS_IQD(card) ? QDIO_IQDIO_QFMT :
QDIO_QETH_QFMT;
init_data.qib_param_field_format = 0;
@@ -4893,12 +4857,13 @@ static int qeth_qdio_establish(struct qeth_card *card)
if (atomic_cmpxchg(&card->qdio.state, QETH_QDIO_ALLOCATED,
QETH_QDIO_ESTABLISHED) == QETH_QDIO_ALLOCATED) {
- rc = qdio_allocate(&init_data);
+ rc = qdio_allocate(CARD_DDEV(card), init_data.no_input_qs,
+ init_data.no_output_qs);
if (rc) {
atomic_set(&card->qdio.state, QETH_QDIO_ALLOCATED);
goto out;
}
- rc = qdio_establish(&init_data);
+ rc = qdio_establish(CARD_DDEV(card), &init_data);
if (rc) {
atomic_set(&card->qdio.state, QETH_QDIO_ALLOCATED);
qdio_free(CARD_DDEV(card));
@@ -4916,10 +4881,6 @@ static int qeth_qdio_establish(struct qeth_card *card)
break;
}
out:
- kfree(out_sbal_ptrs);
-out_free_in_sbals:
- kfree(in_sbal_ptrs);
-out_free_qib_param:
kfree(qib_param_field);
out_free_nothing:
return rc;
@@ -5985,7 +5946,7 @@ static struct net_device *qeth_alloc_netdev(struct qeth_card *card)
switch (card->info.type) {
case QETH_CARD_TYPE_IQD:
dev = alloc_netdev_mqs(sizeof(*priv), "hsi%d", NET_NAME_UNKNOWN,
- ether_setup, QETH_MAX_QUEUES, 1);
+ ether_setup, QETH_MAX_OUT_QUEUES, 1);
break;
case QETH_CARD_TYPE_OSM:
dev = alloc_etherdev(sizeof(*priv));
@@ -5995,7 +5956,7 @@ static struct net_device *qeth_alloc_netdev(struct qeth_card *card)
ether_setup);
break;
default:
- dev = alloc_etherdev_mqs(sizeof(*priv), QETH_MAX_QUEUES, 1);
+ dev = alloc_etherdev_mqs(sizeof(*priv), QETH_MAX_OUT_QUEUES, 1);
}
if (!dev)
diff --git a/drivers/s390/scsi/zfcp_erp.c b/drivers/s390/scsi/zfcp_erp.c
index 18a6751299f9..3d0bc000f500 100644
--- a/drivers/s390/scsi/zfcp_erp.c
+++ b/drivers/s390/scsi/zfcp_erp.c
@@ -178,12 +178,12 @@ static enum zfcp_erp_act_type zfcp_erp_required_act(enum zfcp_erp_act_type want,
return 0;
if (!(p_status & ZFCP_STATUS_COMMON_UNBLOCKED))
need = ZFCP_ERP_ACTION_REOPEN_PORT;
- /* fall through */
+ fallthrough;
case ZFCP_ERP_ACTION_REOPEN_PORT_FORCED:
p_status = atomic_read(&port->status);
if (!(p_status & ZFCP_STATUS_COMMON_OPEN))
need = ZFCP_ERP_ACTION_REOPEN_PORT;
- /* fall through */
+ fallthrough;
case ZFCP_ERP_ACTION_REOPEN_PORT:
p_status = atomic_read(&port->status);
if (p_status & ZFCP_STATUS_COMMON_ERP_INUSE)
@@ -196,7 +196,7 @@ static enum zfcp_erp_act_type zfcp_erp_required_act(enum zfcp_erp_act_type want,
return need;
if (!(a_status & ZFCP_STATUS_COMMON_UNBLOCKED))
need = ZFCP_ERP_ACTION_REOPEN_ADAPTER;
- /* fall through */
+ fallthrough;
case ZFCP_ERP_ACTION_REOPEN_ADAPTER:
a_status = atomic_read(&adapter->status);
if (a_status & ZFCP_STATUS_COMMON_ERP_INUSE)
@@ -1086,7 +1086,7 @@ static enum zfcp_erp_act_result zfcp_erp_lun_strategy(
if (atomic_read(&zfcp_sdev->status) & ZFCP_STATUS_COMMON_OPEN)
return zfcp_erp_lun_strategy_close(erp_action);
/* already closed */
- /* fall through */
+ fallthrough;
case ZFCP_ERP_STEP_LUN_CLOSING:
if (atomic_read(&zfcp_sdev->status) & ZFCP_STATUS_COMMON_OPEN)
return ZFCP_ERP_FAILED;
@@ -1415,7 +1415,7 @@ static void zfcp_erp_action_cleanup(struct zfcp_erp_action *act,
if (act->step != ZFCP_ERP_STEP_UNINITIALIZED)
if (result == ZFCP_ERP_SUCCEEDED)
zfcp_erp_try_rport_unblock(port);
- /* fall through */
+ fallthrough;
case ZFCP_ERP_ACTION_REOPEN_PORT_FORCED:
put_device(&port->dev);
break;
diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c
index 7c603e5b5b19..111fe3fc32d7 100644
--- a/drivers/s390/scsi/zfcp_fsf.c
+++ b/drivers/s390/scsi/zfcp_fsf.c
@@ -564,7 +564,7 @@ static int zfcp_fsf_exchange_config_evaluate(struct zfcp_fsf_req *req)
case FSF_TOPO_AL:
fc_host_port_type(shost) = FC_PORTTYPE_NLPORT;
fc_host_fabric_name(shost) = 0;
- /* fall through */
+ fallthrough;
default:
fc_host_fabric_name(shost) = 0;
dev_err(&adapter->ccw_device->dev,
@@ -1032,7 +1032,7 @@ static void zfcp_fsf_abort_fcp_command_handler(struct zfcp_fsf_req *req)
switch (fsq->word[0]) {
case FSF_SQ_INVOKE_LINK_TEST_PROCEDURE:
zfcp_fc_test_link(zfcp_sdev->port);
- /* fall through */
+ fallthrough;
case FSF_SQ_ULP_DEPENDENT_ERP_REQUIRED:
req->status |= ZFCP_STATUS_FSFREQ_ERROR;
break;
@@ -1127,7 +1127,7 @@ static void zfcp_fsf_send_ct_handler(struct zfcp_fsf_req *req)
break;
case FSF_PORT_HANDLE_NOT_VALID:
zfcp_erp_adapter_reopen(adapter, 0, "fsscth1");
- /* fall through */
+ fallthrough;
case FSF_GENERIC_COMMAND_REJECTED:
case FSF_PAYLOAD_SIZE_MISMATCH:
case FSF_REQUEST_SIZE_TOO_LARGE:
@@ -1313,7 +1313,7 @@ static void zfcp_fsf_send_els_handler(struct zfcp_fsf_req *req)
break;
case FSF_SBAL_MISMATCH:
/* should never occur, avoided in zfcp_fsf_send_els */
- /* fall through */
+ fallthrough;
default:
req->status |= ZFCP_STATUS_FSFREQ_ERROR;
break;
@@ -1736,7 +1736,7 @@ static void zfcp_fsf_open_port_handler(struct zfcp_fsf_req *req)
switch (header->fsf_status_qual.word[0]) {
case FSF_SQ_INVOKE_LINK_TEST_PROCEDURE:
/* no zfcp_fc_test_link() with failed open port */
- /* fall through */
+ fallthrough;
case FSF_SQ_ULP_DEPENDENT_ERP_REQUIRED:
case FSF_SQ_NO_RETRY_POSSIBLE:
req->status |= ZFCP_STATUS_FSFREQ_ERROR;
@@ -1909,14 +1909,14 @@ static void zfcp_fsf_open_wka_port_handler(struct zfcp_fsf_req *req)
case FSF_MAXIMUM_NUMBER_OF_PORTS_EXCEEDED:
dev_warn(&req->adapter->ccw_device->dev,
"Opening WKA port 0x%x failed\n", wka_port->d_id);
- /* fall through */
+ fallthrough;
case FSF_ADAPTER_STATUS_AVAILABLE:
req->status |= ZFCP_STATUS_FSFREQ_ERROR;
wka_port->status = ZFCP_FC_WKA_PORT_OFFLINE;
break;
case FSF_GOOD:
wka_port->handle = header->port_handle;
- /* fall through */
+ fallthrough;
case FSF_PORT_ALREADY_OPEN:
wka_port->status = ZFCP_FC_WKA_PORT_ONLINE;
}
@@ -2059,7 +2059,6 @@ static void zfcp_fsf_close_physical_port_handler(struct zfcp_fsf_req *req)
case FSF_ADAPTER_STATUS_AVAILABLE:
switch (header->fsf_status_qual.word[0]) {
case FSF_SQ_INVOKE_LINK_TEST_PROCEDURE:
- /* fall through */
case FSF_SQ_ULP_DEPENDENT_ERP_REQUIRED:
req->status |= ZFCP_STATUS_FSFREQ_ERROR;
break;
@@ -2144,7 +2143,7 @@ static void zfcp_fsf_open_lun_handler(struct zfcp_fsf_req *req)
case FSF_PORT_HANDLE_NOT_VALID:
zfcp_erp_adapter_reopen(adapter, 0, "fsouh_1");
- /* fall through */
+ fallthrough;
case FSF_LUN_ALREADY_OPEN:
break;
case FSF_PORT_BOXED:
@@ -2175,7 +2174,7 @@ static void zfcp_fsf_open_lun_handler(struct zfcp_fsf_req *req)
(unsigned long long)zfcp_scsi_dev_lun(sdev),
(unsigned long long)zfcp_sdev->port->wwpn);
zfcp_erp_set_lun_status(sdev, ZFCP_STATUS_COMMON_ERP_FAILED);
- /* fall through */
+ fallthrough;
case FSF_INVALID_COMMAND_OPTION:
req->status |= ZFCP_STATUS_FSFREQ_ERROR;
break;
@@ -2183,7 +2182,7 @@ static void zfcp_fsf_open_lun_handler(struct zfcp_fsf_req *req)
switch (header->fsf_status_qual.word[0]) {
case FSF_SQ_INVOKE_LINK_TEST_PROCEDURE:
zfcp_fc_test_link(zfcp_sdev->port);
- /* fall through */
+ fallthrough;
case FSF_SQ_ULP_DEPENDENT_ERP_REQUIRED:
req->status |= ZFCP_STATUS_FSFREQ_ERROR;
break;
@@ -2277,7 +2276,7 @@ static void zfcp_fsf_close_lun_handler(struct zfcp_fsf_req *req)
switch (req->qtcb->header.fsf_status_qual.word[0]) {
case FSF_SQ_INVOKE_LINK_TEST_PROCEDURE:
zfcp_fc_test_link(zfcp_sdev->port);
- /* fall through */
+ fallthrough;
case FSF_SQ_ULP_DEPENDENT_ERP_REQUIRED:
req->status |= ZFCP_STATUS_FSFREQ_ERROR;
break;
diff --git a/drivers/s390/scsi/zfcp_qdio.c b/drivers/s390/scsi/zfcp_qdio.c
index f0d6296e673b..26702b56a7ab 100644
--- a/drivers/s390/scsi/zfcp_qdio.c
+++ b/drivers/s390/scsi/zfcp_qdio.c
@@ -277,29 +277,6 @@ int zfcp_qdio_send(struct zfcp_qdio *qdio, struct zfcp_qdio_req *q_req)
return 0;
}
-
-static void zfcp_qdio_setup_init_data(struct qdio_initialize *id,
- struct zfcp_qdio *qdio)
-{
- memset(id, 0, sizeof(*id));
- id->cdev = qdio->adapter->ccw_device;
- id->q_format = QDIO_ZFCP_QFMT;
- memcpy(id->adapter_name, dev_name(&id->cdev->dev), 8);
- ASCEBC(id->adapter_name, 8);
- id->qib_rflags = QIB_RFLAGS_ENABLE_DATA_DIV;
- if (enable_multibuffer)
- id->qdr_ac |= QDR_AC_MULTI_BUFFER_ENABLE;
- id->no_input_qs = 1;
- id->no_output_qs = 1;
- id->input_handler = zfcp_qdio_int_resp;
- id->output_handler = zfcp_qdio_int_req;
- id->int_parm = (unsigned long) qdio;
- id->input_sbal_addr_array = qdio->res_q;
- id->output_sbal_addr_array = qdio->req_q;
- id->scan_threshold =
- QDIO_MAX_BUFFERS_PER_Q - ZFCP_QDIO_MAX_SBALS_PER_REQ * 2;
-}
-
/**
* zfcp_qdio_allocate - allocate queue memory and initialize QDIO data
* @qdio: pointer to struct zfcp_qdio
@@ -308,7 +285,6 @@ static void zfcp_qdio_setup_init_data(struct qdio_initialize *id,
*/
static int zfcp_qdio_allocate(struct zfcp_qdio *qdio)
{
- struct qdio_initialize init_data;
int ret;
ret = qdio_alloc_buffers(qdio->req_q, QDIO_MAX_BUFFERS_PER_Q);
@@ -319,10 +295,9 @@ static int zfcp_qdio_allocate(struct zfcp_qdio *qdio)
if (ret)
goto free_req_q;
- zfcp_qdio_setup_init_data(&init_data, qdio);
init_waitqueue_head(&qdio->req_q_wq);
- ret = qdio_allocate(&init_data);
+ ret = qdio_allocate(qdio->adapter->ccw_device, 1, 1);
if (ret)
goto free_res_q;
@@ -374,8 +349,10 @@ void zfcp_qdio_close(struct zfcp_qdio *qdio)
*/
int zfcp_qdio_open(struct zfcp_qdio *qdio)
{
+ struct qdio_buffer **input_sbals[1] = {qdio->res_q};
+ struct qdio_buffer **output_sbals[1] = {qdio->req_q};
struct qdio_buffer_element *sbale;
- struct qdio_initialize init_data;
+ struct qdio_initialize init_data = {0};
struct zfcp_adapter *adapter = qdio->adapter;
struct ccw_device *cdev = adapter->ccw_device;
struct qdio_ssqd_desc ssqd;
@@ -387,12 +364,26 @@ int zfcp_qdio_open(struct zfcp_qdio *qdio)
atomic_andnot(ZFCP_STATUS_ADAPTER_SIOSL_ISSUED,
&qdio->adapter->status);
- zfcp_qdio_setup_init_data(&init_data, qdio);
+ init_data.q_format = QDIO_ZFCP_QFMT;
+ memcpy(init_data.adapter_name, dev_name(&cdev->dev), 8);
+ ASCEBC(init_data.adapter_name, 8);
+ init_data.qib_rflags = QIB_RFLAGS_ENABLE_DATA_DIV;
+ if (enable_multibuffer)
+ init_data.qdr_ac |= QDR_AC_MULTI_BUFFER_ENABLE;
+ init_data.no_input_qs = 1;
+ init_data.no_output_qs = 1;
+ init_data.input_handler = zfcp_qdio_int_resp;
+ init_data.output_handler = zfcp_qdio_int_req;
+ init_data.int_parm = (unsigned long) qdio;
+ init_data.input_sbal_addr_array = input_sbals;
+ init_data.output_sbal_addr_array = output_sbals;
+ init_data.scan_threshold =
+ QDIO_MAX_BUFFERS_PER_Q - ZFCP_QDIO_MAX_SBALS_PER_REQ * 2;
- if (qdio_establish(&init_data))
+ if (qdio_establish(cdev, &init_data))
goto failed_establish;
- if (qdio_get_ssqd_desc(init_data.cdev, &ssqd))
+ if (qdio_get_ssqd_desc(cdev, &ssqd))
goto failed_qdio;
if (ssqd.qdioac2 & CHSC_AC2_DATA_DIV_ENABLED)
diff --git a/drivers/scsi/aacraid/commsup.c b/drivers/scsi/aacraid/commsup.c
index 4725e4c763cf..ddd73f6798af 100644
--- a/drivers/scsi/aacraid/commsup.c
+++ b/drivers/scsi/aacraid/commsup.c
@@ -1626,7 +1626,7 @@ out:
int aac_reset_adapter(struct aac_dev *aac, int forced, u8 reset_type)
{
unsigned long flagv = 0;
- int retval;
+ int retval, unblock_retval;
struct Scsi_Host *host = aac->scsi_host_ptr;
int bled;
@@ -1656,8 +1656,9 @@ int aac_reset_adapter(struct aac_dev *aac, int forced, u8 reset_type)
retval = _aac_reset_adapter(aac, bled, reset_type);
spin_unlock_irqrestore(host->host_lock, flagv);
- retval = scsi_host_unblock(host, SDEV_RUNNING);
-
+ unblock_retval = scsi_host_unblock(host, SDEV_RUNNING);
+ if (!retval)
+ retval = unblock_retval;
if ((forced < 2) && (retval == -ENODEV)) {
/* Unwind aac_send_shutdown() IOP_RESET unsupported/disabled */
struct fib * fibctx = aac_fib_alloc(aac);
diff --git a/drivers/scsi/aic7xxx/aic7xxx_core.c b/drivers/scsi/aic7xxx/aic7xxx_core.c
index 4190a025381a..84fc499cb1e6 100644
--- a/drivers/scsi/aic7xxx/aic7xxx_core.c
+++ b/drivers/scsi/aic7xxx/aic7xxx_core.c
@@ -1834,21 +1834,6 @@ ahc_handle_scsiint(struct ahc_softc *ahc, u_int intstat)
printerror = 0;
} else if (ahc_sent_msg(ahc, AHCMSG_1B,
MSG_BUS_DEV_RESET, TRUE)) {
-#ifdef __FreeBSD__
- /*
- * Don't mark the user's request for this BDR
- * as completing with CAM_BDR_SENT. CAM3
- * specifies CAM_REQ_CMP.
- */
- if (scb != NULL
- && scb->io_ctx->ccb_h.func_code== XPT_RESET_DEV
- && ahc_match_scb(ahc, scb, target, channel,
- CAM_LUN_WILDCARD,
- SCB_LIST_NULL,
- ROLE_INITIATOR)) {
- ahc_set_transaction_status(scb, CAM_REQ_CMP);
- }
-#endif
ahc_compile_devinfo(&devinfo,
initiator_role_id,
target,
@@ -4399,22 +4384,16 @@ ahc_alloc(void *platform_arg, char *name)
struct ahc_softc *ahc;
int i;
-#ifndef __FreeBSD__
ahc = kmalloc(sizeof(*ahc), GFP_ATOMIC);
if (!ahc) {
printk("aic7xxx: cannot malloc softc!\n");
kfree(name);
return NULL;
}
-#else
- ahc = device_get_softc((device_t)platform_arg);
-#endif
memset(ahc, 0, sizeof(*ahc));
ahc->seep_config = kmalloc(sizeof(*ahc->seep_config), GFP_ATOMIC);
if (ahc->seep_config == NULL) {
-#ifndef __FreeBSD__
kfree(ahc);
-#endif
kfree(name);
return (NULL);
}
@@ -4540,9 +4519,7 @@ ahc_free(struct ahc_softc *ahc)
kfree(ahc->name);
if (ahc->seep_config != NULL)
kfree(ahc->seep_config);
-#ifndef __FreeBSD__
kfree(ahc);
-#endif
return;
}
diff --git a/drivers/scsi/bnx2fc/bnx2fc.h b/drivers/scsi/bnx2fc/bnx2fc.h
index 3b84db8d13a9..b6e8ed757252 100644
--- a/drivers/scsi/bnx2fc/bnx2fc.h
+++ b/drivers/scsi/bnx2fc/bnx2fc.h
@@ -66,7 +66,7 @@
#include "bnx2fc_constants.h"
#define BNX2FC_NAME "bnx2fc"
-#define BNX2FC_VERSION "2.12.10"
+#define BNX2FC_VERSION "2.12.13"
#define PFX "bnx2fc: "
@@ -482,7 +482,10 @@ struct io_bdt {
struct bnx2fc_work {
struct list_head list;
struct bnx2fc_rport *tgt;
+ struct fcoe_task_ctx_entry *task;
+ unsigned char rq_data[BNX2FC_RQ_BUF_SZ];
u16 wqe;
+ u8 num_rq;
};
struct bnx2fc_unsol_els {
struct fc_lport *lport;
@@ -550,7 +553,7 @@ void bnx2fc_rport_event_handler(struct fc_lport *lport,
enum fc_rport_event event);
void bnx2fc_process_scsi_cmd_compl(struct bnx2fc_cmd *io_req,
struct fcoe_task_ctx_entry *task,
- u8 num_rq);
+ u8 num_rq, unsigned char *rq_data);
void bnx2fc_process_cleanup_compl(struct bnx2fc_cmd *io_req,
struct fcoe_task_ctx_entry *task,
u8 num_rq);
@@ -559,7 +562,7 @@ void bnx2fc_process_abts_compl(struct bnx2fc_cmd *io_req,
u8 num_rq);
void bnx2fc_process_tm_compl(struct bnx2fc_cmd *io_req,
struct fcoe_task_ctx_entry *task,
- u8 num_rq);
+ u8 num_rq, unsigned char *rq_data);
void bnx2fc_process_els_compl(struct bnx2fc_cmd *els_req,
struct fcoe_task_ctx_entry *task,
u8 num_rq);
@@ -577,7 +580,9 @@ struct fc_seq *bnx2fc_elsct_send(struct fc_lport *lport, u32 did,
void *arg, u32 timeout);
void bnx2fc_arm_cq(struct bnx2fc_rport *tgt);
int bnx2fc_process_new_cqes(struct bnx2fc_rport *tgt);
-void bnx2fc_process_cq_compl(struct bnx2fc_rport *tgt, u16 wqe);
+void bnx2fc_process_cq_compl(struct bnx2fc_rport *tgt, u16 wqe,
+ unsigned char *rq_data, u8 num_rq,
+ struct fcoe_task_ctx_entry *task);
struct bnx2fc_rport *bnx2fc_tgt_lookup(struct fcoe_port *port,
u32 port_id);
void bnx2fc_process_l2_frame_compl(struct bnx2fc_rport *tgt,
diff --git a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c
index b4bfab5edf8f..1cbb431fa682 100644
--- a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c
+++ b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c
@@ -660,7 +660,10 @@ static int bnx2fc_percpu_io_thread(void *arg)
list_for_each_entry_safe(work, tmp, &work_list, list) {
list_del_init(&work->list);
- bnx2fc_process_cq_compl(work->tgt, work->wqe);
+ bnx2fc_process_cq_compl(work->tgt, work->wqe,
+ work->rq_data,
+ work->num_rq,
+ work->task);
kfree(work);
}
@@ -2655,7 +2658,8 @@ static int bnx2fc_cpu_offline(unsigned int cpu)
/* Free all work in the list */
list_for_each_entry_safe(work, tmp, &p->work_list, list) {
list_del_init(&work->list);
- bnx2fc_process_cq_compl(work->tgt, work->wqe);
+ bnx2fc_process_cq_compl(work->tgt, work->wqe, work->rq_data,
+ work->num_rq, work->task);
kfree(work);
}
diff --git a/drivers/scsi/bnx2fc/bnx2fc_hwi.c b/drivers/scsi/bnx2fc/bnx2fc_hwi.c
index 6f8335ddb1f2..1f7c58b4c535 100644
--- a/drivers/scsi/bnx2fc/bnx2fc_hwi.c
+++ b/drivers/scsi/bnx2fc/bnx2fc_hwi.c
@@ -863,36 +863,22 @@ ret_warn_rqe:
}
}
-void bnx2fc_process_cq_compl(struct bnx2fc_rport *tgt, u16 wqe)
+void bnx2fc_process_cq_compl(struct bnx2fc_rport *tgt, u16 wqe,
+ unsigned char *rq_data, u8 num_rq,
+ struct fcoe_task_ctx_entry *task)
{
- struct fcoe_task_ctx_entry *task;
- struct fcoe_task_ctx_entry *task_page;
struct fcoe_port *port = tgt->port;
struct bnx2fc_interface *interface = port->priv;
struct bnx2fc_hba *hba = interface->hba;
struct bnx2fc_cmd *io_req;
- int task_idx, index;
+
u16 xid;
u8 cmd_type;
u8 rx_state = 0;
- u8 num_rq;
spin_lock_bh(&tgt->tgt_lock);
- xid = wqe & FCOE_PEND_WQ_CQE_TASK_ID;
- if (xid >= hba->max_tasks) {
- printk(KERN_ERR PFX "ERROR:xid out of range\n");
- spin_unlock_bh(&tgt->tgt_lock);
- return;
- }
- task_idx = xid / BNX2FC_TASKS_PER_PAGE;
- index = xid % BNX2FC_TASKS_PER_PAGE;
- task_page = (struct fcoe_task_ctx_entry *)hba->task_ctx[task_idx];
- task = &(task_page[index]);
-
- num_rq = ((task->rxwr_txrd.var_ctx.rx_flags &
- FCOE_TCE_RX_WR_TX_RD_VAR_NUM_RQ_WQE) >>
- FCOE_TCE_RX_WR_TX_RD_VAR_NUM_RQ_WQE_SHIFT);
+ xid = wqe & FCOE_PEND_WQ_CQE_TASK_ID;
io_req = (struct bnx2fc_cmd *)hba->cmd_mgr->cmds[xid];
if (io_req == NULL) {
@@ -912,7 +898,8 @@ void bnx2fc_process_cq_compl(struct bnx2fc_rport *tgt, u16 wqe)
switch (cmd_type) {
case BNX2FC_SCSI_CMD:
if (rx_state == FCOE_TASK_RX_STATE_COMPLETED) {
- bnx2fc_process_scsi_cmd_compl(io_req, task, num_rq);
+ bnx2fc_process_scsi_cmd_compl(io_req, task, num_rq,
+ rq_data);
spin_unlock_bh(&tgt->tgt_lock);
return;
}
@@ -929,7 +916,7 @@ void bnx2fc_process_cq_compl(struct bnx2fc_rport *tgt, u16 wqe)
case BNX2FC_TASK_MGMT_CMD:
BNX2FC_IO_DBG(io_req, "Processing TM complete\n");
- bnx2fc_process_tm_compl(io_req, task, num_rq);
+ bnx2fc_process_tm_compl(io_req, task, num_rq, rq_data);
break;
case BNX2FC_ABTS:
@@ -987,7 +974,9 @@ void bnx2fc_arm_cq(struct bnx2fc_rport *tgt)
}
-static struct bnx2fc_work *bnx2fc_alloc_work(struct bnx2fc_rport *tgt, u16 wqe)
+static struct bnx2fc_work *bnx2fc_alloc_work(struct bnx2fc_rport *tgt, u16 wqe,
+ unsigned char *rq_data, u8 num_rq,
+ struct fcoe_task_ctx_entry *task)
{
struct bnx2fc_work *work;
work = kzalloc(sizeof(struct bnx2fc_work), GFP_ATOMIC);
@@ -997,29 +986,87 @@ static struct bnx2fc_work *bnx2fc_alloc_work(struct bnx2fc_rport *tgt, u16 wqe)
INIT_LIST_HEAD(&work->list);
work->tgt = tgt;
work->wqe = wqe;
+ work->num_rq = num_rq;
+ work->task = task;
+ if (rq_data)
+ memcpy(work->rq_data, rq_data, BNX2FC_RQ_BUF_SZ);
+
return work;
}
/* Pending work request completion */
-static void bnx2fc_pending_work(struct bnx2fc_rport *tgt, unsigned int wqe)
+static bool bnx2fc_pending_work(struct bnx2fc_rport *tgt, unsigned int wqe)
{
unsigned int cpu = wqe % num_possible_cpus();
struct bnx2fc_percpu_s *fps;
struct bnx2fc_work *work;
+ struct fcoe_task_ctx_entry *task;
+ struct fcoe_task_ctx_entry *task_page;
+ struct fcoe_port *port = tgt->port;
+ struct bnx2fc_interface *interface = port->priv;
+ struct bnx2fc_hba *hba = interface->hba;
+ unsigned char *rq_data = NULL;
+ unsigned char rq_data_buff[BNX2FC_RQ_BUF_SZ];
+ int task_idx, index;
+ unsigned char *dummy;
+ u16 xid;
+ u8 num_rq;
+ int i;
+
+ xid = wqe & FCOE_PEND_WQ_CQE_TASK_ID;
+ if (xid >= hba->max_tasks) {
+ pr_err(PFX "ERROR:xid out of range\n");
+ return false;
+ }
+
+ task_idx = xid / BNX2FC_TASKS_PER_PAGE;
+ index = xid % BNX2FC_TASKS_PER_PAGE;
+ task_page = (struct fcoe_task_ctx_entry *)hba->task_ctx[task_idx];
+ task = &task_page[index];
+
+ num_rq = ((task->rxwr_txrd.var_ctx.rx_flags &
+ FCOE_TCE_RX_WR_TX_RD_VAR_NUM_RQ_WQE) >>
+ FCOE_TCE_RX_WR_TX_RD_VAR_NUM_RQ_WQE_SHIFT);
+
+ memset(rq_data_buff, 0, BNX2FC_RQ_BUF_SZ);
+
+ if (!num_rq)
+ goto num_rq_zero;
+
+ rq_data = bnx2fc_get_next_rqe(tgt, 1);
+
+ if (num_rq > 1) {
+ /* We do not need extra sense data */
+ for (i = 1; i < num_rq; i++)
+ dummy = bnx2fc_get_next_rqe(tgt, 1);
+ }
+
+ if (rq_data)
+ memcpy(rq_data_buff, rq_data, BNX2FC_RQ_BUF_SZ);
+
+ /* return RQ entries */
+ for (i = 0; i < num_rq; i++)
+ bnx2fc_return_rqe(tgt, 1);
+
+num_rq_zero:
fps = &per_cpu(bnx2fc_percpu, cpu);
spin_lock_bh(&fps->fp_work_lock);
if (fps->iothread) {
- work = bnx2fc_alloc_work(tgt, wqe);
+ work = bnx2fc_alloc_work(tgt, wqe, rq_data_buff,
+ num_rq, task);
if (work) {
list_add_tail(&work->list, &fps->work_list);
wake_up_process(fps->iothread);
spin_unlock_bh(&fps->fp_work_lock);
- return;
+ return true;
}
}
spin_unlock_bh(&fps->fp_work_lock);
- bnx2fc_process_cq_compl(tgt, wqe);
+ bnx2fc_process_cq_compl(tgt, wqe,
+ rq_data_buff, num_rq, task);
+
+ return true;
}
int bnx2fc_process_new_cqes(struct bnx2fc_rport *tgt)
@@ -1056,8 +1103,8 @@ int bnx2fc_process_new_cqes(struct bnx2fc_rport *tgt)
/* Unsolicited event notification */
bnx2fc_process_unsol_compl(tgt, wqe);
} else {
- bnx2fc_pending_work(tgt, wqe);
- num_free_sqes++;
+ if (bnx2fc_pending_work(tgt, wqe))
+ num_free_sqes++;
}
cqe++;
tgt->cq_cons_idx++;
diff --git a/drivers/scsi/bnx2fc/bnx2fc_io.c b/drivers/scsi/bnx2fc/bnx2fc_io.c
index 4c8122a82322..2b070f0835df 100644
--- a/drivers/scsi/bnx2fc/bnx2fc_io.c
+++ b/drivers/scsi/bnx2fc/bnx2fc_io.c
@@ -24,7 +24,7 @@ static void bnx2fc_unmap_sg_list(struct bnx2fc_cmd *io_req);
static void bnx2fc_free_mp_resc(struct bnx2fc_cmd *io_req);
static void bnx2fc_parse_fcp_rsp(struct bnx2fc_cmd *io_req,
struct fcoe_fcp_rsp_payload *fcp_rsp,
- u8 num_rq);
+ u8 num_rq, unsigned char *rq_data);
void bnx2fc_cmd_timer_set(struct bnx2fc_cmd *io_req,
unsigned int timer_msec)
@@ -1518,7 +1518,8 @@ static void bnx2fc_tgt_reset_cmpl(struct bnx2fc_cmd *io_req)
}
void bnx2fc_process_tm_compl(struct bnx2fc_cmd *io_req,
- struct fcoe_task_ctx_entry *task, u8 num_rq)
+ struct fcoe_task_ctx_entry *task, u8 num_rq,
+ unsigned char *rq_data)
{
struct bnx2fc_mp_req *tm_req;
struct fc_frame_header *fc_hdr;
@@ -1557,7 +1558,7 @@ void bnx2fc_process_tm_compl(struct bnx2fc_cmd *io_req,
if (fc_hdr->fh_r_ctl == FC_RCTL_DD_CMD_STATUS) {
bnx2fc_parse_fcp_rsp(io_req,
(struct fcoe_fcp_rsp_payload *)
- rsp_buf, num_rq);
+ rsp_buf, num_rq, rq_data);
if (io_req->fcp_rsp_code == 0) {
/* TM successful */
if (tm_req->tm_flags & FCP_TMF_LUN_RESET)
@@ -1755,15 +1756,11 @@ void bnx2fc_build_fcp_cmnd(struct bnx2fc_cmd *io_req,
static void bnx2fc_parse_fcp_rsp(struct bnx2fc_cmd *io_req,
struct fcoe_fcp_rsp_payload *fcp_rsp,
- u8 num_rq)
+ u8 num_rq, unsigned char *rq_data)
{
struct scsi_cmnd *sc_cmd = io_req->sc_cmd;
- struct bnx2fc_rport *tgt = io_req->tgt;
u8 rsp_flags = fcp_rsp->fcp_flags.flags;
u32 rq_buff_len = 0;
- int i;
- unsigned char *rq_data;
- unsigned char *dummy;
int fcp_sns_len = 0;
int fcp_rsp_len = 0;
@@ -1809,14 +1806,6 @@ static void bnx2fc_parse_fcp_rsp(struct bnx2fc_cmd *io_req,
rq_buff_len = num_rq * BNX2FC_RQ_BUF_SZ;
}
- rq_data = bnx2fc_get_next_rqe(tgt, 1);
-
- if (num_rq > 1) {
- /* We do not need extra sense data */
- for (i = 1; i < num_rq; i++)
- dummy = bnx2fc_get_next_rqe(tgt, 1);
- }
-
/* fetch fcp_rsp_code */
if ((fcp_rsp_len == 4) || (fcp_rsp_len == 8)) {
/* Only for task management function */
@@ -1837,9 +1826,6 @@ static void bnx2fc_parse_fcp_rsp(struct bnx2fc_cmd *io_req,
if (fcp_sns_len)
memcpy(sc_cmd->sense_buffer, rq_data, fcp_sns_len);
- /* return RQ entries */
- for (i = 0; i < num_rq; i++)
- bnx2fc_return_rqe(tgt, 1);
}
}
@@ -1918,7 +1904,7 @@ exit_qcmd:
void bnx2fc_process_scsi_cmd_compl(struct bnx2fc_cmd *io_req,
struct fcoe_task_ctx_entry *task,
- u8 num_rq)
+ u8 num_rq, unsigned char *rq_data)
{
struct fcoe_fcp_rsp_payload *fcp_rsp;
struct bnx2fc_rport *tgt = io_req->tgt;
@@ -1931,6 +1917,12 @@ void bnx2fc_process_scsi_cmd_compl(struct bnx2fc_cmd *io_req,
/* we will not receive ABTS response for this IO */
BNX2FC_IO_DBG(io_req, "Timer context finished processing "
"this scsi cmd\n");
+ if (test_and_clear_bit(BNX2FC_FLAG_IO_CLEANUP,
+ &io_req->req_flags)) {
+ BNX2FC_IO_DBG(io_req,
+ "Actual completion after cleanup request cleaning up\n");
+ bnx2fc_process_cleanup_compl(io_req, task, num_rq);
+ }
return;
}
@@ -1950,7 +1942,7 @@ void bnx2fc_process_scsi_cmd_compl(struct bnx2fc_cmd *io_req,
&(task->rxwr_only.union_ctx.comp_info.fcp_rsp.payload);
/* parse fcp_rsp and obtain sense data from RQ if available */
- bnx2fc_parse_fcp_rsp(io_req, fcp_rsp, num_rq);
+ bnx2fc_parse_fcp_rsp(io_req, fcp_rsp, num_rq, rq_data);
if (!sc_cmd->SCp.ptr) {
printk(KERN_ERR PFX "SCp.ptr is NULL\n");
diff --git a/drivers/scsi/constants.c b/drivers/scsi/constants.c
index d4c2a2e4c5d4..84d73f57292b 100644
--- a/drivers/scsi/constants.c
+++ b/drivers/scsi/constants.c
@@ -404,7 +404,7 @@ static const char * const hostbyte_table[]={
"DID_ABORT", "DID_PARITY", "DID_ERROR", "DID_RESET", "DID_BAD_INTR",
"DID_PASSTHROUGH", "DID_SOFT_ERROR", "DID_IMM_RETRY", "DID_REQUEUE",
"DID_TRANSPORT_DISRUPTED", "DID_TRANSPORT_FAILFAST", "DID_TARGET_FAILURE",
-"DID_NEXUS_FAILURE" };
+"DID_NEXUS_FAILURE", "DID_ALLOC_FAILURE", "DID_MEDIUM_ERROR" };
static const char * const driverbyte_table[]={
"DRIVER_OK", "DRIVER_BUSY", "DRIVER_SOFT", "DRIVER_MEDIA", "DRIVER_ERROR",
diff --git a/drivers/scsi/libfc/fc_rport.c b/drivers/scsi/libfc/fc_rport.c
index da6e97d8dc3b..773c45af9387 100644
--- a/drivers/scsi/libfc/fc_rport.c
+++ b/drivers/scsi/libfc/fc_rport.c
@@ -632,6 +632,8 @@ static void fc_rport_error(struct fc_rport_priv *rdata, int err)
fc_rport_enter_ready(rdata);
break;
case RPORT_ST_PRLI:
+ fc_rport_enter_plogi(rdata);
+ break;
case RPORT_ST_ADISC:
fc_rport_enter_logo(rdata);
break;
@@ -1208,9 +1210,15 @@ static void fc_rport_prli_resp(struct fc_seq *sp, struct fc_frame *fp,
rjt = fc_frame_payload_get(fp, sizeof(*rjt));
if (!rjt)
FC_RPORT_DBG(rdata, "PRLI bad response\n");
- else
+ else {
FC_RPORT_DBG(rdata, "PRLI ELS rejected, reason %x expl %x\n",
rjt->er_reason, rjt->er_explan);
+ if (rjt->er_reason == ELS_RJT_UNAB &&
+ rjt->er_explan == ELS_EXPL_PLOGI_REQD) {
+ fc_rport_enter_plogi(rdata);
+ goto out;
+ }
+ }
fc_rport_error_retry(rdata, FC_EX_ELS_RJT);
}
diff --git a/drivers/scsi/lpfc/lpfc.h b/drivers/scsi/lpfc/lpfc.h
index 357fdec06bae..8e2a356911a9 100644
--- a/drivers/scsi/lpfc/lpfc.h
+++ b/drivers/scsi/lpfc/lpfc.h
@@ -207,8 +207,7 @@ typedef struct lpfc_vpd {
} rev;
struct {
#ifdef __BIG_ENDIAN_BITFIELD
- uint32_t rsvd3 :19; /* Reserved */
- uint32_t cdss : 1; /* Configure Data Security SLI */
+ uint32_t rsvd3 :20; /* Reserved */
uint32_t rsvd2 : 3; /* Reserved */
uint32_t cbg : 1; /* Configure BlockGuard */
uint32_t cmv : 1; /* Configure Max VPIs */
@@ -230,8 +229,7 @@ typedef struct lpfc_vpd {
uint32_t cmv : 1; /* Configure Max VPIs */
uint32_t cbg : 1; /* Configure BlockGuard */
uint32_t rsvd2 : 3; /* Reserved */
- uint32_t cdss : 1; /* Configure Data Security SLI */
- uint32_t rsvd3 :19; /* Reserved */
+ uint32_t rsvd3 :20; /* Reserved */
#endif
} sli3Feat;
} lpfc_vpd_t;
@@ -480,8 +478,8 @@ struct lpfc_vport {
struct dentry *debug_nodelist;
struct dentry *debug_nvmestat;
struct dentry *debug_scsistat;
- struct dentry *debug_nvmektime;
- struct dentry *debug_cpucheck;
+ struct dentry *debug_ioktime;
+ struct dentry *debug_hdwqstat;
struct dentry *vport_debugfs_root;
struct lpfc_debugfs_trc *disc_trc;
atomic_t disc_trc_cnt;
@@ -887,7 +885,6 @@ struct lpfc_hba {
#define LPFC_INITIALIZE_LINK 0 /* do normal init_link mbox */
#define LPFC_DELAY_INIT_LINK 1 /* layered driver hold off */
#define LPFC_DELAY_INIT_LINK_INDEFINITELY 2 /* wait, manual intervention */
- uint32_t cfg_enable_dss;
uint32_t cfg_fdmi_on;
#define LPFC_FDMI_NO_SUPPORT 0 /* FDMI not supported */
#define LPFC_FDMI_SUPPORT 1 /* FDMI supported? */
@@ -1156,8 +1153,6 @@ struct lpfc_hba {
uint32_t iocb_cnt;
uint32_t iocb_max;
atomic_t sdev_cnt;
- uint8_t fips_spec_rev;
- uint8_t fips_level;
spinlock_t devicelock; /* lock for luns list */
mempool_t *device_data_mem_pool;
struct list_head luns;
@@ -1175,12 +1170,11 @@ struct lpfc_hba {
uint16_t sfp_warning;
#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
- uint16_t cpucheck_on;
+ uint16_t hdwqstat_on;
#define LPFC_CHECK_OFF 0
#define LPFC_CHECK_NVME_IO 1
-#define LPFC_CHECK_NVMET_RCV 2
-#define LPFC_CHECK_NVMET_IO 4
-#define LPFC_CHECK_SCSI_IO 8
+#define LPFC_CHECK_NVMET_IO 2
+#define LPFC_CHECK_SCSI_IO 4
uint16_t ktime_on;
uint64_t ktime_data_samples;
uint64_t ktime_status_samples;
@@ -1225,6 +1219,11 @@ struct lpfc_hba {
#define LPFC_POLL_SLOWPATH 1 /* called from slowpath */
char os_host_name[MAXHOSTNAMELEN];
+
+ /* SCSI host template information - for physical port */
+ struct scsi_host_template port_template;
+ /* SCSI host template information - for all vports */
+ struct scsi_host_template vport_template;
};
static inline struct Scsi_Host *
diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c
index be3b0ccbac78..1354c141d614 100644
--- a/drivers/scsi/lpfc/lpfc_attr.c
+++ b/drivers/scsi/lpfc/lpfc_attr.c
@@ -2231,66 +2231,6 @@ lpfc_poll_store(struct device *dev, struct device_attribute *attr,
}
/**
- * lpfc_fips_level_show - Return the current FIPS level for the HBA
- * @dev: class unused variable.
- * @attr: device attribute, not used.
- * @buf: on return contains the module description text.
- *
- * Returns: size of formatted string.
- **/
-static ssize_t
-lpfc_fips_level_show(struct device *dev, struct device_attribute *attr,
- char *buf)
-{
- struct Scsi_Host *shost = class_to_shost(dev);
- struct lpfc_vport *vport = (struct lpfc_vport *) shost->hostdata;
- struct lpfc_hba *phba = vport->phba;
-
- return scnprintf(buf, PAGE_SIZE, "%d\n", phba->fips_level);
-}
-
-/**
- * lpfc_fips_rev_show - Return the FIPS Spec revision for the HBA
- * @dev: class unused variable.
- * @attr: device attribute, not used.
- * @buf: on return contains the module description text.
- *
- * Returns: size of formatted string.
- **/
-static ssize_t
-lpfc_fips_rev_show(struct device *dev, struct device_attribute *attr,
- char *buf)
-{
- struct Scsi_Host *shost = class_to_shost(dev);
- struct lpfc_vport *vport = (struct lpfc_vport *) shost->hostdata;
- struct lpfc_hba *phba = vport->phba;
-
- return scnprintf(buf, PAGE_SIZE, "%d\n", phba->fips_spec_rev);
-}
-
-/**
- * lpfc_dss_show - Return the current state of dss and the configured state
- * @dev: class converted to a Scsi_host structure.
- * @attr: device attribute, not used.
- * @buf: on return contains the formatted text.
- *
- * Returns: size of formatted string.
- **/
-static ssize_t
-lpfc_dss_show(struct device *dev, struct device_attribute *attr,
- char *buf)
-{
- struct Scsi_Host *shost = class_to_shost(dev);
- struct lpfc_vport *vport = (struct lpfc_vport *) shost->hostdata;
- struct lpfc_hba *phba = vport->phba;
-
- return scnprintf(buf, PAGE_SIZE, "%s - %sOperational\n",
- (phba->cfg_enable_dss) ? "Enabled" : "Disabled",
- (phba->sli3_options & LPFC_SLI3_DSS_ENABLED) ?
- "" : "Not ");
-}
-
-/**
* lpfc_sriov_hw_max_virtfn_show - Return maximum number of virtual functions
* @dev: class converted to a Scsi_host structure.
* @attr: device attribute, not used.
@@ -2705,9 +2645,6 @@ static DEVICE_ATTR(max_xri, S_IRUGO, lpfc_max_xri_show, NULL);
static DEVICE_ATTR(used_xri, S_IRUGO, lpfc_used_xri_show, NULL);
static DEVICE_ATTR(npiv_info, S_IRUGO, lpfc_npiv_info_show, NULL);
static DEVICE_ATTR_RO(lpfc_temp_sensor);
-static DEVICE_ATTR_RO(lpfc_fips_level);
-static DEVICE_ATTR_RO(lpfc_fips_rev);
-static DEVICE_ATTR_RO(lpfc_dss);
static DEVICE_ATTR_RO(lpfc_sriov_hw_max_virtfn);
static DEVICE_ATTR(protocol, S_IRUGO, lpfc_sli4_protocol_show, NULL);
static DEVICE_ATTR(lpfc_xlane_supported, S_IRUGO, lpfc_oas_supported_show,
@@ -3868,9 +3805,9 @@ LPFC_VPORT_ATTR_R(enable_da_id, 1, 0, 1,
/*
# lun_queue_depth: This parameter is used to limit the number of outstanding
-# commands per FCP LUN. Value range is [1,512]. Default value is 30.
+# commands per FCP LUN.
*/
-LPFC_VPORT_ATTR_R(lun_queue_depth, 30, 1, 512,
+LPFC_VPORT_ATTR_R(lun_queue_depth, 64, 1, 512,
"Max number of FCP commands we can queue to a specific LUN");
/*
@@ -6251,9 +6188,6 @@ struct device_attribute *lpfc_hba_attrs[] = {
&dev_attr_pt,
&dev_attr_txq_hw,
&dev_attr_txcmplq_hw,
- &dev_attr_lpfc_fips_level,
- &dev_attr_lpfc_fips_rev,
- &dev_attr_lpfc_dss,
&dev_attr_lpfc_sriov_hw_max_virtfn,
&dev_attr_protocol,
&dev_attr_lpfc_xlane_supported,
@@ -6289,8 +6223,6 @@ struct device_attribute *lpfc_vport_attrs[] = {
&dev_attr_lpfc_max_scsicmpl_time,
&dev_attr_lpfc_stat_data_ctrl,
&dev_attr_lpfc_static_vport,
- &dev_attr_lpfc_fips_level,
- &dev_attr_lpfc_fips_rev,
NULL,
};
@@ -7399,7 +7331,6 @@ lpfc_get_cfgparam(struct lpfc_hba *phba)
lpfc_suppress_link_up_init(phba, lpfc_suppress_link_up);
lpfc_delay_discovery_init(phba, lpfc_delay_discovery);
lpfc_sli_mode_init(phba, lpfc_sli_mode);
- phba->cfg_enable_dss = 1;
lpfc_enable_mds_diags_init(phba, lpfc_enable_mds_diags);
lpfc_ras_fwlog_buffsize_init(phba, lpfc_ras_fwlog_buffsize);
lpfc_ras_fwlog_level_init(phba, lpfc_ras_fwlog_level);
diff --git a/drivers/scsi/lpfc/lpfc_crtn.h b/drivers/scsi/lpfc/lpfc_crtn.h
index a450477a7e00..76dc8d9493d2 100644
--- a/drivers/scsi/lpfc/lpfc_crtn.h
+++ b/drivers/scsi/lpfc/lpfc_crtn.h
@@ -404,9 +404,7 @@ void lpfc_free_sysfs_attr(struct lpfc_vport *);
extern struct device_attribute *lpfc_hba_attrs[];
extern struct device_attribute *lpfc_vport_attrs[];
extern struct scsi_host_template lpfc_template;
-extern struct scsi_host_template lpfc_template_no_hr;
extern struct scsi_host_template lpfc_template_nvme;
-extern struct scsi_host_template lpfc_vport_template;
extern struct fc_function_template lpfc_transport_functions;
extern struct fc_function_template lpfc_vport_transport_functions;
@@ -590,6 +588,7 @@ struct lpfc_io_buf *lpfc_get_io_buf(struct lpfc_hba *phba,
int);
void lpfc_release_io_buf(struct lpfc_hba *phba, struct lpfc_io_buf *ncmd,
struct lpfc_sli4_hdw_queue *qp);
+void lpfc_io_ktime(struct lpfc_hba *phba, struct lpfc_io_buf *ncmd);
void lpfc_nvme_cmd_template(void);
void lpfc_nvmet_cmd_template(void);
void lpfc_nvme_cancel_iocb(struct lpfc_hba *phba, struct lpfc_iocbq *pwqeIn);
diff --git a/drivers/scsi/lpfc/lpfc_debugfs.c b/drivers/scsi/lpfc/lpfc_debugfs.c
index 819335b16c2e..8a6e02aa553f 100644
--- a/drivers/scsi/lpfc/lpfc_debugfs.c
+++ b/drivers/scsi/lpfc/lpfc_debugfs.c
@@ -1300,8 +1300,88 @@ buffer_done:
return len;
}
+void
+lpfc_io_ktime(struct lpfc_hba *phba, struct lpfc_io_buf *lpfc_cmd)
+{
+ uint64_t seg1, seg2, seg3, seg4;
+ uint64_t segsum;
+
+ if (!lpfc_cmd->ts_last_cmd ||
+ !lpfc_cmd->ts_cmd_start ||
+ !lpfc_cmd->ts_cmd_wqput ||
+ !lpfc_cmd->ts_isr_cmpl ||
+ !lpfc_cmd->ts_data_io)
+ return;
+
+ if (lpfc_cmd->ts_data_io < lpfc_cmd->ts_cmd_start)
+ return;
+ if (lpfc_cmd->ts_cmd_start < lpfc_cmd->ts_last_cmd)
+ return;
+ if (lpfc_cmd->ts_cmd_wqput < lpfc_cmd->ts_cmd_start)
+ return;
+ if (lpfc_cmd->ts_isr_cmpl < lpfc_cmd->ts_cmd_wqput)
+ return;
+ if (lpfc_cmd->ts_data_io < lpfc_cmd->ts_isr_cmpl)
+ return;
+ /*
+ * Segment 1 - Time from Last FCP command cmpl is handed
+ * off to NVME Layer to start of next command.
+ * Segment 2 - Time from Driver receives a IO cmd start
+ * from NVME Layer to WQ put is done on IO cmd.
+ * Segment 3 - Time from Driver WQ put is done on IO cmd
+ * to MSI-X ISR for IO cmpl.
+ * Segment 4 - Time from MSI-X ISR for IO cmpl to when
+ * cmpl is handled off to the NVME Layer.
+ */
+ seg1 = lpfc_cmd->ts_cmd_start - lpfc_cmd->ts_last_cmd;
+ if (seg1 > 5000000) /* 5 ms - for sequential IOs only */
+ seg1 = 0;
+
+ /* Calculate times relative to start of IO */
+ seg2 = (lpfc_cmd->ts_cmd_wqput - lpfc_cmd->ts_cmd_start);
+ segsum = seg2;
+ seg3 = lpfc_cmd->ts_isr_cmpl - lpfc_cmd->ts_cmd_start;
+ if (segsum > seg3)
+ return;
+ seg3 -= segsum;
+ segsum += seg3;
+
+ seg4 = lpfc_cmd->ts_data_io - lpfc_cmd->ts_cmd_start;
+ if (segsum > seg4)
+ return;
+ seg4 -= segsum;
+
+ phba->ktime_data_samples++;
+ phba->ktime_seg1_total += seg1;
+ if (seg1 < phba->ktime_seg1_min)
+ phba->ktime_seg1_min = seg1;
+ else if (seg1 > phba->ktime_seg1_max)
+ phba->ktime_seg1_max = seg1;
+ phba->ktime_seg2_total += seg2;
+ if (seg2 < phba->ktime_seg2_min)
+ phba->ktime_seg2_min = seg2;
+ else if (seg2 > phba->ktime_seg2_max)
+ phba->ktime_seg2_max = seg2;
+ phba->ktime_seg3_total += seg3;
+ if (seg3 < phba->ktime_seg3_min)
+ phba->ktime_seg3_min = seg3;
+ else if (seg3 > phba->ktime_seg3_max)
+ phba->ktime_seg3_max = seg3;
+ phba->ktime_seg4_total += seg4;
+ if (seg4 < phba->ktime_seg4_min)
+ phba->ktime_seg4_min = seg4;
+ else if (seg4 > phba->ktime_seg4_max)
+ phba->ktime_seg4_max = seg4;
+
+ lpfc_cmd->ts_last_cmd = 0;
+ lpfc_cmd->ts_cmd_start = 0;
+ lpfc_cmd->ts_cmd_wqput = 0;
+ lpfc_cmd->ts_isr_cmpl = 0;
+ lpfc_cmd->ts_data_io = 0;
+}
+
/**
- * lpfc_debugfs_nvmektime_data - Dump target node list to a buffer
+ * lpfc_debugfs_ioktime_data - Dump target node list to a buffer
* @vport: The vport to gather target node info from.
* @buf: The buffer to dump log into.
* @size: The maximum amount of data to process.
@@ -1314,13 +1394,13 @@ buffer_done:
* not exceed @size.
**/
static int
-lpfc_debugfs_nvmektime_data(struct lpfc_vport *vport, char *buf, int size)
+lpfc_debugfs_ioktime_data(struct lpfc_vport *vport, char *buf, int size)
{
struct lpfc_hba *phba = vport->phba;
int len = 0;
if (phba->nvmet_support == 0) {
- /* NVME Initiator */
+ /* Initiator */
len += scnprintf(buf + len, PAGE_SIZE - len,
"ktime %s: Total Samples: %lld\n",
(phba->ktime_on ? "Enabled" : "Disabled"),
@@ -1330,8 +1410,8 @@ lpfc_debugfs_nvmektime_data(struct lpfc_vport *vport, char *buf, int size)
len += scnprintf(
buf + len, PAGE_SIZE - len,
- "Segment 1: Last NVME Cmd cmpl "
- "done -to- Start of next NVME cnd (in driver)\n");
+ "Segment 1: Last Cmd cmpl "
+ "done -to- Start of next Cmd (in driver)\n");
len += scnprintf(
buf + len, PAGE_SIZE - len,
"avg:%08lld min:%08lld max %08lld\n",
@@ -1341,7 +1421,7 @@ lpfc_debugfs_nvmektime_data(struct lpfc_vport *vport, char *buf, int size)
phba->ktime_seg1_max);
len += scnprintf(
buf + len, PAGE_SIZE - len,
- "Segment 2: Driver start of NVME cmd "
+ "Segment 2: Driver start of Cmd "
"-to- Firmware WQ doorbell\n");
len += scnprintf(
buf + len, PAGE_SIZE - len,
@@ -1364,7 +1444,7 @@ lpfc_debugfs_nvmektime_data(struct lpfc_vport *vport, char *buf, int size)
len += scnprintf(
buf + len, PAGE_SIZE - len,
"Segment 4: MSI-X ISR cmpl -to- "
- "NVME cmpl done\n");
+ "Cmd cmpl done\n");
len += scnprintf(
buf + len, PAGE_SIZE - len,
"avg:%08lld min:%08lld max %08lld\n",
@@ -1603,42 +1683,50 @@ out:
}
/**
- * lpfc_debugfs_cpucheck_data - Dump target node list to a buffer
+ * lpfc_debugfs_hdwqstat_data - Dump I/O stats to a buffer
* @vport: The vport to gather target node info from.
* @buf: The buffer to dump log into.
* @size: The maximum amount of data to process.
*
* Description:
- * This routine dumps the NVME statistics associated with @vport
+ * This routine dumps the NVME + SCSI statistics associated with @vport
*
* Return Value:
* This routine returns the amount of bytes that were dumped into @buf and will
* not exceed @size.
**/
static int
-lpfc_debugfs_cpucheck_data(struct lpfc_vport *vport, char *buf, int size)
+lpfc_debugfs_hdwqstat_data(struct lpfc_vport *vport, char *buf, int size)
{
struct lpfc_hba *phba = vport->phba;
struct lpfc_sli4_hdw_queue *qp;
- int i, j, max_cnt;
- int len = 0;
+ struct lpfc_hdwq_stat *c_stat;
+ int i, j, len;
uint32_t tot_xmt;
uint32_t tot_rcv;
uint32_t tot_cmpl;
+ char tmp[LPFC_MAX_SCSI_INFO_TMP_LEN] = {0};
- len += scnprintf(buf + len, PAGE_SIZE - len,
- "CPUcheck %s ",
- (phba->cpucheck_on & LPFC_CHECK_NVME_IO ?
- "Enabled" : "Disabled"));
- if (phba->nvmet_support) {
- len += scnprintf(buf + len, PAGE_SIZE - len,
- "%s\n",
- (phba->cpucheck_on & LPFC_CHECK_NVMET_RCV ?
- "Rcv Enabled\n" : "Rcv Disabled\n"));
- } else {
- len += scnprintf(buf + len, PAGE_SIZE - len, "\n");
- }
- max_cnt = size - LPFC_DEBUG_OUT_LINE_SZ;
+ scnprintf(tmp, sizeof(tmp), "HDWQ Stats:\n\n");
+ if (strlcat(buf, tmp, size) >= size)
+ goto buffer_done;
+
+ scnprintf(tmp, sizeof(tmp), "(NVME Accounting: %s) ",
+ (phba->hdwqstat_on &
+ (LPFC_CHECK_NVME_IO | LPFC_CHECK_NVMET_IO) ?
+ "Enabled" : "Disabled"));
+ if (strlcat(buf, tmp, size) >= size)
+ goto buffer_done;
+
+ scnprintf(tmp, sizeof(tmp), "(SCSI Accounting: %s) ",
+ (phba->hdwqstat_on & LPFC_CHECK_SCSI_IO ?
+ "Enabled" : "Disabled"));
+ if (strlcat(buf, tmp, size) >= size)
+ goto buffer_done;
+
+ scnprintf(tmp, sizeof(tmp), "\n\n");
+ if (strlcat(buf, tmp, size) >= size)
+ goto buffer_done;
for (i = 0; i < phba->cfg_hdw_queue; i++) {
qp = &phba->sli4_hba.hdwq[i];
@@ -1646,46 +1734,76 @@ lpfc_debugfs_cpucheck_data(struct lpfc_vport *vport, char *buf, int size)
tot_rcv = 0;
tot_xmt = 0;
tot_cmpl = 0;
- for (j = 0; j < LPFC_CHECK_CPU_CNT; j++) {
- tot_xmt += qp->cpucheck_xmt_io[j];
- tot_cmpl += qp->cpucheck_cmpl_io[j];
- if (phba->nvmet_support)
- tot_rcv += qp->cpucheck_rcv_io[j];
- }
- /* Only display Hardware Qs with something */
- if (!tot_xmt && !tot_cmpl && !tot_rcv)
- continue;
+ for_each_present_cpu(j) {
+ c_stat = per_cpu_ptr(phba->sli4_hba.c_stat, j);
+
+ /* Only display for this HDWQ */
+ if (i != c_stat->hdwq_no)
+ continue;
- len += scnprintf(buf + len, PAGE_SIZE - len,
- "HDWQ %03d: ", i);
- for (j = 0; j < LPFC_CHECK_CPU_CNT; j++) {
/* Only display non-zero counters */
- if (!qp->cpucheck_xmt_io[j] &&
- !qp->cpucheck_cmpl_io[j] &&
- !qp->cpucheck_rcv_io[j])
+ if (!c_stat->xmt_io && !c_stat->cmpl_io &&
+ !c_stat->rcv_io)
continue;
+
+ if (!tot_xmt && !tot_cmpl && !tot_rcv) {
+ /* Print HDWQ string only the first time */
+ scnprintf(tmp, sizeof(tmp), "[HDWQ %d]:\t", i);
+ if (strlcat(buf, tmp, size) >= size)
+ goto buffer_done;
+ }
+
+ tot_xmt += c_stat->xmt_io;
+ tot_cmpl += c_stat->cmpl_io;
+ if (phba->nvmet_support)
+ tot_rcv += c_stat->rcv_io;
+
+ scnprintf(tmp, sizeof(tmp), "| [CPU %d]: ", j);
+ if (strlcat(buf, tmp, size) >= size)
+ goto buffer_done;
+
if (phba->nvmet_support) {
- len += scnprintf(buf + len, PAGE_SIZE - len,
- "CPU %03d: %x/%x/%x ", j,
- qp->cpucheck_rcv_io[j],
- qp->cpucheck_xmt_io[j],
- qp->cpucheck_cmpl_io[j]);
+ scnprintf(tmp, sizeof(tmp),
+ "XMT 0x%x CMPL 0x%x RCV 0x%x |",
+ c_stat->xmt_io, c_stat->cmpl_io,
+ c_stat->rcv_io);
+ if (strlcat(buf, tmp, size) >= size)
+ goto buffer_done;
} else {
- len += scnprintf(buf + len, PAGE_SIZE - len,
- "CPU %03d: %x/%x ", j,
- qp->cpucheck_xmt_io[j],
- qp->cpucheck_cmpl_io[j]);
+ scnprintf(tmp, sizeof(tmp),
+ "XMT 0x%x CMPL 0x%x |",
+ c_stat->xmt_io, c_stat->cmpl_io);
+ if (strlcat(buf, tmp, size) >= size)
+ goto buffer_done;
}
}
- len += scnprintf(buf + len, PAGE_SIZE - len,
- "Total: %x\n", tot_xmt);
- if (len >= max_cnt) {
- len += scnprintf(buf + len, PAGE_SIZE - len,
- "Truncated ...\n");
- return len;
+
+ /* Check if nothing to display */
+ if (!tot_xmt && !tot_cmpl && !tot_rcv)
+ continue;
+
+ scnprintf(tmp, sizeof(tmp), "\t->\t[HDWQ Total: ");
+ if (strlcat(buf, tmp, size) >= size)
+ goto buffer_done;
+
+ if (phba->nvmet_support) {
+ scnprintf(tmp, sizeof(tmp),
+ "XMT 0x%x CMPL 0x%x RCV 0x%x]\n\n",
+ tot_xmt, tot_cmpl, tot_rcv);
+ if (strlcat(buf, tmp, size) >= size)
+ goto buffer_done;
+ } else {
+ scnprintf(tmp, sizeof(tmp),
+ "XMT 0x%x CMPL 0x%x]\n\n",
+ tot_xmt, tot_cmpl);
+ if (strlcat(buf, tmp, size) >= size)
+ goto buffer_done;
}
}
+
+buffer_done:
+ len = strnlen(buf, size);
return len;
}
@@ -2689,7 +2807,7 @@ lpfc_debugfs_scsistat_write(struct file *file, const char __user *buf,
}
static int
-lpfc_debugfs_nvmektime_open(struct inode *inode, struct file *file)
+lpfc_debugfs_ioktime_open(struct inode *inode, struct file *file)
{
struct lpfc_vport *vport = inode->i_private;
struct lpfc_debug *debug;
@@ -2700,14 +2818,14 @@ lpfc_debugfs_nvmektime_open(struct inode *inode, struct file *file)
goto out;
/* Round to page boundary */
- debug->buffer = kmalloc(LPFC_NVMEKTIME_SIZE, GFP_KERNEL);
+ debug->buffer = kmalloc(LPFC_IOKTIME_SIZE, GFP_KERNEL);
if (!debug->buffer) {
kfree(debug);
goto out;
}
- debug->len = lpfc_debugfs_nvmektime_data(vport, debug->buffer,
- LPFC_NVMEKTIME_SIZE);
+ debug->len = lpfc_debugfs_ioktime_data(vport, debug->buffer,
+ LPFC_IOKTIME_SIZE);
debug->i_private = inode->i_private;
file->private_data = debug;
@@ -2718,8 +2836,8 @@ out:
}
static ssize_t
-lpfc_debugfs_nvmektime_write(struct file *file, const char __user *buf,
- size_t nbytes, loff_t *ppos)
+lpfc_debugfs_ioktime_write(struct file *file, const char __user *buf,
+ size_t nbytes, loff_t *ppos)
{
struct lpfc_debug *debug = file->private_data;
struct lpfc_vport *vport = (struct lpfc_vport *)debug->i_private;
@@ -2921,7 +3039,7 @@ lpfc_debugfs_nvmeio_trc_write(struct file *file, const char __user *buf,
}
static int
-lpfc_debugfs_cpucheck_open(struct inode *inode, struct file *file)
+lpfc_debugfs_hdwqstat_open(struct inode *inode, struct file *file)
{
struct lpfc_vport *vport = inode->i_private;
struct lpfc_debug *debug;
@@ -2932,14 +3050,14 @@ lpfc_debugfs_cpucheck_open(struct inode *inode, struct file *file)
goto out;
/* Round to page boundary */
- debug->buffer = kmalloc(LPFC_CPUCHECK_SIZE, GFP_KERNEL);
+ debug->buffer = kcalloc(1, LPFC_SCSISTAT_SIZE, GFP_KERNEL);
if (!debug->buffer) {
kfree(debug);
goto out;
}
- debug->len = lpfc_debugfs_cpucheck_data(vport, debug->buffer,
- LPFC_CPUCHECK_SIZE);
+ debug->len = lpfc_debugfs_hdwqstat_data(vport, debug->buffer,
+ LPFC_SCSISTAT_SIZE);
debug->i_private = inode->i_private;
file->private_data = debug;
@@ -2950,16 +3068,16 @@ out:
}
static ssize_t
-lpfc_debugfs_cpucheck_write(struct file *file, const char __user *buf,
+lpfc_debugfs_hdwqstat_write(struct file *file, const char __user *buf,
size_t nbytes, loff_t *ppos)
{
struct lpfc_debug *debug = file->private_data;
struct lpfc_vport *vport = (struct lpfc_vport *)debug->i_private;
struct lpfc_hba *phba = vport->phba;
- struct lpfc_sli4_hdw_queue *qp;
+ struct lpfc_hdwq_stat *c_stat;
char mybuf[64];
char *pbuf;
- int i, j;
+ int i;
if (nbytes > 64)
nbytes = 64;
@@ -2972,41 +3090,39 @@ lpfc_debugfs_cpucheck_write(struct file *file, const char __user *buf,
if ((strncmp(pbuf, "on", sizeof("on") - 1) == 0)) {
if (phba->nvmet_support)
- phba->cpucheck_on |= LPFC_CHECK_NVMET_IO;
+ phba->hdwqstat_on |= LPFC_CHECK_NVMET_IO;
else
- phba->cpucheck_on |= (LPFC_CHECK_NVME_IO |
+ phba->hdwqstat_on |= (LPFC_CHECK_NVME_IO |
LPFC_CHECK_SCSI_IO);
return strlen(pbuf);
} else if ((strncmp(pbuf, "nvme_on", sizeof("nvme_on") - 1) == 0)) {
if (phba->nvmet_support)
- phba->cpucheck_on |= LPFC_CHECK_NVMET_IO;
+ phba->hdwqstat_on |= LPFC_CHECK_NVMET_IO;
else
- phba->cpucheck_on |= LPFC_CHECK_NVME_IO;
+ phba->hdwqstat_on |= LPFC_CHECK_NVME_IO;
return strlen(pbuf);
} else if ((strncmp(pbuf, "scsi_on", sizeof("scsi_on") - 1) == 0)) {
- phba->cpucheck_on |= LPFC_CHECK_SCSI_IO;
+ if (!phba->nvmet_support)
+ phba->hdwqstat_on |= LPFC_CHECK_SCSI_IO;
return strlen(pbuf);
- } else if ((strncmp(pbuf, "rcv",
- sizeof("rcv") - 1) == 0)) {
- if (phba->nvmet_support)
- phba->cpucheck_on |= LPFC_CHECK_NVMET_RCV;
- else
- return -EINVAL;
+ } else if ((strncmp(pbuf, "nvme_off", sizeof("nvme_off") - 1) == 0)) {
+ phba->hdwqstat_on &= ~(LPFC_CHECK_NVME_IO |
+ LPFC_CHECK_NVMET_IO);
+ return strlen(pbuf);
+ } else if ((strncmp(pbuf, "scsi_off", sizeof("scsi_off") - 1) == 0)) {
+ phba->hdwqstat_on &= ~LPFC_CHECK_SCSI_IO;
return strlen(pbuf);
} else if ((strncmp(pbuf, "off",
sizeof("off") - 1) == 0)) {
- phba->cpucheck_on = LPFC_CHECK_OFF;
+ phba->hdwqstat_on = LPFC_CHECK_OFF;
return strlen(pbuf);
} else if ((strncmp(pbuf, "zero",
sizeof("zero") - 1) == 0)) {
- for (i = 0; i < phba->cfg_hdw_queue; i++) {
- qp = &phba->sli4_hba.hdwq[i];
-
- for (j = 0; j < LPFC_CHECK_CPU_CNT; j++) {
- qp->cpucheck_rcv_io[j] = 0;
- qp->cpucheck_xmt_io[j] = 0;
- qp->cpucheck_cmpl_io[j] = 0;
- }
+ for_each_present_cpu(i) {
+ c_stat = per_cpu_ptr(phba->sli4_hba.c_stat, i);
+ c_stat->xmt_io = 0;
+ c_stat->cmpl_io = 0;
+ c_stat->rcv_io = 0;
}
return strlen(pbuf);
}
@@ -5431,13 +5547,13 @@ static const struct file_operations lpfc_debugfs_op_scsistat = {
.release = lpfc_debugfs_release,
};
-#undef lpfc_debugfs_op_nvmektime
-static const struct file_operations lpfc_debugfs_op_nvmektime = {
+#undef lpfc_debugfs_op_ioktime
+static const struct file_operations lpfc_debugfs_op_ioktime = {
.owner = THIS_MODULE,
- .open = lpfc_debugfs_nvmektime_open,
+ .open = lpfc_debugfs_ioktime_open,
.llseek = lpfc_debugfs_lseek,
.read = lpfc_debugfs_read,
- .write = lpfc_debugfs_nvmektime_write,
+ .write = lpfc_debugfs_ioktime_write,
.release = lpfc_debugfs_release,
};
@@ -5451,13 +5567,13 @@ static const struct file_operations lpfc_debugfs_op_nvmeio_trc = {
.release = lpfc_debugfs_release,
};
-#undef lpfc_debugfs_op_cpucheck
-static const struct file_operations lpfc_debugfs_op_cpucheck = {
+#undef lpfc_debugfs_op_hdwqstat
+static const struct file_operations lpfc_debugfs_op_hdwqstat = {
.owner = THIS_MODULE,
- .open = lpfc_debugfs_cpucheck_open,
+ .open = lpfc_debugfs_hdwqstat_open,
.llseek = lpfc_debugfs_lseek,
.read = lpfc_debugfs_read,
- .write = lpfc_debugfs_cpucheck_write,
+ .write = lpfc_debugfs_hdwqstat_write,
.release = lpfc_debugfs_release,
};
@@ -6075,17 +6191,22 @@ nvmeio_off:
goto debug_failed;
}
- snprintf(name, sizeof(name), "nvmektime");
- vport->debug_nvmektime =
+ snprintf(name, sizeof(name), "ioktime");
+ vport->debug_ioktime =
debugfs_create_file(name, 0644,
vport->vport_debugfs_root,
- vport, &lpfc_debugfs_op_nvmektime);
+ vport, &lpfc_debugfs_op_ioktime);
+ if (!vport->debug_ioktime) {
+ lpfc_printf_vlog(vport, KERN_ERR, LOG_INIT,
+ "0815 Cannot create debugfs ioktime\n");
+ goto debug_failed;
+ }
- snprintf(name, sizeof(name), "cpucheck");
- vport->debug_cpucheck =
+ snprintf(name, sizeof(name), "hdwqstat");
+ vport->debug_hdwqstat =
debugfs_create_file(name, 0644,
vport->vport_debugfs_root,
- vport, &lpfc_debugfs_op_cpucheck);
+ vport, &lpfc_debugfs_op_hdwqstat);
/*
* The following section is for additional directories/files for the
@@ -6216,11 +6337,11 @@ lpfc_debugfs_terminate(struct lpfc_vport *vport)
debugfs_remove(vport->debug_scsistat); /* scsistat */
vport->debug_scsistat = NULL;
- debugfs_remove(vport->debug_nvmektime); /* nvmektime */
- vport->debug_nvmektime = NULL;
+ debugfs_remove(vport->debug_ioktime); /* ioktime */
+ vport->debug_ioktime = NULL;
- debugfs_remove(vport->debug_cpucheck); /* cpucheck */
- vport->debug_cpucheck = NULL;
+ debugfs_remove(vport->debug_hdwqstat); /* hdwqstat */
+ vport->debug_hdwqstat = NULL;
if (vport->vport_debugfs_root) {
debugfs_remove(vport->vport_debugfs_root); /* vportX */
diff --git a/drivers/scsi/lpfc/lpfc_debugfs.h b/drivers/scsi/lpfc/lpfc_debugfs.h
index 20f2537af511..7ab6d3b08698 100644
--- a/drivers/scsi/lpfc/lpfc_debugfs.h
+++ b/drivers/scsi/lpfc/lpfc_debugfs.h
@@ -46,8 +46,7 @@
/* nvmestat output buffer size */
#define LPFC_NVMESTAT_SIZE 8192
-#define LPFC_NVMEKTIME_SIZE 8192
-#define LPFC_CPUCHECK_SIZE 8192
+#define LPFC_IOKTIME_SIZE 8192
#define LPFC_NVMEIO_TRC_SIZE 8192
/* scsistat output buffer size */
diff --git a/drivers/scsi/lpfc/lpfc_hw.h b/drivers/scsi/lpfc/lpfc_hw.h
index ae51c0dbba0a..c20034b3101c 100644
--- a/drivers/scsi/lpfc/lpfc_hw.h
+++ b/drivers/scsi/lpfc/lpfc_hw.h
@@ -3262,8 +3262,7 @@ typedef struct {
#endif
#ifdef __BIG_ENDIAN_BITFIELD
- uint32_t rsvd1 : 19; /* Reserved */
- uint32_t cdss : 1; /* Configure Data Security SLI */
+ uint32_t rsvd1 : 20; /* Reserved */
uint32_t casabt : 1; /* Configure async abts status notice */
uint32_t rsvd2 : 2; /* Reserved */
uint32_t cbg : 1; /* Configure BlockGuard */
@@ -3287,12 +3286,10 @@ typedef struct {
uint32_t cbg : 1; /* Configure BlockGuard */
uint32_t rsvd2 : 2; /* Reserved */
uint32_t casabt : 1; /* Configure async abts status notice */
- uint32_t cdss : 1; /* Configure Data Security SLI */
- uint32_t rsvd1 : 19; /* Reserved */
+ uint32_t rsvd1 : 20; /* Reserved */
#endif
#ifdef __BIG_ENDIAN_BITFIELD
- uint32_t rsvd3 : 19; /* Reserved */
- uint32_t gdss : 1; /* Configure Data Security SLI */
+ uint32_t rsvd3 : 20; /* Reserved */
uint32_t gasabt : 1; /* Grant async abts status notice */
uint32_t rsvd4 : 2; /* Reserved */
uint32_t gbg : 1; /* Grant BlockGuard */
@@ -3316,8 +3313,7 @@ typedef struct {
uint32_t gbg : 1; /* Grant BlockGuard */
uint32_t rsvd4 : 2; /* Reserved */
uint32_t gasabt : 1; /* Grant async abts status notice */
- uint32_t gdss : 1; /* Configure Data Security SLI */
- uint32_t rsvd3 : 19; /* Reserved */
+ uint32_t rsvd3 : 20; /* Reserved */
#endif
#ifdef __BIG_ENDIAN_BITFIELD
@@ -3339,15 +3335,11 @@ typedef struct {
uint32_t rsvd6; /* Reserved */
#ifdef __BIG_ENDIAN_BITFIELD
- uint32_t fips_rev : 3; /* FIPS Spec Revision */
- uint32_t fips_level : 4; /* FIPS Level */
- uint32_t sec_err : 9; /* security crypto error */
+ uint32_t rsvd7 : 16;
uint32_t max_vpi : 16; /* Max number of virt N-Ports */
#else /* __LITTLE_ENDIAN */
uint32_t max_vpi : 16; /* Max number of virt N-Ports */
- uint32_t sec_err : 9; /* security crypto error */
- uint32_t fips_level : 4; /* FIPS Level */
- uint32_t fips_rev : 3; /* FIPS Spec Revision */
+ uint32_t rsvd7 : 16;
#endif
} CONFIG_PORT_VAR;
diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c
index 9d03e9b71efb..4104bdcdbb6f 100644
--- a/drivers/scsi/lpfc/lpfc_init.c
+++ b/drivers/scsi/lpfc/lpfc_init.c
@@ -4231,6 +4231,7 @@ lpfc_create_port(struct lpfc_hba *phba, int instance, struct device *dev)
{
struct lpfc_vport *vport;
struct Scsi_Host *shost = NULL;
+ struct scsi_host_template *template;
int error = 0;
int i;
uint64_t wwn;
@@ -4259,22 +4260,50 @@ lpfc_create_port(struct lpfc_hba *phba, int instance, struct device *dev)
}
}
- if (phba->cfg_enable_fc4_type & LPFC_ENABLE_FCP) {
- if (dev != &phba->pcidev->dev) {
- shost = scsi_host_alloc(&lpfc_vport_template,
- sizeof(struct lpfc_vport));
+ /* Seed template for SCSI host registration */
+ if (dev == &phba->pcidev->dev) {
+ template = &phba->port_template;
+
+ if (phba->cfg_enable_fc4_type & LPFC_ENABLE_FCP) {
+ /* Seed physical port template */
+ memcpy(template, &lpfc_template, sizeof(*template));
+
+ if (use_no_reset_hba) {
+ /* template is for a no reset SCSI Host */
+ template->max_sectors = 0xffff;
+ template->eh_host_reset_handler = NULL;
+ }
+
+ /* Template for all vports this physical port creates */
+ memcpy(&phba->vport_template, &lpfc_template,
+ sizeof(*template));
+ phba->vport_template.max_sectors = 0xffff;
+ phba->vport_template.shost_attrs = lpfc_vport_attrs;
+ phba->vport_template.eh_bus_reset_handler = NULL;
+ phba->vport_template.eh_host_reset_handler = NULL;
+ phba->vport_template.vendor_id = 0;
+
+ /* Initialize the host templates with updated value */
+ if (phba->sli_rev == LPFC_SLI_REV4) {
+ template->sg_tablesize = phba->cfg_scsi_seg_cnt;
+ phba->vport_template.sg_tablesize =
+ phba->cfg_scsi_seg_cnt;
+ } else {
+ template->sg_tablesize = phba->cfg_sg_seg_cnt;
+ phba->vport_template.sg_tablesize =
+ phba->cfg_sg_seg_cnt;
+ }
+
} else {
- if (!use_no_reset_hba)
- shost = scsi_host_alloc(&lpfc_template,
- sizeof(struct lpfc_vport));
- else
- shost = scsi_host_alloc(&lpfc_template_no_hr,
- sizeof(struct lpfc_vport));
+ /* NVMET is for physical port only */
+ memcpy(template, &lpfc_template_nvme,
+ sizeof(*template));
}
- } else if (phba->cfg_enable_fc4_type & LPFC_ENABLE_NVME) {
- shost = scsi_host_alloc(&lpfc_template_nvme,
- sizeof(struct lpfc_vport));
+ } else {
+ template = &phba->vport_template;
}
+
+ shost = scsi_host_alloc(template, sizeof(struct lpfc_vport));
if (!shost)
goto out;
@@ -4329,6 +4358,12 @@ lpfc_create_port(struct lpfc_hba *phba, int instance, struct device *dev)
vport->port_type = LPFC_PHYSICAL_PORT;
}
+ lpfc_printf_log(phba, KERN_INFO, LOG_INIT | LOG_FCP,
+ "9081 CreatePort TMPLATE type %x TBLsize %d "
+ "SEGcnt %d/%d\n",
+ vport->port_type, shost->sg_tablesize,
+ phba->cfg_scsi_seg_cnt, phba->cfg_sg_seg_cnt);
+
/* Initialize all internally managed lists. */
INIT_LIST_HEAD(&vport->fc_nodes);
INIT_LIST_HEAD(&vport->rcv_buffer_list);
@@ -6301,11 +6336,6 @@ lpfc_sli_driver_resource_setup(struct lpfc_hba *phba)
* used to create the sg_dma_buf_pool must be dynamically calculated.
*/
- /* Initialize the host templates the configured values. */
- lpfc_vport_template.sg_tablesize = phba->cfg_sg_seg_cnt;
- lpfc_template_no_hr.sg_tablesize = phba->cfg_sg_seg_cnt;
- lpfc_template.sg_tablesize = phba->cfg_sg_seg_cnt;
-
if (phba->sli_rev == LPFC_SLI_REV4)
entry_sz = sizeof(struct sli4_sge);
else
@@ -6346,7 +6376,7 @@ lpfc_sli_driver_resource_setup(struct lpfc_hba *phba)
}
lpfc_printf_log(phba, KERN_INFO, LOG_INIT | LOG_FCP,
- "9088 sg_tablesize:%d dmabuf_size:%d total_bde:%d\n",
+ "9088 INIT sg_tablesize:%d dmabuf_size:%d total_bde:%d\n",
phba->cfg_sg_seg_cnt, phba->cfg_sg_dma_buf_size,
phba->cfg_total_seg_cnt);
@@ -6816,11 +6846,6 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba)
phba->cfg_nvme_seg_cnt = phba->cfg_sg_seg_cnt;
}
- /* Initialize the host templates with the updated values. */
- lpfc_vport_template.sg_tablesize = phba->cfg_scsi_seg_cnt;
- lpfc_template.sg_tablesize = phba->cfg_scsi_seg_cnt;
- lpfc_template_no_hr.sg_tablesize = phba->cfg_scsi_seg_cnt;
-
lpfc_printf_log(phba, KERN_INFO, LOG_INIT | LOG_FCP,
"9087 sg_seg_cnt:%d dmabuf_size:%d "
"total:%d scsi:%d nvme:%d\n",
@@ -6926,6 +6951,17 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba)
rc = -ENOMEM;
goto out_free_hba_cpu_map;
}
+
+#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
+ phba->sli4_hba.c_stat = alloc_percpu(struct lpfc_hdwq_stat);
+ if (!phba->sli4_hba.c_stat) {
+ lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
+ "3332 Failed allocating per cpu hdwq stats\n");
+ rc = -ENOMEM;
+ goto out_free_hba_eq_info;
+ }
+#endif
+
/*
* Enable sr-iov virtual functions if supported and configured
* through the module parameter.
@@ -6945,6 +6981,10 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba)
return 0;
+#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
+out_free_hba_eq_info:
+ free_percpu(phba->sli4_hba.eq_info);
+#endif
out_free_hba_cpu_map:
kfree(phba->sli4_hba.cpu_map);
out_free_hba_eq_hdl:
@@ -6983,6 +7023,9 @@ lpfc_sli4_driver_resource_unset(struct lpfc_hba *phba)
struct lpfc_fcf_conn_entry *conn_entry, *next_conn_entry;
free_percpu(phba->sli4_hba.eq_info);
+#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
+ free_percpu(phba->sli4_hba.c_stat);
+#endif
/* Free memory allocated for msi-x interrupt vector to CPU mapping */
kfree(phba->sli4_hba.cpu_map);
@@ -10823,6 +10866,9 @@ lpfc_cpu_affinity_check(struct lpfc_hba *phba, int vectors)
#ifdef CONFIG_X86
struct cpuinfo_x86 *cpuinfo;
#endif
+#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
+ struct lpfc_hdwq_stat *c_stat;
+#endif
max_phys_id = 0;
min_phys_id = LPFC_VECTOR_MAP_EMPTY;
@@ -11074,10 +11120,17 @@ found_any:
idx = 0;
for_each_possible_cpu(cpu) {
cpup = &phba->sli4_hba.cpu_map[cpu];
+#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
+ c_stat = per_cpu_ptr(phba->sli4_hba.c_stat, cpu);
+ c_stat->hdwq_no = cpup->hdwq;
+#endif
if (cpup->hdwq != LPFC_VECTOR_MAP_EMPTY)
continue;
cpup->hdwq = idx++ % phba->cfg_hdw_queue;
+#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
+ c_stat->hdwq_no = cpup->hdwq;
+#endif
lpfc_printf_log(phba, KERN_INFO, LOG_INIT,
"3340 Set Affinity: not present "
"CPU %d hdwq %d\n",
@@ -11173,11 +11226,9 @@ static void lpfc_cpuhp_add(struct lpfc_hba *phba)
rcu_read_lock();
- if (!list_empty(&phba->poll_list)) {
- timer_setup(&phba->cpuhp_poll_timer, lpfc_sli4_poll_hbtimer, 0);
+ if (!list_empty(&phba->poll_list))
mod_timer(&phba->cpuhp_poll_timer,
jiffies + msecs_to_jiffies(LPFC_POLL_HB));
- }
rcu_read_unlock();
@@ -13145,6 +13196,7 @@ lpfc_pci_probe_one_s4(struct pci_dev *pdev, const struct pci_device_id *pid)
lpfc_sli4_ras_setup(phba);
INIT_LIST_HEAD(&phba->poll_list);
+ timer_setup(&phba->cpuhp_poll_timer, lpfc_sli4_poll_hbtimer, 0);
cpuhp_state_add_instance_nocalls(lpfc_cpuhp_state, &phba->cpuhp);
return 0;
diff --git a/drivers/scsi/lpfc/lpfc_mbox.c b/drivers/scsi/lpfc/lpfc_mbox.c
index d1773c01d2b3..e35b52b66d6c 100644
--- a/drivers/scsi/lpfc/lpfc_mbox.c
+++ b/drivers/scsi/lpfc/lpfc_mbox.c
@@ -1299,8 +1299,6 @@ lpfc_config_port(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
if (phba->sli_rev == LPFC_SLI_REV3 && phba->vpd.sli3Feat.cerbm) {
if (phba->cfg_enable_bg)
mb->un.varCfgPort.cbg = 1; /* configure BlockGuard */
- if (phba->cfg_enable_dss)
- mb->un.varCfgPort.cdss = 1; /* Configure Security */
mb->un.varCfgPort.cerbm = 1; /* Request HBQs */
mb->un.varCfgPort.ccrp = 1; /* Command Ring Polling */
mb->un.varCfgPort.max_hbq = lpfc_sli_hbq_count();
diff --git a/drivers/scsi/lpfc/lpfc_nvme.c b/drivers/scsi/lpfc/lpfc_nvme.c
index f6c8963c915d..a45936e08031 100644
--- a/drivers/scsi/lpfc/lpfc_nvme.c
+++ b/drivers/scsi/lpfc/lpfc_nvme.c
@@ -382,13 +382,15 @@ lpfc_nvme_remoteport_delete(struct nvme_fc_remote_port *remoteport)
if (ndlp->upcall_flags & NLP_WAIT_FOR_UNREG) {
ndlp->nrport = NULL;
ndlp->upcall_flags &= ~NLP_WAIT_FOR_UNREG;
- }
- spin_unlock_irq(&vport->phba->hbalock);
+ spin_unlock_irq(&vport->phba->hbalock);
- /* Remove original register reference. The host transport
- * won't reference this rport/remoteport any further.
- */
- lpfc_nlp_put(ndlp);
+ /* Remove original register reference. The host transport
+ * won't reference this rport/remoteport any further.
+ */
+ lpfc_nlp_put(ndlp);
+ } else {
+ spin_unlock_irq(&vport->phba->hbalock);
+ }
rport_err:
return;
@@ -897,88 +899,6 @@ lpfc_nvme_adj_fcp_sgls(struct lpfc_vport *vport,
sgl->sge_len = cpu_to_le32(nCmd->rsplen);
}
-#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
-static void
-lpfc_nvme_ktime(struct lpfc_hba *phba,
- struct lpfc_io_buf *lpfc_ncmd)
-{
- uint64_t seg1, seg2, seg3, seg4;
- uint64_t segsum;
-
- if (!lpfc_ncmd->ts_last_cmd ||
- !lpfc_ncmd->ts_cmd_start ||
- !lpfc_ncmd->ts_cmd_wqput ||
- !lpfc_ncmd->ts_isr_cmpl ||
- !lpfc_ncmd->ts_data_nvme)
- return;
-
- if (lpfc_ncmd->ts_data_nvme < lpfc_ncmd->ts_cmd_start)
- return;
- if (lpfc_ncmd->ts_cmd_start < lpfc_ncmd->ts_last_cmd)
- return;
- if (lpfc_ncmd->ts_cmd_wqput < lpfc_ncmd->ts_cmd_start)
- return;
- if (lpfc_ncmd->ts_isr_cmpl < lpfc_ncmd->ts_cmd_wqput)
- return;
- if (lpfc_ncmd->ts_data_nvme < lpfc_ncmd->ts_isr_cmpl)
- return;
- /*
- * Segment 1 - Time from Last FCP command cmpl is handed
- * off to NVME Layer to start of next command.
- * Segment 2 - Time from Driver receives a IO cmd start
- * from NVME Layer to WQ put is done on IO cmd.
- * Segment 3 - Time from Driver WQ put is done on IO cmd
- * to MSI-X ISR for IO cmpl.
- * Segment 4 - Time from MSI-X ISR for IO cmpl to when
- * cmpl is handled off to the NVME Layer.
- */
- seg1 = lpfc_ncmd->ts_cmd_start - lpfc_ncmd->ts_last_cmd;
- if (seg1 > 5000000) /* 5 ms - for sequential IOs only */
- seg1 = 0;
-
- /* Calculate times relative to start of IO */
- seg2 = (lpfc_ncmd->ts_cmd_wqput - lpfc_ncmd->ts_cmd_start);
- segsum = seg2;
- seg3 = lpfc_ncmd->ts_isr_cmpl - lpfc_ncmd->ts_cmd_start;
- if (segsum > seg3)
- return;
- seg3 -= segsum;
- segsum += seg3;
-
- seg4 = lpfc_ncmd->ts_data_nvme - lpfc_ncmd->ts_cmd_start;
- if (segsum > seg4)
- return;
- seg4 -= segsum;
-
- phba->ktime_data_samples++;
- phba->ktime_seg1_total += seg1;
- if (seg1 < phba->ktime_seg1_min)
- phba->ktime_seg1_min = seg1;
- else if (seg1 > phba->ktime_seg1_max)
- phba->ktime_seg1_max = seg1;
- phba->ktime_seg2_total += seg2;
- if (seg2 < phba->ktime_seg2_min)
- phba->ktime_seg2_min = seg2;
- else if (seg2 > phba->ktime_seg2_max)
- phba->ktime_seg2_max = seg2;
- phba->ktime_seg3_total += seg3;
- if (seg3 < phba->ktime_seg3_min)
- phba->ktime_seg3_min = seg3;
- else if (seg3 > phba->ktime_seg3_max)
- phba->ktime_seg3_max = seg3;
- phba->ktime_seg4_total += seg4;
- if (seg4 < phba->ktime_seg4_min)
- phba->ktime_seg4_min = seg4;
- else if (seg4 > phba->ktime_seg4_max)
- phba->ktime_seg4_max = seg4;
-
- lpfc_ncmd->ts_last_cmd = 0;
- lpfc_ncmd->ts_cmd_start = 0;
- lpfc_ncmd->ts_cmd_wqput = 0;
- lpfc_ncmd->ts_isr_cmpl = 0;
- lpfc_ncmd->ts_data_nvme = 0;
-}
-#endif
/**
* lpfc_nvme_io_cmd_wqe_cmpl - Complete an NVME-over-FCP IO
@@ -1010,6 +930,9 @@ lpfc_nvme_io_cmd_wqe_cmpl(struct lpfc_hba *phba, struct lpfc_iocbq *pwqeIn,
uint32_t code, status, idx;
uint16_t cid, sqhd, data;
uint32_t *ptr;
+#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
+ int cpu;
+#endif
/* Sanity check on return of outstanding command */
if (!lpfc_ncmd) {
@@ -1178,23 +1101,19 @@ out_err:
#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
if (lpfc_ncmd->ts_cmd_start) {
lpfc_ncmd->ts_isr_cmpl = pwqeIn->isr_timestamp;
- lpfc_ncmd->ts_data_nvme = ktime_get_ns();
- phba->ktime_last_cmd = lpfc_ncmd->ts_data_nvme;
- lpfc_nvme_ktime(phba, lpfc_ncmd);
+ lpfc_ncmd->ts_data_io = ktime_get_ns();
+ phba->ktime_last_cmd = lpfc_ncmd->ts_data_io;
+ lpfc_io_ktime(phba, lpfc_ncmd);
}
- if (unlikely(phba->cpucheck_on & LPFC_CHECK_NVME_IO)) {
- uint32_t cpu;
- idx = lpfc_ncmd->cur_iocbq.hba_wqidx;
+ if (unlikely(phba->hdwqstat_on & LPFC_CHECK_NVME_IO)) {
cpu = raw_smp_processor_id();
- if (cpu < LPFC_CHECK_CPU_CNT) {
- if (lpfc_ncmd->cpu != cpu)
- lpfc_printf_vlog(vport,
- KERN_INFO, LOG_NVME_IOERR,
- "6701 CPU Check cmpl: "
- "cpu %d expect %d\n",
- cpu, lpfc_ncmd->cpu);
- phba->sli4_hba.hdwq[idx].cpucheck_cmpl_io[cpu]++;
- }
+ this_cpu_inc(phba->sli4_hba.c_stat->cmpl_io);
+ if (lpfc_ncmd->cpu != cpu)
+ lpfc_printf_vlog(vport,
+ KERN_INFO, LOG_NVME_IOERR,
+ "6701 CPU Check cmpl: "
+ "cpu %d expect %d\n",
+ cpu, lpfc_ncmd->cpu);
}
#endif
@@ -1743,19 +1662,17 @@ lpfc_nvme_fcp_io_submit(struct nvme_fc_local_port *pnvme_lport,
if (lpfc_ncmd->ts_cmd_start)
lpfc_ncmd->ts_cmd_wqput = ktime_get_ns();
- if (phba->cpucheck_on & LPFC_CHECK_NVME_IO) {
+ if (phba->hdwqstat_on & LPFC_CHECK_NVME_IO) {
cpu = raw_smp_processor_id();
- if (cpu < LPFC_CHECK_CPU_CNT) {
- lpfc_ncmd->cpu = cpu;
- if (idx != cpu)
- lpfc_printf_vlog(vport,
- KERN_INFO, LOG_NVME_IOERR,
- "6702 CPU Check cmd: "
- "cpu %d wq %d\n",
- lpfc_ncmd->cpu,
- lpfc_queue_info->index);
- phba->sli4_hba.hdwq[idx].cpucheck_xmt_io[cpu]++;
- }
+ this_cpu_inc(phba->sli4_hba.c_stat->xmt_io);
+ lpfc_ncmd->cpu = cpu;
+ if (idx != cpu)
+ lpfc_printf_vlog(vport,
+ KERN_INFO, LOG_NVME_IOERR,
+ "6702 CPU Check cmd: "
+ "cpu %d wq %d\n",
+ lpfc_ncmd->cpu,
+ lpfc_queue_info->index);
}
#endif
return 0;
@@ -1985,8 +1902,6 @@ out_unlock:
/* Declare and initialization an instance of the FC NVME template. */
static struct nvme_fc_port_template lpfc_nvme_template = {
- .module = THIS_MODULE,
-
/* initiator-based functions */
.localport_delete = lpfc_nvme_localport_delete,
.remoteport_delete = lpfc_nvme_remoteport_delete,
diff --git a/drivers/scsi/lpfc/lpfc_nvmet.c b/drivers/scsi/lpfc/lpfc_nvmet.c
index 9dc9afe1c255..565419bf8d74 100644
--- a/drivers/scsi/lpfc/lpfc_nvmet.c
+++ b/drivers/scsi/lpfc/lpfc_nvmet.c
@@ -707,7 +707,7 @@ lpfc_nvmet_xmt_fcp_op_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe,
struct lpfc_nvmet_rcv_ctx *ctxp;
uint32_t status, result, op, start_clean, logerr;
#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
- uint32_t id;
+ int id;
#endif
ctxp = cmdwqe->context2;
@@ -814,16 +814,14 @@ lpfc_nvmet_xmt_fcp_op_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe,
rsp->done(rsp);
}
#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
- if (phba->cpucheck_on & LPFC_CHECK_NVMET_IO) {
+ if (phba->hdwqstat_on & LPFC_CHECK_NVMET_IO) {
id = raw_smp_processor_id();
- if (id < LPFC_CHECK_CPU_CNT) {
- if (ctxp->cpu != id)
- lpfc_printf_log(phba, KERN_INFO, LOG_NVME_IOERR,
- "6704 CPU Check cmdcmpl: "
- "cpu %d expect %d\n",
- id, ctxp->cpu);
- phba->sli4_hba.hdwq[rsp->hwqid].cpucheck_cmpl_io[id]++;
- }
+ this_cpu_inc(phba->sli4_hba.c_stat->cmpl_io);
+ if (ctxp->cpu != id)
+ lpfc_printf_log(phba, KERN_INFO, LOG_NVME_IOERR,
+ "6704 CPU Check cmdcmpl: "
+ "cpu %d expect %d\n",
+ id, ctxp->cpu);
}
#endif
}
@@ -931,6 +929,9 @@ lpfc_nvmet_xmt_fcp_op(struct nvmet_fc_target_port *tgtport,
struct lpfc_sli_ring *pring;
unsigned long iflags;
int rc;
+#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
+ int id;
+#endif
if (phba->pport->load_flag & FC_UNLOADING) {
rc = -ENODEV;
@@ -954,16 +955,14 @@ lpfc_nvmet_xmt_fcp_op(struct nvmet_fc_target_port *tgtport,
if (!ctxp->hdwq)
ctxp->hdwq = &phba->sli4_hba.hdwq[rsp->hwqid];
- if (phba->cpucheck_on & LPFC_CHECK_NVMET_IO) {
- int id = raw_smp_processor_id();
- if (id < LPFC_CHECK_CPU_CNT) {
- if (rsp->hwqid != id)
- lpfc_printf_log(phba, KERN_INFO, LOG_NVME_IOERR,
- "6705 CPU Check OP: "
- "cpu %d expect %d\n",
- id, rsp->hwqid);
- phba->sli4_hba.hdwq[rsp->hwqid].cpucheck_xmt_io[id]++;
- }
+ if (phba->hdwqstat_on & LPFC_CHECK_NVMET_IO) {
+ id = raw_smp_processor_id();
+ this_cpu_inc(phba->sli4_hba.c_stat->xmt_io);
+ if (rsp->hwqid != id)
+ lpfc_printf_log(phba, KERN_INFO, LOG_NVME_IOERR,
+ "6705 CPU Check OP: "
+ "cpu %d expect %d\n",
+ id, rsp->hwqid);
ctxp->cpu = id; /* Setup cpu for cmpl check */
}
#endif
@@ -2270,15 +2269,13 @@ lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba,
size = nvmebuf->bytes_recv;
#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
- if (phba->cpucheck_on & LPFC_CHECK_NVMET_RCV) {
- if (current_cpu < LPFC_CHECK_CPU_CNT) {
- if (idx != current_cpu)
- lpfc_printf_log(phba, KERN_INFO, LOG_NVME_IOERR,
- "6703 CPU Check rcv: "
- "cpu %d expect %d\n",
- current_cpu, idx);
- phba->sli4_hba.hdwq[idx].cpucheck_rcv_io[current_cpu]++;
- }
+ if (phba->hdwqstat_on & LPFC_CHECK_NVMET_IO) {
+ this_cpu_inc(phba->sli4_hba.c_stat->rcv_io);
+ if (idx != current_cpu)
+ lpfc_printf_log(phba, KERN_INFO, LOG_NVME_IOERR,
+ "6703 CPU Check rcv: "
+ "cpu %d expect %d\n",
+ current_cpu, idx);
}
#endif
@@ -2598,7 +2595,7 @@ lpfc_nvmet_prep_fcp_wqe(struct lpfc_hba *phba,
union lpfc_wqe128 *wqe;
struct ulp_bde64 *bde;
dma_addr_t physaddr;
- int i, cnt;
+ int i, cnt, nsegs;
int do_pbde;
int xc = 1;
@@ -2629,6 +2626,7 @@ lpfc_nvmet_prep_fcp_wqe(struct lpfc_hba *phba,
phba->cfg_nvme_seg_cnt);
return NULL;
}
+ nsegs = rsp->sg_cnt;
tgtp = (struct lpfc_nvmet_tgtport *)phba->targetport->private;
nvmewqe = ctxp->wqeq;
@@ -2868,7 +2866,7 @@ lpfc_nvmet_prep_fcp_wqe(struct lpfc_hba *phba,
wqe->fcp_trsp.rsvd_12_15[0] = 0;
/* Use rspbuf, NOT sg list */
- rsp->sg_cnt = 0;
+ nsegs = 0;
sgl->word2 = 0;
atomic_inc(&tgtp->xmt_fcp_rsp);
break;
@@ -2885,7 +2883,7 @@ lpfc_nvmet_prep_fcp_wqe(struct lpfc_hba *phba,
nvmewqe->drvrTimeout = (phba->fc_ratov * 3) + LPFC_DRVR_TIMEOUT;
nvmewqe->context1 = ndlp;
- for_each_sg(rsp->sg, sgel, rsp->sg_cnt, i) {
+ for_each_sg(rsp->sg, sgel, nsegs, i) {
physaddr = sg_dma_address(sgel);
cnt = sg_dma_len(sgel);
sgl->addr_hi = putPaddrHigh(physaddr);
diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c
index 0fc9a242bc65..ad62fb3f3a54 100644
--- a/drivers/scsi/lpfc/lpfc_scsi.c
+++ b/drivers/scsi/lpfc/lpfc_scsi.c
@@ -3805,9 +3805,6 @@ lpfc_scsi_cmd_iocb_cmpl(struct lpfc_hba *phba, struct lpfc_iocbq *pIocbIn,
struct Scsi_Host *shost;
int idx;
uint32_t logit = LOG_FCP;
-#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
- int cpu;
-#endif
/* Guard against abort handler being called at same time */
spin_lock(&lpfc_cmd->buf_lock);
@@ -3826,11 +3823,8 @@ lpfc_scsi_cmd_iocb_cmpl(struct lpfc_hba *phba, struct lpfc_iocbq *pIocbIn,
phba->sli4_hba.hdwq[idx].scsi_cstat.io_cmpls++;
#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
- if (unlikely(phba->cpucheck_on & LPFC_CHECK_SCSI_IO)) {
- cpu = raw_smp_processor_id();
- if (cpu < LPFC_CHECK_CPU_CNT && phba->sli4_hba.hdwq)
- phba->sli4_hba.hdwq[idx].cpucheck_cmpl_io[cpu]++;
- }
+ if (unlikely(phba->hdwqstat_on & LPFC_CHECK_SCSI_IO))
+ this_cpu_inc(phba->sli4_hba.c_stat->cmpl_io);
#endif
shost = cmd->device->host;
@@ -4031,6 +4025,14 @@ lpfc_scsi_cmd_iocb_cmpl(struct lpfc_hba *phba, struct lpfc_iocbq *pIocbIn,
lpfc_cmd->pCmd = NULL;
spin_unlock(&lpfc_cmd->buf_lock);
+#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
+ if (lpfc_cmd->ts_cmd_start) {
+ lpfc_cmd->ts_isr_cmpl = pIocbIn->isr_timestamp;
+ lpfc_cmd->ts_data_io = ktime_get_ns();
+ phba->ktime_last_cmd = lpfc_cmd->ts_data_io;
+ lpfc_io_ktime(phba, lpfc_cmd);
+ }
+#endif
/* The sdev is not guaranteed to be valid post scsi_done upcall. */
cmd->scsi_done(cmd);
@@ -4504,7 +4506,10 @@ lpfc_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *cmnd)
struct fc_rport *rport = starget_to_rport(scsi_target(cmnd->device));
int err, idx;
#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
- int cpu;
+ uint64_t start = 0L;
+
+ if (phba->ktime_on)
+ start = ktime_get_ns();
#endif
rdata = lpfc_rport_data_from_scsi_device(cmnd->device);
@@ -4626,17 +4631,20 @@ lpfc_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *cmnd)
lpfc_scsi_prep_cmnd(vport, lpfc_cmd, ndlp);
#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
- if (unlikely(phba->cpucheck_on & LPFC_CHECK_SCSI_IO)) {
- cpu = raw_smp_processor_id();
- if (cpu < LPFC_CHECK_CPU_CNT) {
- struct lpfc_sli4_hdw_queue *hdwq =
- &phba->sli4_hba.hdwq[lpfc_cmd->hdwq_no];
- hdwq->cpucheck_xmt_io[cpu]++;
- }
- }
+ if (unlikely(phba->hdwqstat_on & LPFC_CHECK_SCSI_IO))
+ this_cpu_inc(phba->sli4_hba.c_stat->xmt_io);
#endif
err = lpfc_sli_issue_iocb(phba, LPFC_FCP_RING,
&lpfc_cmd->cur_iocbq, SLI_IOCB_RET_IOCB);
+#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
+ if (start) {
+ lpfc_cmd->ts_cmd_start = start;
+ lpfc_cmd->ts_last_cmd = phba->ktime_last_cmd;
+ lpfc_cmd->ts_cmd_wqput = ktime_get_ns();
+ } else {
+ lpfc_cmd->ts_cmd_start = 0;
+ }
+#endif
if (err) {
lpfc_printf_vlog(vport, KERN_INFO, LOG_FCP,
"3376 FCP could not issue IOCB err %x"
@@ -6023,31 +6031,6 @@ struct scsi_host_template lpfc_template_nvme = {
.track_queue_depth = 0,
};
-struct scsi_host_template lpfc_template_no_hr = {
- .module = THIS_MODULE,
- .name = LPFC_DRIVER_NAME,
- .proc_name = LPFC_DRIVER_NAME,
- .info = lpfc_info,
- .queuecommand = lpfc_queuecommand,
- .eh_timed_out = fc_eh_timed_out,
- .eh_abort_handler = lpfc_abort_handler,
- .eh_device_reset_handler = lpfc_device_reset_handler,
- .eh_target_reset_handler = lpfc_target_reset_handler,
- .eh_bus_reset_handler = lpfc_bus_reset_handler,
- .slave_alloc = lpfc_slave_alloc,
- .slave_configure = lpfc_slave_configure,
- .slave_destroy = lpfc_slave_destroy,
- .scan_finished = lpfc_scan_finished,
- .this_id = -1,
- .sg_tablesize = LPFC_DEFAULT_SG_SEG_CNT,
- .cmd_per_lun = LPFC_CMD_PER_LUN,
- .shost_attrs = lpfc_hba_attrs,
- .max_sectors = 0xFFFFFFFF,
- .vendor_id = LPFC_NL_VENDOR_ID,
- .change_queue_depth = scsi_change_queue_depth,
- .track_queue_depth = 1,
-};
-
struct scsi_host_template lpfc_template = {
.module = THIS_MODULE,
.name = LPFC_DRIVER_NAME,
@@ -6073,26 +6056,3 @@ struct scsi_host_template lpfc_template = {
.change_queue_depth = scsi_change_queue_depth,
.track_queue_depth = 1,
};
-
-struct scsi_host_template lpfc_vport_template = {
- .module = THIS_MODULE,
- .name = LPFC_DRIVER_NAME,
- .proc_name = LPFC_DRIVER_NAME,
- .info = lpfc_info,
- .queuecommand = lpfc_queuecommand,
- .eh_timed_out = fc_eh_timed_out,
- .eh_abort_handler = lpfc_abort_handler,
- .eh_device_reset_handler = lpfc_device_reset_handler,
- .eh_target_reset_handler = lpfc_target_reset_handler,
- .slave_alloc = lpfc_slave_alloc,
- .slave_configure = lpfc_slave_configure,
- .slave_destroy = lpfc_slave_destroy,
- .scan_finished = lpfc_scan_finished,
- .this_id = -1,
- .sg_tablesize = LPFC_DEFAULT_SG_SEG_CNT,
- .cmd_per_lun = LPFC_CMD_PER_LUN,
- .shost_attrs = lpfc_vport_attrs,
- .max_sectors = 0xFFFF,
- .change_queue_depth = scsi_change_queue_depth,
- .track_queue_depth = 1,
-};
diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c
index 0b26b5c0527e..b6fb665e6ec4 100644
--- a/drivers/scsi/lpfc/lpfc_sli.c
+++ b/drivers/scsi/lpfc/lpfc_sli.c
@@ -230,25 +230,16 @@ lpfc_sli4_wq_put(struct lpfc_queue *q, union lpfc_wqe128 *wqe)
* This routine will update the HBA index of a queue to reflect consumption of
* Work Queue Entries by the HBA. When the HBA indicates that it has consumed
* an entry the host calls this function to update the queue's internal
- * pointers. This routine returns the number of entries that were consumed by
- * the HBA.
+ * pointers.
**/
-static uint32_t
+static void
lpfc_sli4_wq_release(struct lpfc_queue *q, uint32_t index)
{
- uint32_t released = 0;
-
/* sanity check on queue memory */
if (unlikely(!q))
- return 0;
+ return;
- if (q->hba_index == index)
- return 0;
- do {
- q->hba_index = ((q->hba_index + 1) % q->entry_count);
- released++;
- } while (q->hba_index != index);
- return released;
+ q->hba_index = index;
}
/**
@@ -2511,6 +2502,8 @@ lpfc_sli_def_mbox_cmpl(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
!pmb->u.mb.mbxStatus) {
rpi = pmb->u.mb.un.varWords[0];
vpi = pmb->u.mb.un.varRegLogin.vpi;
+ if (phba->sli_rev == LPFC_SLI_REV4)
+ vpi -= phba->sli4_hba.max_cfg_param.vpi_base;
lpfc_unreg_login(phba, vpi, rpi, pmb);
pmb->vport = vport;
pmb->mbox_cmpl = lpfc_sli_def_mbox_cmpl;
@@ -4044,6 +4037,11 @@ lpfc_sli_flush_io_rings(struct lpfc_hba *phba)
struct lpfc_iocbq *piocb, *next_iocb;
spin_lock_irq(&phba->hbalock);
+ if (phba->hba_flag & HBA_IOQ_FLUSH ||
+ !phba->sli4_hba.hdwq) {
+ spin_unlock_irq(&phba->hbalock);
+ return;
+ }
/* Indicate the I/O queues are flushed */
phba->hba_flag |= HBA_IOQ_FLUSH;
spin_unlock_irq(&phba->hbalock);
@@ -5034,23 +5032,6 @@ lpfc_sli_config_port(struct lpfc_hba *phba, int sli_mode)
} else
phba->max_vpi = 0;
- phba->fips_level = 0;
- phba->fips_spec_rev = 0;
- if (pmb->u.mb.un.varCfgPort.gdss) {
- phba->sli3_options |= LPFC_SLI3_DSS_ENABLED;
- phba->fips_level = pmb->u.mb.un.varCfgPort.fips_level;
- phba->fips_spec_rev = pmb->u.mb.un.varCfgPort.fips_rev;
- lpfc_printf_log(phba, KERN_INFO, LOG_INIT,
- "2850 Security Crypto Active. FIPS x%d "
- "(Spec Rev: x%d)",
- phba->fips_level, phba->fips_spec_rev);
- }
- if (pmb->u.mb.un.varCfgPort.sec_err) {
- lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
- "2856 Config Port Security Crypto "
- "Error: x%x ",
- pmb->u.mb.un.varCfgPort.sec_err);
- }
if (pmb->u.mb.un.varCfgPort.gerbm)
phba->sli3_options |= LPFC_SLI3_HBQ_ENABLED;
if (pmb->u.mb.un.varCfgPort.gcrp)
@@ -14442,12 +14423,10 @@ static inline void lpfc_sli4_add_to_poll_list(struct lpfc_queue *eq)
{
struct lpfc_hba *phba = eq->phba;
- if (list_empty(&phba->poll_list)) {
- timer_setup(&phba->cpuhp_poll_timer, lpfc_sli4_poll_hbtimer, 0);
- /* kickstart slowpath processing for this eq */
+ /* kickstart slowpath processing if needed */
+ if (list_empty(&phba->poll_list))
mod_timer(&phba->cpuhp_poll_timer,
jiffies + msecs_to_jiffies(LPFC_POLL_HB));
- }
list_add_rcu(&eq->_poll_list, &phba->poll_list);
synchronize_rcu();
diff --git a/drivers/scsi/lpfc/lpfc_sli.h b/drivers/scsi/lpfc/lpfc_sli.h
index 7bcf922a8be2..93d976ea8c5d 100644
--- a/drivers/scsi/lpfc/lpfc_sli.h
+++ b/drivers/scsi/lpfc/lpfc_sli.h
@@ -446,6 +446,6 @@ struct lpfc_io_buf {
uint64_t ts_last_cmd;
uint64_t ts_cmd_wqput;
uint64_t ts_isr_cmpl;
- uint64_t ts_data_nvme;
+ uint64_t ts_data_io;
#endif
};
diff --git a/drivers/scsi/lpfc/lpfc_sli4.h b/drivers/scsi/lpfc/lpfc_sli4.h
index d963ca871383..8da7429e385a 100644
--- a/drivers/scsi/lpfc/lpfc_sli4.h
+++ b/drivers/scsi/lpfc/lpfc_sli4.h
@@ -697,13 +697,6 @@ struct lpfc_sli4_hdw_queue {
struct lpfc_lock_stat lock_conflict;
#endif
-#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
-#define LPFC_CHECK_CPU_CNT 128
- uint32_t cpucheck_rcv_io[LPFC_CHECK_CPU_CNT];
- uint32_t cpucheck_xmt_io[LPFC_CHECK_CPU_CNT];
- uint32_t cpucheck_cmpl_io[LPFC_CHECK_CPU_CNT];
-#endif
-
/* Per HDWQ pool resources */
struct list_head sgl_list;
struct list_head cmd_rsp_buf_list;
@@ -740,6 +733,15 @@ struct lpfc_sli4_hdw_queue {
#define lpfc_qp_spin_lock(lock, qp, lstat) spin_lock(lock)
#endif
+#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
+struct lpfc_hdwq_stat {
+ u32 hdwq_no;
+ u32 rcv_io;
+ u32 xmt_io;
+ u32 cmpl_io;
+};
+#endif
+
struct lpfc_sli4_hba {
void __iomem *conf_regs_memmap_p; /* Kernel memory mapped address for
* config space registers
@@ -921,6 +923,9 @@ struct lpfc_sli4_hba {
struct cpumask numa_mask;
uint16_t curr_disp_cpu;
struct lpfc_eq_intr_info __percpu *eq_info;
+#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
+ struct lpfc_hdwq_stat __percpu *c_stat;
+#endif
uint32_t conf_trunk;
#define lpfc_conf_trunk_port0_WORD conf_trunk
#define lpfc_conf_trunk_port0_SHIFT 0
diff --git a/drivers/scsi/lpfc/lpfc_version.h b/drivers/scsi/lpfc/lpfc_version.h
index c4ab006e6ecc..ca40c47cfbe0 100644
--- a/drivers/scsi/lpfc/lpfc_version.h
+++ b/drivers/scsi/lpfc/lpfc_version.h
@@ -20,7 +20,7 @@
* included with this package. *
*******************************************************************/
-#define LPFC_DRIVER_VERSION "12.6.0.4"
+#define LPFC_DRIVER_VERSION "12.8.0.0"
#define LPFC_DRIVER_NAME "lpfc"
/* Used for SLI 2/3 */
diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
index 778d5e6ce385..04a40afe60e3 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
@@ -9908,8 +9908,8 @@ static void scsih_remove(struct pci_dev *pdev)
ioc->remove_host = 1;
- mpt3sas_wait_for_commands_to_complete(ioc);
- _scsih_flush_running_cmds(ioc);
+ if (!pci_device_is_present(pdev))
+ _scsih_flush_running_cmds(ioc);
_scsih_fw_event_cleanup_queue(ioc);
@@ -9992,8 +9992,8 @@ scsih_shutdown(struct pci_dev *pdev)
ioc->remove_host = 1;
- mpt3sas_wait_for_commands_to_complete(ioc);
- _scsih_flush_running_cmds(ioc);
+ if (!pci_device_is_present(pdev))
+ _scsih_flush_running_cmds(ioc);
_scsih_fw_event_cleanup_queue(ioc);
diff --git a/drivers/scsi/qla2xxx/qla_nvme.c b/drivers/scsi/qla2xxx/qla_nvme.c
index 84e2a980dea0..4886d247df6f 100644
--- a/drivers/scsi/qla2xxx/qla_nvme.c
+++ b/drivers/scsi/qla2xxx/qla_nvme.c
@@ -610,7 +610,6 @@ static void qla_nvme_remoteport_delete(struct nvme_fc_remote_port *rport)
}
static struct nvme_fc_port_template qla_nvme_fc_transport = {
- .module = THIS_MODULE,
.localport_delete = qla_nvme_localport_delete,
.remoteport_delete = qla_nvme_remoteport_delete,
.create_queue = qla_nvme_alloc_queue,
diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index 0ec1b31c75a9..b2a803c51288 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -2022,7 +2022,7 @@ static void __iscsi_unbind_session(struct work_struct *work)
if (session->target_id == ISCSI_MAX_TARGET) {
spin_unlock_irqrestore(&session->lock, flags);
mutex_unlock(&ihost->mutex);
- return;
+ goto unbind_session_exit;
}
target_id = session->target_id;
@@ -2034,6 +2034,8 @@ static void __iscsi_unbind_session(struct work_struct *work)
ida_simple_remove(&iscsi_sess_ida, target_id);
scsi_remove_target(&session->dev);
+
+unbind_session_exit:
iscsi_session_event(session, ISCSI_KEVENT_UNBIND_SESSION);
ISCSI_DBG_TRANS_SESSION(session, "Completed target removal\n");
}
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 1c270e6034d5..d2fe3fa470f9 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -550,10 +550,12 @@ out:
static void sr_block_release(struct gendisk *disk, fmode_t mode)
{
struct scsi_cd *cd = scsi_cd(disk);
+
mutex_lock(&cd->lock);
cdrom_release(&cd->cdi, mode);
- scsi_cd_put(cd);
mutex_unlock(&cd->lock);
+
+ scsi_cd_put(cd);
}
static int sr_block_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
diff --git a/drivers/scsi/ufs/ufs-mediatek.c b/drivers/scsi/ufs/ufs-mediatek.c
index 40a66b31b31f..673c16596fb2 100644
--- a/drivers/scsi/ufs/ufs-mediatek.c
+++ b/drivers/scsi/ufs/ufs-mediatek.c
@@ -499,8 +499,15 @@ static int ufs_mtk_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op)
if (ufshcd_is_link_hibern8(hba)) {
err = ufs_mtk_link_set_lpm(hba);
- if (err)
+ if (err) {
+ /*
+ * Set link as off state enforcedly to trigger
+ * ufshcd_host_reset_and_restore() in ufshcd_suspend()
+ * for completed host reset.
+ */
+ ufshcd_set_link_off(hba);
return -EAGAIN;
+ }
}
if (!ufshcd_is_link_active(hba))
@@ -519,8 +526,10 @@ static int ufs_mtk_resume(struct ufs_hba *hba, enum ufs_pm_op pm_op)
if (ufshcd_is_link_hibern8(hba)) {
err = ufs_mtk_link_set_hpm(hba);
- if (err)
+ if (err) {
+ err = ufshcd_link_recovery(hba);
return err;
+ }
}
return 0;
diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index e04e8b8bdca6..698e8d20b4ba 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -172,19 +172,6 @@ enum {
#define ufshcd_clear_eh_in_progress(h) \
((h)->eh_flags &= ~UFSHCD_EH_IN_PROGRESS)
-#define ufshcd_set_ufs_dev_active(h) \
- ((h)->curr_dev_pwr_mode = UFS_ACTIVE_PWR_MODE)
-#define ufshcd_set_ufs_dev_sleep(h) \
- ((h)->curr_dev_pwr_mode = UFS_SLEEP_PWR_MODE)
-#define ufshcd_set_ufs_dev_poweroff(h) \
- ((h)->curr_dev_pwr_mode = UFS_POWERDOWN_PWR_MODE)
-#define ufshcd_is_ufs_dev_active(h) \
- ((h)->curr_dev_pwr_mode == UFS_ACTIVE_PWR_MODE)
-#define ufshcd_is_ufs_dev_sleep(h) \
- ((h)->curr_dev_pwr_mode == UFS_SLEEP_PWR_MODE)
-#define ufshcd_is_ufs_dev_poweroff(h) \
- ((h)->curr_dev_pwr_mode == UFS_POWERDOWN_PWR_MODE)
-
struct ufs_pm_lvl_states ufs_pm_lvl_states[] = {
{UFS_ACTIVE_PWR_MODE, UIC_LINK_ACTIVE_STATE},
{UFS_ACTIVE_PWR_MODE, UIC_LINK_HIBERN8_STATE},
@@ -868,28 +855,29 @@ static bool ufshcd_is_unipro_pa_params_tuning_req(struct ufs_hba *hba)
return false;
}
-static int ufshcd_scale_clks(struct ufs_hba *hba, bool scale_up)
+/**
+ * ufshcd_set_clk_freq - set UFS controller clock frequencies
+ * @hba: per adapter instance
+ * @scale_up: If True, set max possible frequency othewise set low frequency
+ *
+ * Returns 0 if successful
+ * Returns < 0 for any other errors
+ */
+static int ufshcd_set_clk_freq(struct ufs_hba *hba, bool scale_up)
{
int ret = 0;
struct ufs_clk_info *clki;
struct list_head *head = &hba->clk_list_head;
- ktime_t start = ktime_get();
- bool clk_state_changed = false;
if (list_empty(head))
goto out;
- ret = ufshcd_vops_clk_scale_notify(hba, scale_up, PRE_CHANGE);
- if (ret)
- return ret;
-
list_for_each_entry(clki, head, list) {
if (!IS_ERR_OR_NULL(clki->clk)) {
if (scale_up && clki->max_freq) {
if (clki->curr_freq == clki->max_freq)
continue;
- clk_state_changed = true;
ret = clk_set_rate(clki->clk, clki->max_freq);
if (ret) {
dev_err(hba->dev, "%s: %s clk set rate(%dHz) failed, %d\n",
@@ -908,7 +896,6 @@ static int ufshcd_scale_clks(struct ufs_hba *hba, bool scale_up)
if (clki->curr_freq == clki->min_freq)
continue;
- clk_state_changed = true;
ret = clk_set_rate(clki->clk, clki->min_freq);
if (ret) {
dev_err(hba->dev, "%s: %s clk set rate(%dHz) failed, %d\n",
@@ -927,11 +914,37 @@ static int ufshcd_scale_clks(struct ufs_hba *hba, bool scale_up)
clki->name, clk_get_rate(clki->clk));
}
+out:
+ return ret;
+}
+
+/**
+ * ufshcd_scale_clks - scale up or scale down UFS controller clocks
+ * @hba: per adapter instance
+ * @scale_up: True if scaling up and false if scaling down
+ *
+ * Returns 0 if successful
+ * Returns < 0 for any other errors
+ */
+static int ufshcd_scale_clks(struct ufs_hba *hba, bool scale_up)
+{
+ int ret = 0;
+ ktime_t start = ktime_get();
+
+ ret = ufshcd_vops_clk_scale_notify(hba, scale_up, PRE_CHANGE);
+ if (ret)
+ goto out;
+
+ ret = ufshcd_set_clk_freq(hba, scale_up);
+ if (ret)
+ goto out;
+
ret = ufshcd_vops_clk_scale_notify(hba, scale_up, POST_CHANGE);
+ if (ret)
+ ufshcd_set_clk_freq(hba, !scale_up);
out:
- if (clk_state_changed)
- trace_ufshcd_profile_clk_scaling(dev_name(hba->dev),
+ trace_ufshcd_profile_clk_scaling(dev_name(hba->dev),
(scale_up ? "up" : "down"),
ktime_to_us(ktime_sub(ktime_get(), start)), ret);
return ret;
@@ -1065,8 +1078,7 @@ static int ufshcd_scale_gear(struct ufs_hba *hba, bool scale_up)
}
/* check if the power mode needs to be changed or not? */
- ret = ufshcd_change_power_mode(hba, &new_pwr_info);
-
+ ret = ufshcd_config_pwr_mode(hba, &new_pwr_info);
if (ret)
dev_err(hba->dev, "%s: failed err %d, old gear: (tx %d rx %d), new gear: (tx %d rx %d)",
__func__, ret,
@@ -1119,35 +1131,32 @@ static int ufshcd_devfreq_scale(struct ufs_hba *hba, bool scale_up)
ret = ufshcd_clock_scaling_prepare(hba);
if (ret)
- return ret;
+ goto out;
/* scale down the gear before scaling down clocks */
if (!scale_up) {
ret = ufshcd_scale_gear(hba, false);
if (ret)
- goto out;
+ goto out_unprepare;
}
ret = ufshcd_scale_clks(hba, scale_up);
if (ret) {
if (!scale_up)
ufshcd_scale_gear(hba, true);
- goto out;
+ goto out_unprepare;
}
/* scale up the gear after scaling up clocks */
if (scale_up) {
ret = ufshcd_scale_gear(hba, true);
- if (ret) {
+ if (ret)
ufshcd_scale_clks(hba, false);
- goto out;
- }
}
- ret = ufshcd_vops_clk_scale_notify(hba, scale_up, POST_CHANGE);
-
-out:
+out_unprepare:
ufshcd_clock_scaling_unprepare(hba);
+out:
ufshcd_release(hba);
return ret;
}
@@ -3785,7 +3794,7 @@ out:
return ret;
}
-static int ufshcd_link_recovery(struct ufs_hba *hba)
+int ufshcd_link_recovery(struct ufs_hba *hba)
{
int ret;
unsigned long flags;
@@ -3812,6 +3821,7 @@ static int ufshcd_link_recovery(struct ufs_hba *hba)
return ret;
}
+EXPORT_SYMBOL_GPL(ufshcd_link_recovery);
static int __ufshcd_uic_hibern8_enter(struct ufs_hba *hba)
{
@@ -4112,8 +4122,6 @@ int ufshcd_config_pwr_mode(struct ufs_hba *hba,
memcpy(&final_params, desired_pwr_mode, sizeof(final_params));
ret = ufshcd_change_power_mode(hba, &final_params);
- if (!ret)
- ufshcd_print_pwr_info(hba);
return ret;
}
@@ -6299,7 +6307,7 @@ static int ufshcd_host_reset_and_restore(struct ufs_hba *hba)
spin_unlock_irqrestore(hba->host->host_lock, flags);
/* scale up clocks to max frequency before full reinitialization */
- ufshcd_scale_clks(hba, true);
+ ufshcd_set_clk_freq(hba, true);
err = ufshcd_hba_enable(hba);
if (err)
@@ -7127,6 +7135,7 @@ static int ufshcd_probe_hba(struct ufs_hba *hba, bool async)
__func__, ret);
goto out;
}
+ ufshcd_print_pwr_info(hba);
}
/*
diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
index dd1ee277069a..6ffc08ad85f6 100644
--- a/drivers/scsi/ufs/ufshcd.h
+++ b/drivers/scsi/ufs/ufshcd.h
@@ -130,6 +130,19 @@ enum uic_link_state {
#define ufshcd_set_link_hibern8(hba) ((hba)->uic_link_state = \
UIC_LINK_HIBERN8_STATE)
+#define ufshcd_set_ufs_dev_active(h) \
+ ((h)->curr_dev_pwr_mode = UFS_ACTIVE_PWR_MODE)
+#define ufshcd_set_ufs_dev_sleep(h) \
+ ((h)->curr_dev_pwr_mode = UFS_SLEEP_PWR_MODE)
+#define ufshcd_set_ufs_dev_poweroff(h) \
+ ((h)->curr_dev_pwr_mode = UFS_POWERDOWN_PWR_MODE)
+#define ufshcd_is_ufs_dev_active(h) \
+ ((h)->curr_dev_pwr_mode == UFS_ACTIVE_PWR_MODE)
+#define ufshcd_is_ufs_dev_sleep(h) \
+ ((h)->curr_dev_pwr_mode == UFS_SLEEP_PWR_MODE)
+#define ufshcd_is_ufs_dev_poweroff(h) \
+ ((h)->curr_dev_pwr_mode == UFS_POWERDOWN_PWR_MODE)
+
/*
* UFS Power management levels.
* Each level is in increasing order of power savings.
@@ -788,6 +801,7 @@ int ufshcd_alloc_host(struct device *, struct ufs_hba **);
void ufshcd_dealloc_host(struct ufs_hba *);
int ufshcd_hba_enable(struct ufs_hba *hba);
int ufshcd_init(struct ufs_hba * , void __iomem * , unsigned int);
+int ufshcd_link_recovery(struct ufs_hba *hba);
int ufshcd_make_hba_operational(struct ufs_hba *hba);
void ufshcd_remove(struct ufs_hba *);
int ufshcd_uic_hibern8_exit(struct ufs_hba *hba);
@@ -1083,6 +1097,7 @@ static inline void ufshcd_vops_device_reset(struct ufs_hba *hba)
{
if (hba->vops && hba->vops->device_reset) {
hba->vops->device_reset(hba);
+ ufshcd_set_ufs_dev_active(hba);
ufshcd_update_reg_hist(&hba->ufs_stats.dev_reset, 0);
}
}
diff --git a/drivers/soc/Kconfig b/drivers/soc/Kconfig
index 1778f8c62861..425ab6f7e375 100644
--- a/drivers/soc/Kconfig
+++ b/drivers/soc/Kconfig
@@ -22,5 +22,6 @@ source "drivers/soc/ux500/Kconfig"
source "drivers/soc/versatile/Kconfig"
source "drivers/soc/xilinx/Kconfig"
source "drivers/soc/zte/Kconfig"
+source "drivers/soc/kendryte/Kconfig"
endmenu
diff --git a/drivers/soc/Makefile b/drivers/soc/Makefile
index a39f17cea376..36452bed86ef 100644
--- a/drivers/soc/Makefile
+++ b/drivers/soc/Makefile
@@ -28,3 +28,4 @@ obj-$(CONFIG_ARCH_U8500) += ux500/
obj-$(CONFIG_PLAT_VERSATILE) += versatile/
obj-y += xilinx/
obj-$(CONFIG_ARCH_ZX) += zte/
+obj-$(CONFIG_SOC_KENDRYTE) += kendryte/
diff --git a/drivers/soc/kendryte/Kconfig b/drivers/soc/kendryte/Kconfig
new file mode 100644
index 000000000000..49785b1b0217
--- /dev/null
+++ b/drivers/soc/kendryte/Kconfig
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0
+
+if SOC_KENDRYTE
+
+config K210_SYSCTL
+ bool "Kendryte K210 system controller"
+ default y
+ depends on RISCV
+ help
+ Enables controlling the K210 various clocks and to enable
+ general purpose use of the extra 2MB of SRAM normally
+ reserved for the AI engine.
+
+endif
diff --git a/drivers/soc/kendryte/Makefile b/drivers/soc/kendryte/Makefile
new file mode 100644
index 000000000000..002d9ce95c0d
--- /dev/null
+++ b/drivers/soc/kendryte/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_K210_SYSCTL) += k210-sysctl.o
diff --git a/drivers/soc/kendryte/k210-sysctl.c b/drivers/soc/kendryte/k210-sysctl.c
new file mode 100644
index 000000000000..4608fbca20e1
--- /dev/null
+++ b/drivers/soc/kendryte/k210-sysctl.c
@@ -0,0 +1,248 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2019 Christoph Hellwig.
+ * Copyright (c) 2019 Western Digital Corporation or its affiliates.
+ */
+#include <linux/types.h>
+#include <linux/io.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/clk-provider.h>
+#include <linux/clkdev.h>
+#include <linux/bitfield.h>
+#include <asm/soc.h>
+
+#define K210_SYSCTL_CLK0_FREQ 26000000UL
+
+/* Registers base address */
+#define K210_SYSCTL_SYSCTL_BASE_ADDR 0x50440000ULL
+
+/* Registers */
+#define K210_SYSCTL_PLL0 0x08
+#define K210_SYSCTL_PLL1 0x0c
+/* clkr: 4bits, clkf1: 6bits, clkod: 4bits, bwadj: 4bits */
+#define PLL_RESET (1 << 20)
+#define PLL_PWR (1 << 21)
+#define PLL_INTFB (1 << 22)
+#define PLL_BYPASS (1 << 23)
+#define PLL_TEST (1 << 24)
+#define PLL_OUT_EN (1 << 25)
+#define PLL_TEST_EN (1 << 26)
+#define K210_SYSCTL_PLL_LOCK 0x18
+#define PLL0_LOCK1 (1 << 0)
+#define PLL0_LOCK2 (1 << 1)
+#define PLL0_SLIP_CLEAR (1 << 2)
+#define PLL0_TEST_CLK_OUT (1 << 3)
+#define PLL1_LOCK1 (1 << 8)
+#define PLL1_LOCK2 (1 << 9)
+#define PLL1_SLIP_CLEAR (1 << 10)
+#define PLL1_TEST_CLK_OUT (1 << 11)
+#define PLL2_LOCK1 (1 << 16)
+#define PLL2_LOCK2 (1 << 16)
+#define PLL2_SLIP_CLEAR (1 << 18)
+#define PLL2_TEST_CLK_OUT (1 << 19)
+#define K210_SYSCTL_CLKSEL0 0x20
+#define CLKSEL_ACLK (1 << 0)
+#define K210_SYSCTL_CLKEN_CENT 0x28
+#define CLKEN_CPU (1 << 0)
+#define CLKEN_SRAM0 (1 << 1)
+#define CLKEN_SRAM1 (1 << 2)
+#define CLKEN_APB0 (1 << 3)
+#define CLKEN_APB1 (1 << 4)
+#define CLKEN_APB2 (1 << 5)
+#define K210_SYSCTL_CLKEN_PERI 0x2c
+#define CLKEN_ROM (1 << 0)
+#define CLKEN_DMA (1 << 1)
+#define CLKEN_AI (1 << 2)
+#define CLKEN_DVP (1 << 3)
+#define CLKEN_FFT (1 << 4)
+#define CLKEN_GPIO (1 << 5)
+#define CLKEN_SPI0 (1 << 6)
+#define CLKEN_SPI1 (1 << 7)
+#define CLKEN_SPI2 (1 << 8)
+#define CLKEN_SPI3 (1 << 9)
+#define CLKEN_I2S0 (1 << 10)
+#define CLKEN_I2S1 (1 << 11)
+#define CLKEN_I2S2 (1 << 12)
+#define CLKEN_I2C0 (1 << 13)
+#define CLKEN_I2C1 (1 << 14)
+#define CLKEN_I2C2 (1 << 15)
+#define CLKEN_UART1 (1 << 16)
+#define CLKEN_UART2 (1 << 17)
+#define CLKEN_UART3 (1 << 18)
+#define CLKEN_AES (1 << 19)
+#define CLKEN_FPIO (1 << 20)
+#define CLKEN_TIMER0 (1 << 21)
+#define CLKEN_TIMER1 (1 << 22)
+#define CLKEN_TIMER2 (1 << 23)
+#define CLKEN_WDT0 (1 << 24)
+#define CLKEN_WDT1 (1 << 25)
+#define CLKEN_SHA (1 << 26)
+#define CLKEN_OTP (1 << 27)
+#define CLKEN_RTC (1 << 29)
+
+struct k210_sysctl {
+ void __iomem *regs;
+ struct clk_hw hw;
+};
+
+static void k210_set_bits(u32 val, void __iomem *reg)
+{
+ writel(readl(reg) | val, reg);
+}
+
+static void k210_clear_bits(u32 val, void __iomem *reg)
+{
+ writel(readl(reg) & ~val, reg);
+}
+
+static void k210_pll1_enable(void __iomem *regs)
+{
+ u32 val;
+
+ val = readl(regs + K210_SYSCTL_PLL1);
+ val &= ~GENMASK(19, 0); /* clkr1 = 0 */
+ val |= FIELD_PREP(GENMASK(9, 4), 0x3B); /* clkf1 = 59 */
+ val |= FIELD_PREP(GENMASK(13, 10), 0x3); /* clkod1 = 3 */
+ val |= FIELD_PREP(GENMASK(19, 14), 0x3B); /* bwadj1 = 59 */
+ writel(val, regs + K210_SYSCTL_PLL1);
+
+ k210_clear_bits(PLL_BYPASS, regs + K210_SYSCTL_PLL1);
+ k210_set_bits(PLL_PWR, regs + K210_SYSCTL_PLL1);
+
+ /*
+ * Reset the pll. The magic NOPs come from the Kendryte reference SDK.
+ */
+ k210_clear_bits(PLL_RESET, regs + K210_SYSCTL_PLL1);
+ k210_set_bits(PLL_RESET, regs + K210_SYSCTL_PLL1);
+ nop();
+ nop();
+ k210_clear_bits(PLL_RESET, regs + K210_SYSCTL_PLL1);
+
+ for (;;) {
+ val = readl(regs + K210_SYSCTL_PLL_LOCK);
+ if (val & PLL1_LOCK2)
+ break;
+ writel(val | PLL1_SLIP_CLEAR, regs + K210_SYSCTL_PLL_LOCK);
+ }
+
+ k210_set_bits(PLL_OUT_EN, regs + K210_SYSCTL_PLL1);
+}
+
+static unsigned long k210_sysctl_clk_recalc_rate(struct clk_hw *hw,
+ unsigned long parent_rate)
+{
+ struct k210_sysctl *s = container_of(hw, struct k210_sysctl, hw);
+ u32 clksel0, pll0;
+ u64 pll0_freq, clkr0, clkf0, clkod0;
+
+ /*
+ * If the clock selector is not set, use the base frequency.
+ * Otherwise, use PLL0 frequency with a frequency divisor.
+ */
+ clksel0 = readl(s->regs + K210_SYSCTL_CLKSEL0);
+ if (!(clksel0 & CLKSEL_ACLK))
+ return K210_SYSCTL_CLK0_FREQ;
+
+ /*
+ * Get PLL0 frequency:
+ * freq = base frequency * clkf0 / (clkr0 * clkod0)
+ */
+ pll0 = readl(s->regs + K210_SYSCTL_PLL0);
+ clkr0 = 1 + FIELD_GET(GENMASK(3, 0), pll0);
+ clkf0 = 1 + FIELD_GET(GENMASK(9, 4), pll0);
+ clkod0 = 1 + FIELD_GET(GENMASK(13, 10), pll0);
+ pll0_freq = clkf0 * K210_SYSCTL_CLK0_FREQ / (clkr0 * clkod0);
+
+ /* Get the frequency divisor from the clock selector */
+ return pll0_freq / (2ULL << FIELD_GET(0x00000006, clksel0));
+}
+
+static const struct clk_ops k210_sysctl_clk_ops = {
+ .recalc_rate = k210_sysctl_clk_recalc_rate,
+};
+
+static const struct clk_init_data k210_clk_init_data = {
+ .name = "k210-sysctl-pll1",
+ .ops = &k210_sysctl_clk_ops,
+};
+
+static int k210_sysctl_probe(struct platform_device *pdev)
+{
+ struct k210_sysctl *s;
+ int error;
+
+ pr_info("Kendryte K210 SoC sysctl\n");
+
+ s = devm_kzalloc(&pdev->dev, sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ s->regs = devm_ioremap_resource(&pdev->dev,
+ platform_get_resource(pdev, IORESOURCE_MEM, 0));
+ if (IS_ERR(s->regs))
+ return PTR_ERR(s->regs);
+
+ s->hw.init = &k210_clk_init_data;
+ error = devm_clk_hw_register(&pdev->dev, &s->hw);
+ if (error) {
+ dev_err(&pdev->dev, "failed to register clk");
+ return error;
+ }
+
+ error = devm_of_clk_add_hw_provider(&pdev->dev, of_clk_hw_simple_get,
+ &s->hw);
+ if (error) {
+ dev_err(&pdev->dev, "adding clk provider failed\n");
+ return error;
+ }
+
+ return 0;
+}
+
+static const struct of_device_id k210_sysctl_of_match[] = {
+ { .compatible = "kendryte,k210-sysctl", },
+ {}
+};
+
+static struct platform_driver k210_sysctl_driver = {
+ .driver = {
+ .name = "k210-sysctl",
+ .of_match_table = k210_sysctl_of_match,
+ },
+ .probe = k210_sysctl_probe,
+};
+
+static int __init k210_sysctl_init(void)
+{
+ return platform_driver_register(&k210_sysctl_driver);
+}
+core_initcall(k210_sysctl_init);
+
+/*
+ * This needs to be called very early during initialization, given that
+ * PLL1 needs to be enabled to be able to use all SRAM.
+ */
+static void __init k210_soc_early_init(const void *fdt)
+{
+ void __iomem *regs;
+
+ regs = ioremap(K210_SYSCTL_SYSCTL_BASE_ADDR, 0x1000);
+ if (!regs)
+ panic("K210 sysctl ioremap");
+
+ /* Enable PLL1 to make the KPU SRAM useable */
+ k210_pll1_enable(regs);
+
+ k210_set_bits(PLL_OUT_EN, regs + K210_SYSCTL_PLL0);
+
+ k210_set_bits(CLKEN_CPU | CLKEN_SRAM0 | CLKEN_SRAM1,
+ regs + K210_SYSCTL_CLKEN_CENT);
+ k210_set_bits(CLKEN_ROM | CLKEN_TIMER0 | CLKEN_RTC,
+ regs + K210_SYSCTL_CLKEN_PERI);
+
+ k210_set_bits(CLKSEL_ACLK, regs + K210_SYSCTL_CLKSEL0);
+
+ iounmap(regs);
+}
+SOC_EARLY_INIT_DECLARE(generic_k210, "kendryte,k210", k210_soc_early_init);
diff --git a/drivers/staging/gasket/gasket_core.c b/drivers/staging/gasket/gasket_core.c
index cd181a64f737..8e0575fcb4c8 100644
--- a/drivers/staging/gasket/gasket_core.c
+++ b/drivers/staging/gasket/gasket_core.c
@@ -689,7 +689,7 @@ static bool gasket_mmap_has_permissions(struct gasket_dev *gasket_dev,
/* Make sure that no wrong flags are set. */
requested_permissions =
- (vma->vm_flags & (VM_WRITE | VM_READ | VM_EXEC));
+ (vma->vm_flags & VM_ACCESS_FLAGS);
if (requested_permissions & ~(bar_permissions)) {
dev_dbg(gasket_dev->dev,
"Attempting to map a region with requested permissions 0x%x, but region has permissions 0x%x.\n",
diff --git a/drivers/target/target_core_xcopy.c b/drivers/target/target_core_xcopy.c
index 425c1070de08..bd3ed6ce7571 100644
--- a/drivers/target/target_core_xcopy.c
+++ b/drivers/target/target_core_xcopy.c
@@ -134,7 +134,7 @@ static int target_xcopy_parse_tiddesc_e4(struct se_cmd *se_cmd, struct xcopy_op
* Assigned designator
*/
desig_len = desc[7];
- if (desig_len != 16) {
+ if (desig_len != XCOPY_NAA_IEEE_REGEX_LEN) {
pr_err("XCOPY 0xe4: invalid desig_len: %d\n", (int)desig_len);
return -EINVAL;
}
@@ -315,11 +315,6 @@ static int target_xcopy_parse_segdesc_02(struct se_cmd *se_cmd, struct xcopy_op
xop->nolb, (unsigned long long)xop->src_lba,
(unsigned long long)xop->dst_lba);
- if (dc != 0) {
- xop->dbl = get_unaligned_be24(&desc[29]);
-
- pr_debug("XCOPY seg desc 0x02: DC=1 w/ dbl: %u\n", xop->dbl);
- }
return 0;
}
@@ -415,7 +410,8 @@ static void xcopy_pt_release_cmd(struct se_cmd *se_cmd)
struct xcopy_pt_cmd *xpt_cmd = container_of(se_cmd,
struct xcopy_pt_cmd, se_cmd);
- kfree(xpt_cmd);
+ /* xpt_cmd is on the stack, nothing to free here */
+ pr_debug("xpt_cmd done: %p\n", xpt_cmd);
}
static int xcopy_pt_check_stop_free(struct se_cmd *se_cmd)
@@ -504,7 +500,6 @@ void target_xcopy_release_pt(void)
* @cdb: SCSI CDB to be copied into @xpt_cmd.
* @remote_port: If false, use the LUN through which the XCOPY command has
* been received. If true, use @se_dev->xcopy_lun.
- * @alloc_mem: Whether or not to allocate an SGL list.
*
* Set up a SCSI command (READ or WRITE) that will be used to execute an
* XCOPY command.
@@ -514,12 +509,9 @@ static int target_xcopy_setup_pt_cmd(
struct xcopy_op *xop,
struct se_device *se_dev,
unsigned char *cdb,
- bool remote_port,
- bool alloc_mem)
+ bool remote_port)
{
struct se_cmd *cmd = &xpt_cmd->se_cmd;
- sense_reason_t sense_rc;
- int ret = 0, rc;
/*
* Setup LUN+port to honor reservations based upon xop->op_origin for
@@ -535,46 +527,17 @@ static int target_xcopy_setup_pt_cmd(
cmd->se_cmd_flags |= SCF_SE_LUN_CMD;
cmd->tag = 0;
- sense_rc = target_setup_cmd_from_cdb(cmd, cdb);
- if (sense_rc) {
- ret = -EINVAL;
- goto out;
- }
+ if (target_setup_cmd_from_cdb(cmd, cdb))
+ return -EINVAL;
- if (alloc_mem) {
- rc = target_alloc_sgl(&cmd->t_data_sg, &cmd->t_data_nents,
- cmd->data_length, false, false);
- if (rc < 0) {
- ret = rc;
- goto out;
- }
- /*
- * Set this bit so that transport_free_pages() allows the
- * caller to release SGLs + physical memory allocated by
- * transport_generic_get_mem()..
- */
- cmd->se_cmd_flags |= SCF_PASSTHROUGH_SG_TO_MEM_NOALLOC;
- } else {
- /*
- * Here the previously allocated SGLs for the internal READ
- * are mapped zero-copy to the internal WRITE.
- */
- sense_rc = transport_generic_map_mem_to_cmd(cmd,
- xop->xop_data_sg, xop->xop_data_nents,
- NULL, 0);
- if (sense_rc) {
- ret = -EINVAL;
- goto out;
- }
+ if (transport_generic_map_mem_to_cmd(cmd, xop->xop_data_sg,
+ xop->xop_data_nents, NULL, 0))
+ return -EINVAL;
- pr_debug("Setup PASSTHROUGH_NOALLOC t_data_sg: %p t_data_nents:"
- " %u\n", cmd->t_data_sg, cmd->t_data_nents);
- }
+ pr_debug("Setup PASSTHROUGH_NOALLOC t_data_sg: %p t_data_nents:"
+ " %u\n", cmd->t_data_sg, cmd->t_data_nents);
return 0;
-
-out:
- return ret;
}
static int target_xcopy_issue_pt_cmd(struct xcopy_pt_cmd *xpt_cmd)
@@ -604,20 +567,15 @@ static int target_xcopy_read_source(
sector_t src_lba,
u32 src_sectors)
{
- struct xcopy_pt_cmd *xpt_cmd;
- struct se_cmd *se_cmd;
+ struct xcopy_pt_cmd xpt_cmd;
+ struct se_cmd *se_cmd = &xpt_cmd.se_cmd;
u32 length = (src_sectors * src_dev->dev_attrib.block_size);
int rc;
unsigned char cdb[16];
bool remote_port = (xop->op_origin == XCOL_DEST_RECV_OP);
- xpt_cmd = kzalloc(sizeof(struct xcopy_pt_cmd), GFP_KERNEL);
- if (!xpt_cmd) {
- pr_err("Unable to allocate xcopy_pt_cmd\n");
- return -ENOMEM;
- }
- init_completion(&xpt_cmd->xpt_passthrough_sem);
- se_cmd = &xpt_cmd->se_cmd;
+ memset(&xpt_cmd, 0, sizeof(xpt_cmd));
+ init_completion(&xpt_cmd.xpt_passthrough_sem);
memset(&cdb[0], 0, 16);
cdb[0] = READ_16;
@@ -627,36 +585,24 @@ static int target_xcopy_read_source(
(unsigned long long)src_lba, src_sectors, length);
transport_init_se_cmd(se_cmd, &xcopy_pt_tfo, &xcopy_pt_sess, length,
- DMA_FROM_DEVICE, 0, &xpt_cmd->sense_buffer[0]);
- xop->src_pt_cmd = xpt_cmd;
+ DMA_FROM_DEVICE, 0, &xpt_cmd.sense_buffer[0]);
- rc = target_xcopy_setup_pt_cmd(xpt_cmd, xop, src_dev, &cdb[0],
- remote_port, true);
+ rc = target_xcopy_setup_pt_cmd(&xpt_cmd, xop, src_dev, &cdb[0],
+ remote_port);
if (rc < 0) {
- ec_cmd->scsi_status = xpt_cmd->se_cmd.scsi_status;
- transport_generic_free_cmd(se_cmd, 0);
- return rc;
+ ec_cmd->scsi_status = se_cmd->scsi_status;
+ goto out;
}
- xop->xop_data_sg = se_cmd->t_data_sg;
- xop->xop_data_nents = se_cmd->t_data_nents;
pr_debug("XCOPY-READ: Saved xop->xop_data_sg: %p, num: %u for READ"
" memory\n", xop->xop_data_sg, xop->xop_data_nents);
- rc = target_xcopy_issue_pt_cmd(xpt_cmd);
- if (rc < 0) {
- ec_cmd->scsi_status = xpt_cmd->se_cmd.scsi_status;
- transport_generic_free_cmd(se_cmd, 0);
- return rc;
- }
- /*
- * Clear off the allocated t_data_sg, that has been saved for
- * zero-copy WRITE submission reuse in struct xcopy_op..
- */
- se_cmd->t_data_sg = NULL;
- se_cmd->t_data_nents = 0;
-
- return 0;
+ rc = target_xcopy_issue_pt_cmd(&xpt_cmd);
+ if (rc < 0)
+ ec_cmd->scsi_status = se_cmd->scsi_status;
+out:
+ transport_generic_free_cmd(se_cmd, 0);
+ return rc;
}
static int target_xcopy_write_destination(
@@ -666,20 +612,15 @@ static int target_xcopy_write_destination(
sector_t dst_lba,
u32 dst_sectors)
{
- struct xcopy_pt_cmd *xpt_cmd;
- struct se_cmd *se_cmd;
+ struct xcopy_pt_cmd xpt_cmd;
+ struct se_cmd *se_cmd = &xpt_cmd.se_cmd;
u32 length = (dst_sectors * dst_dev->dev_attrib.block_size);
int rc;
unsigned char cdb[16];
bool remote_port = (xop->op_origin == XCOL_SOURCE_RECV_OP);
- xpt_cmd = kzalloc(sizeof(struct xcopy_pt_cmd), GFP_KERNEL);
- if (!xpt_cmd) {
- pr_err("Unable to allocate xcopy_pt_cmd\n");
- return -ENOMEM;
- }
- init_completion(&xpt_cmd->xpt_passthrough_sem);
- se_cmd = &xpt_cmd->se_cmd;
+ memset(&xpt_cmd, 0, sizeof(xpt_cmd));
+ init_completion(&xpt_cmd.xpt_passthrough_sem);
memset(&cdb[0], 0, 16);
cdb[0] = WRITE_16;
@@ -689,36 +630,21 @@ static int target_xcopy_write_destination(
(unsigned long long)dst_lba, dst_sectors, length);
transport_init_se_cmd(se_cmd, &xcopy_pt_tfo, &xcopy_pt_sess, length,
- DMA_TO_DEVICE, 0, &xpt_cmd->sense_buffer[0]);
- xop->dst_pt_cmd = xpt_cmd;
+ DMA_TO_DEVICE, 0, &xpt_cmd.sense_buffer[0]);
- rc = target_xcopy_setup_pt_cmd(xpt_cmd, xop, dst_dev, &cdb[0],
- remote_port, false);
+ rc = target_xcopy_setup_pt_cmd(&xpt_cmd, xop, dst_dev, &cdb[0],
+ remote_port);
if (rc < 0) {
- struct se_cmd *src_cmd = &xop->src_pt_cmd->se_cmd;
- ec_cmd->scsi_status = xpt_cmd->se_cmd.scsi_status;
- /*
- * If the failure happened before the t_mem_list hand-off in
- * target_xcopy_setup_pt_cmd(), Reset memory + clear flag so that
- * core releases this memory on error during X-COPY WRITE I/O.
- */
- src_cmd->se_cmd_flags &= ~SCF_PASSTHROUGH_SG_TO_MEM_NOALLOC;
- src_cmd->t_data_sg = xop->xop_data_sg;
- src_cmd->t_data_nents = xop->xop_data_nents;
-
- transport_generic_free_cmd(se_cmd, 0);
- return rc;
- }
-
- rc = target_xcopy_issue_pt_cmd(xpt_cmd);
- if (rc < 0) {
- ec_cmd->scsi_status = xpt_cmd->se_cmd.scsi_status;
- se_cmd->se_cmd_flags &= ~SCF_PASSTHROUGH_SG_TO_MEM_NOALLOC;
- transport_generic_free_cmd(se_cmd, 0);
- return rc;
+ ec_cmd->scsi_status = se_cmd->scsi_status;
+ goto out;
}
- return 0;
+ rc = target_xcopy_issue_pt_cmd(&xpt_cmd);
+ if (rc < 0)
+ ec_cmd->scsi_status = se_cmd->scsi_status;
+out:
+ transport_generic_free_cmd(se_cmd, 0);
+ return rc;
}
static void target_xcopy_do_work(struct work_struct *work)
@@ -729,7 +655,7 @@ static void target_xcopy_do_work(struct work_struct *work)
sector_t src_lba, dst_lba, end_lba;
unsigned int max_sectors;
int rc = 0;
- unsigned short nolb, cur_nolb, max_nolb, copied_nolb = 0;
+ unsigned short nolb, max_nolb, copied_nolb = 0;
if (target_parse_xcopy_cmd(xop) != TCM_NO_SENSE)
goto err_free;
@@ -759,7 +685,23 @@ static void target_xcopy_do_work(struct work_struct *work)
(unsigned long long)src_lba, (unsigned long long)dst_lba);
while (src_lba < end_lba) {
- cur_nolb = min(nolb, max_nolb);
+ unsigned short cur_nolb = min(nolb, max_nolb);
+ u32 cur_bytes = cur_nolb * src_dev->dev_attrib.block_size;
+
+ if (cur_bytes != xop->xop_data_bytes) {
+ /*
+ * (Re)allocate a buffer large enough to hold the XCOPY
+ * I/O size, which can be reused each read / write loop.
+ */
+ target_free_sgl(xop->xop_data_sg, xop->xop_data_nents);
+ rc = target_alloc_sgl(&xop->xop_data_sg,
+ &xop->xop_data_nents,
+ cur_bytes,
+ false, false);
+ if (rc < 0)
+ goto out;
+ xop->xop_data_bytes = cur_bytes;
+ }
pr_debug("target_xcopy_do_work: Calling read src_dev: %p src_lba: %llu,"
" cur_nolb: %hu\n", src_dev, (unsigned long long)src_lba, cur_nolb);
@@ -777,10 +719,8 @@ static void target_xcopy_do_work(struct work_struct *work)
rc = target_xcopy_write_destination(ec_cmd, xop, dst_dev,
dst_lba, cur_nolb);
- if (rc < 0) {
- transport_generic_free_cmd(&xop->src_pt_cmd->se_cmd, 0);
+ if (rc < 0)
goto out;
- }
dst_lba += cur_nolb;
pr_debug("target_xcopy_do_work: Incremented WRITE dst_lba to %llu\n",
@@ -788,14 +728,10 @@ static void target_xcopy_do_work(struct work_struct *work)
copied_nolb += cur_nolb;
nolb -= cur_nolb;
-
- transport_generic_free_cmd(&xop->src_pt_cmd->se_cmd, 0);
- xop->dst_pt_cmd->se_cmd.se_cmd_flags &= ~SCF_PASSTHROUGH_SG_TO_MEM_NOALLOC;
-
- transport_generic_free_cmd(&xop->dst_pt_cmd->se_cmd, 0);
}
xcopy_pt_undepend_remotedev(xop);
+ target_free_sgl(xop->xop_data_sg, xop->xop_data_nents);
kfree(xop);
pr_debug("target_xcopy_do_work: Final src_lba: %llu, dst_lba: %llu\n",
@@ -809,6 +745,7 @@ static void target_xcopy_do_work(struct work_struct *work)
out:
xcopy_pt_undepend_remotedev(xop);
+ target_free_sgl(xop->xop_data_sg, xop->xop_data_nents);
err_free:
kfree(xop);
diff --git a/drivers/target/target_core_xcopy.h b/drivers/target/target_core_xcopy.h
index 26ba4c3c9cff..c56a1bde9417 100644
--- a/drivers/target/target_core_xcopy.h
+++ b/drivers/target/target_core_xcopy.h
@@ -5,7 +5,7 @@
#define XCOPY_TARGET_DESC_LEN 32
#define XCOPY_SEGMENT_DESC_LEN 28
#define XCOPY_NAA_IEEE_REGEX_LEN 16
-#define XCOPY_MAX_SECTORS 1024
+#define XCOPY_MAX_SECTORS 4096
/*
* SPC4r37 6.4.6.1
@@ -18,8 +18,6 @@ enum xcopy_origin_list {
XCOL_DEST_RECV_OP = 0x02,
};
-struct xcopy_pt_cmd;
-
struct xcopy_op {
int op_origin;
@@ -35,11 +33,8 @@ struct xcopy_op {
unsigned short stdi;
unsigned short dtdi;
unsigned short nolb;
- unsigned int dbl;
-
- struct xcopy_pt_cmd *src_pt_cmd;
- struct xcopy_pt_cmd *dst_pt_cmd;
+ u32 xop_data_bytes;
u32 xop_data_nents;
struct scatterlist *xop_data_sg;
struct work_struct xop_work;
diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig
index 5a05db5438d6..91af271e9bb0 100644
--- a/drivers/thermal/Kconfig
+++ b/drivers/thermal/Kconfig
@@ -1,17 +1,18 @@
# SPDX-License-Identifier: GPL-2.0-only
#
-# Generic thermal sysfs drivers configuration
+# Generic thermal drivers configuration
#
menuconfig THERMAL
- bool "Generic Thermal sysfs driver"
+ bool "Thermal drivers"
help
- Generic Thermal Sysfs driver offers a generic mechanism for
+ Thermal drivers offer a generic mechanism for
thermal management. Usually it's made up of one or more thermal
- zone and cooling device.
+ zones and cooling devices.
Each thermal zone contains its own temperature, trip points,
- cooling devices.
- All platforms with ACPI thermal support can use this driver.
+ and cooling devices.
+ All platforms with ACPI or Open Firmware thermal support can use
+ this driver.
If you want this support, you should say Y here.
if THERMAL
@@ -251,6 +252,27 @@ config IMX_THERMAL
cpufreq is used as the cooling device to throttle CPUs when the
passive trip is crossed.
+config IMX_SC_THERMAL
+ tristate "Temperature sensor driver for NXP i.MX SoCs with System Controller"
+ depends on IMX_SCU
+ depends on OF
+ help
+ Support for Temperature Monitor (TEMPMON) found on NXP i.MX SoCs with
+ system controller inside, Linux kernel has to communicate with system
+ controller via MU (message unit) IPC to get temperature from thermal
+ sensor. It supports one critical trip point and one
+ passive trip point for each thermal sensor.
+
+config IMX8MM_THERMAL
+ tristate "Temperature sensor driver for Freescale i.MX8MM SoC"
+ depends on ARCH_MXC || COMPILE_TEST
+ depends on OF
+ help
+ Support for Thermal Monitoring Unit (TMU) found on Freescale i.MX8MM SoC.
+ It supports one critical trip point and one passive trip point. The
+ cpufreq is used as the cooling device to throttle CPUs when the passive
+ trip is crossed.
+
config MAX77620_THERMAL
tristate "Temperature sensor driver for Maxim MAX77620 PMIC"
depends on MFD_MAX77620
@@ -265,6 +287,7 @@ config QORIQ_THERMAL
tristate "QorIQ Thermal Monitoring Unit"
depends on THERMAL_OF
depends on HAS_IOMEM
+ select REGMAP_MMIO
help
Support for Thermal Monitoring Unit (TMU) found on QorIQ platforms.
It supports one critical trip point and one passive trip point. The
@@ -460,4 +483,11 @@ config UNIPHIER_THERMAL
Enable this to plug in UniPhier on-chip PVT thermal driver into the
thermal framework. The driver supports CPU thermal zone temperature
reporting and a couple of trip points.
+
+config SPRD_THERMAL
+ tristate "Temperature sensor on Spreadtrum SoCs"
+ depends on ARCH_SPRD || COMPILE_TEST
+ help
+ Support for the Spreadtrum thermal sensor driver in the Linux thermal
+ framework.
endif
diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile
index 9fb88e26fb10..8c8ed7b79915 100644
--- a/drivers/thermal/Makefile
+++ b/drivers/thermal/Makefile
@@ -43,6 +43,8 @@ obj-$(CONFIG_DB8500_THERMAL) += db8500_thermal.o
obj-$(CONFIG_ARMADA_THERMAL) += armada_thermal.o
obj-$(CONFIG_TANGO_THERMAL) += tango_thermal.o
obj-$(CONFIG_IMX_THERMAL) += imx_thermal.o
+obj-$(CONFIG_IMX_SC_THERMAL) += imx_sc_thermal.o
+obj-$(CONFIG_IMX8MM_THERMAL) += imx8mm_thermal.o
obj-$(CONFIG_MAX77620_THERMAL) += max77620_thermal.o
obj-$(CONFIG_QORIQ_THERMAL) += qoriq_thermal.o
obj-$(CONFIG_DA9062_THERMAL) += da9062-thermal.o
@@ -57,3 +59,4 @@ obj-$(CONFIG_GENERIC_ADC_THERMAL) += thermal-generic-adc.o
obj-$(CONFIG_ZX2967_THERMAL) += zx2967_thermal.o
obj-$(CONFIG_UNIPHIER_THERMAL) += uniphier_thermal.o
obj-$(CONFIG_AMLOGIC_THERMAL) += amlogic_thermal.o
+obj-$(CONFIG_SPRD_THERMAL) += sprd_thermal.o
diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c
index 4ae8c856c88e..e297e135c031 100644
--- a/drivers/thermal/cpufreq_cooling.c
+++ b/drivers/thermal/cpufreq_cooling.c
@@ -273,7 +273,7 @@ static int cpufreq_state2power(struct thermal_cooling_device *cdev,
struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
/* Request state should be less than max_level */
- if (WARN_ON(state > cpufreq_cdev->max_level))
+ if (state > cpufreq_cdev->max_level)
return -EINVAL;
num_cpus = cpumask_weight(cpufreq_cdev->policy->cpus);
@@ -437,7 +437,7 @@ static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev,
int ret;
/* Request state should be less than max_level */
- if (WARN_ON(state > cpufreq_cdev->max_level))
+ if (state > cpufreq_cdev->max_level)
return -EINVAL;
/* Check if the old cooling action is same as new cooling action */
@@ -456,6 +456,7 @@ static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev,
capacity = frequency * max_capacity;
capacity /= cpufreq_cdev->policy->cpuinfo.max_freq;
arch_set_thermal_pressure(cpus, max_capacity - capacity);
+ ret = 0;
}
return ret;
diff --git a/drivers/thermal/imx8mm_thermal.c b/drivers/thermal/imx8mm_thermal.c
new file mode 100644
index 000000000000..0d60f8d7894f
--- /dev/null
+++ b/drivers/thermal/imx8mm_thermal.c
@@ -0,0 +1,236 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2020 NXP.
+ *
+ * Author: Anson Huang <Anson.Huang@nxp.com>
+ */
+
+#include <linux/bitfield.h>
+#include <linux/clk.h>
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/thermal.h>
+
+#include "thermal_core.h"
+
+#define TER 0x0 /* TMU enable */
+#define TPS 0x4
+#define TRITSR 0x20 /* TMU immediate temp */
+
+#define TER_EN BIT(31)
+#define TRITSR_TEMP0_VAL_MASK 0xff
+#define TRITSR_TEMP1_VAL_MASK 0xff0000
+
+#define PROBE_SEL_ALL GENMASK(31, 30)
+
+#define probe_status_offset(x) (30 + x)
+#define SIGN_BIT BIT(7)
+#define TEMP_VAL_MASK GENMASK(6, 0)
+
+#define VER1_TEMP_LOW_LIMIT 10000
+#define VER2_TEMP_LOW_LIMIT -40000
+#define VER2_TEMP_HIGH_LIMIT 125000
+
+#define TMU_VER1 0x1
+#define TMU_VER2 0x2
+
+struct thermal_soc_data {
+ u32 num_sensors;
+ u32 version;
+ int (*get_temp)(void *, int *);
+};
+
+struct tmu_sensor {
+ struct imx8mm_tmu *priv;
+ u32 hw_id;
+ struct thermal_zone_device *tzd;
+};
+
+struct imx8mm_tmu {
+ void __iomem *base;
+ struct clk *clk;
+ const struct thermal_soc_data *socdata;
+ struct tmu_sensor sensors[0];
+};
+
+static int imx8mm_tmu_get_temp(void *data, int *temp)
+{
+ struct tmu_sensor *sensor = data;
+ struct imx8mm_tmu *tmu = sensor->priv;
+ u32 val;
+
+ val = readl_relaxed(tmu->base + TRITSR) & TRITSR_TEMP0_VAL_MASK;
+ *temp = val * 1000;
+ if (*temp < VER1_TEMP_LOW_LIMIT)
+ return -EAGAIN;
+
+ return 0;
+}
+
+static int imx8mp_tmu_get_temp(void *data, int *temp)
+{
+ struct tmu_sensor *sensor = data;
+ struct imx8mm_tmu *tmu = sensor->priv;
+ unsigned long val;
+ bool ready;
+
+ val = readl_relaxed(tmu->base + TRITSR);
+ ready = test_bit(probe_status_offset(sensor->hw_id), &val);
+ if (!ready)
+ return -EAGAIN;
+
+ val = sensor->hw_id ? FIELD_GET(TRITSR_TEMP1_VAL_MASK, val) :
+ FIELD_GET(TRITSR_TEMP0_VAL_MASK, val);
+ if (val & SIGN_BIT) /* negative */
+ val = (~(val & TEMP_VAL_MASK) + 1);
+
+ *temp = val * 1000;
+ if (*temp < VER2_TEMP_LOW_LIMIT || *temp > VER2_TEMP_HIGH_LIMIT)
+ return -EAGAIN;
+
+ return 0;
+}
+
+static int tmu_get_temp(void *data, int *temp)
+{
+ struct tmu_sensor *sensor = data;
+ struct imx8mm_tmu *tmu = sensor->priv;
+
+ return tmu->socdata->get_temp(data, temp);
+}
+
+static struct thermal_zone_of_device_ops tmu_tz_ops = {
+ .get_temp = tmu_get_temp,
+};
+
+static void imx8mm_tmu_enable(struct imx8mm_tmu *tmu, bool enable)
+{
+ u32 val;
+
+ val = readl_relaxed(tmu->base + TER);
+ val = enable ? (val | TER_EN) : (val & ~TER_EN);
+ writel_relaxed(val, tmu->base + TER);
+}
+
+static void imx8mm_tmu_probe_sel_all(struct imx8mm_tmu *tmu)
+{
+ u32 val;
+
+ val = readl_relaxed(tmu->base + TPS);
+ val |= PROBE_SEL_ALL;
+ writel_relaxed(val, tmu->base + TPS);
+}
+
+static int imx8mm_tmu_probe(struct platform_device *pdev)
+{
+ const struct thermal_soc_data *data;
+ struct imx8mm_tmu *tmu;
+ int ret;
+ int i;
+
+ data = of_device_get_match_data(&pdev->dev);
+
+ tmu = devm_kzalloc(&pdev->dev, struct_size(tmu, sensors,
+ data->num_sensors), GFP_KERNEL);
+ if (!tmu)
+ return -ENOMEM;
+
+ tmu->socdata = data;
+
+ tmu->base = devm_platform_ioremap_resource(pdev, 0);
+ if (IS_ERR(tmu->base))
+ return PTR_ERR(tmu->base);
+
+ tmu->clk = devm_clk_get(&pdev->dev, NULL);
+ if (IS_ERR(tmu->clk)) {
+ ret = PTR_ERR(tmu->clk);
+ if (ret != -EPROBE_DEFER)
+ dev_err(&pdev->dev,
+ "failed to get tmu clock: %d\n", ret);
+ return ret;
+ }
+
+ ret = clk_prepare_enable(tmu->clk);
+ if (ret) {
+ dev_err(&pdev->dev, "failed to enable tmu clock: %d\n", ret);
+ return ret;
+ }
+
+ /* disable the monitor during initialization */
+ imx8mm_tmu_enable(tmu, false);
+
+ for (i = 0; i < data->num_sensors; i++) {
+ tmu->sensors[i].priv = tmu;
+ tmu->sensors[i].tzd =
+ devm_thermal_zone_of_sensor_register(&pdev->dev, i,
+ &tmu->sensors[i],
+ &tmu_tz_ops);
+ if (IS_ERR(tmu->sensors[i].tzd)) {
+ dev_err(&pdev->dev,
+ "failed to register thermal zone sensor[%d]: %d\n",
+ i, ret);
+ return PTR_ERR(tmu->sensors[i].tzd);
+ }
+ tmu->sensors[i].hw_id = i;
+ }
+
+ platform_set_drvdata(pdev, tmu);
+
+ /* enable all the probes for V2 TMU */
+ if (tmu->socdata->version == TMU_VER2)
+ imx8mm_tmu_probe_sel_all(tmu);
+
+ /* enable the monitor */
+ imx8mm_tmu_enable(tmu, true);
+
+ return 0;
+}
+
+static int imx8mm_tmu_remove(struct platform_device *pdev)
+{
+ struct imx8mm_tmu *tmu = platform_get_drvdata(pdev);
+
+ /* disable TMU */
+ imx8mm_tmu_enable(tmu, false);
+
+ clk_disable_unprepare(tmu->clk);
+ platform_set_drvdata(pdev, NULL);
+
+ return 0;
+}
+
+static struct thermal_soc_data imx8mm_tmu_data = {
+ .num_sensors = 1,
+ .version = TMU_VER1,
+ .get_temp = imx8mm_tmu_get_temp,
+};
+
+static struct thermal_soc_data imx8mp_tmu_data = {
+ .num_sensors = 2,
+ .version = TMU_VER2,
+ .get_temp = imx8mp_tmu_get_temp,
+};
+
+static const struct of_device_id imx8mm_tmu_table[] = {
+ { .compatible = "fsl,imx8mm-tmu", .data = &imx8mm_tmu_data, },
+ { .compatible = "fsl,imx8mp-tmu", .data = &imx8mp_tmu_data, },
+ { },
+};
+
+static struct platform_driver imx8mm_tmu = {
+ .driver = {
+ .name = "i.mx8mm_thermal",
+ .of_match_table = imx8mm_tmu_table,
+ },
+ .probe = imx8mm_tmu_probe,
+ .remove = imx8mm_tmu_remove,
+};
+module_platform_driver(imx8mm_tmu);
+
+MODULE_AUTHOR("Anson Huang <Anson.Huang@nxp.com>");
+MODULE_DESCRIPTION("i.MX8MM Thermal Monitor Unit driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/thermal/imx_sc_thermal.c b/drivers/thermal/imx_sc_thermal.c
new file mode 100644
index 000000000000..a8723b1eb8b0
--- /dev/null
+++ b/drivers/thermal/imx_sc_thermal.c
@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright 2018-2020 NXP.
+ */
+
+#include <linux/err.h>
+#include <linux/firmware/imx/sci.h>
+#include <linux/firmware/imx/types.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/thermal.h>
+
+#include "thermal_core.h"
+
+#define IMX_SC_MISC_FUNC_GET_TEMP 13
+
+static struct imx_sc_ipc *thermal_ipc_handle;
+
+struct imx_sc_sensor {
+ struct thermal_zone_device *tzd;
+ u32 resource_id;
+};
+
+struct req_get_temp {
+ u16 resource_id;
+ u8 type;
+} __packed __aligned(4);
+
+struct resp_get_temp {
+ s16 celsius;
+ s8 tenths;
+} __packed __aligned(4);
+
+struct imx_sc_msg_misc_get_temp {
+ struct imx_sc_rpc_msg hdr;
+ union {
+ struct req_get_temp req;
+ struct resp_get_temp resp;
+ } data;
+} __packed __aligned(4);
+
+static int imx_sc_thermal_get_temp(void *data, int *temp)
+{
+ struct imx_sc_msg_misc_get_temp msg;
+ struct imx_sc_rpc_msg *hdr = &msg.hdr;
+ struct imx_sc_sensor *sensor = data;
+ int ret;
+
+ msg.data.req.resource_id = sensor->resource_id;
+ msg.data.req.type = IMX_SC_C_TEMP;
+
+ hdr->ver = IMX_SC_RPC_VERSION;
+ hdr->svc = IMX_SC_RPC_SVC_MISC;
+ hdr->func = IMX_SC_MISC_FUNC_GET_TEMP;
+ hdr->size = 2;
+
+ ret = imx_scu_call_rpc(thermal_ipc_handle, &msg, true);
+ if (ret) {
+ dev_err(&sensor->tzd->device, "read temp sensor %d failed, ret %d\n",
+ sensor->resource_id, ret);
+ return ret;
+ }
+
+ *temp = msg.data.resp.celsius * 1000 + msg.data.resp.tenths * 100;
+
+ return 0;
+}
+
+static const struct thermal_zone_of_device_ops imx_sc_thermal_ops = {
+ .get_temp = imx_sc_thermal_get_temp,
+};
+
+static int imx_sc_thermal_probe(struct platform_device *pdev)
+{
+ struct device_node *np, *child, *sensor_np;
+ struct imx_sc_sensor *sensor;
+ int ret;
+
+ ret = imx_scu_get_handle(&thermal_ipc_handle);
+ if (ret)
+ return ret;
+
+ np = of_find_node_by_name(NULL, "thermal-zones");
+ if (!np)
+ return -ENODEV;
+
+ sensor_np = of_node_get(pdev->dev.of_node);
+
+ for_each_available_child_of_node(np, child) {
+ sensor = devm_kzalloc(&pdev->dev, sizeof(*sensor), GFP_KERNEL);
+ if (!sensor) {
+ of_node_put(sensor_np);
+ return -ENOMEM;
+ }
+
+ ret = thermal_zone_of_get_sensor_id(child,
+ sensor_np,
+ &sensor->resource_id);
+ if (ret < 0) {
+ dev_err(&pdev->dev,
+ "failed to get valid sensor resource id: %d\n",
+ ret);
+ break;
+ }
+
+ sensor->tzd = devm_thermal_zone_of_sensor_register(&pdev->dev,
+ sensor->resource_id,
+ sensor,
+ &imx_sc_thermal_ops);
+ if (IS_ERR(sensor->tzd)) {
+ dev_err(&pdev->dev, "failed to register thermal zone\n");
+ ret = PTR_ERR(sensor->tzd);
+ break;
+ }
+ }
+
+ of_node_put(sensor_np);
+
+ return ret;
+}
+
+static int imx_sc_thermal_remove(struct platform_device *pdev)
+{
+ return 0;
+}
+
+static const struct of_device_id imx_sc_thermal_table[] = {
+ { .compatible = "fsl,imx-sc-thermal", },
+ {}
+};
+MODULE_DEVICE_TABLE(of, imx_sc_thermal_table);
+
+static struct platform_driver imx_sc_thermal_driver = {
+ .probe = imx_sc_thermal_probe,
+ .remove = imx_sc_thermal_remove,
+ .driver = {
+ .name = "imx-sc-thermal",
+ .of_match_table = imx_sc_thermal_table,
+ },
+};
+module_platform_driver(imx_sc_thermal_driver);
+
+MODULE_AUTHOR("Anson Huang <Anson.Huang@nxp.com>");
+MODULE_DESCRIPTION("Thermal driver for NXP i.MX SoCs with system controller");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/thermal/imx_thermal.c b/drivers/thermal/imx_thermal.c
index bb6754a5342c..e761c9b42217 100644
--- a/drivers/thermal/imx_thermal.c
+++ b/drivers/thermal/imx_thermal.c
@@ -3,24 +3,17 @@
// Copyright 2013 Freescale Semiconductor, Inc.
#include <linux/clk.h>
-#include <linux/cpu.h>
#include <linux/cpufreq.h>
#include <linux/cpu_cooling.h>
#include <linux/delay.h>
-#include <linux/device.h>
-#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
-#include <linux/kernel.h>
#include <linux/mfd/syscon.h>
#include <linux/module.h>
#include <linux/of.h>
#include <linux/of_device.h>
-#include <linux/platform_device.h>
#include <linux/regmap.h>
-#include <linux/slab.h>
#include <linux/thermal.h>
-#include <linux/types.h>
#include <linux/nvmem-consumer.h>
#define REG_SET 0x4
@@ -872,14 +865,12 @@ static int imx_thermal_remove(struct platform_device *pdev)
clk_disable_unprepare(data->thermal_clk);
thermal_zone_device_unregister(data->tz);
- cpufreq_cooling_unregister(data->cdev);
- cpufreq_cpu_put(data->policy);
+ imx_thermal_unregister_legacy_cooling(data);
return 0;
}
-#ifdef CONFIG_PM_SLEEP
-static int imx_thermal_suspend(struct device *dev)
+static int __maybe_unused imx_thermal_suspend(struct device *dev)
{
struct imx_thermal_data *data = dev_get_drvdata(dev);
struct regmap *map = data->tempmon;
@@ -900,7 +891,7 @@ static int imx_thermal_suspend(struct device *dev)
return 0;
}
-static int imx_thermal_resume(struct device *dev)
+static int __maybe_unused imx_thermal_resume(struct device *dev)
{
struct imx_thermal_data *data = dev_get_drvdata(dev);
struct regmap *map = data->tempmon;
@@ -918,7 +909,6 @@ static int imx_thermal_resume(struct device *dev)
return 0;
}
-#endif
static SIMPLE_DEV_PM_OPS(imx_thermal_pm_ops,
imx_thermal_suspend, imx_thermal_resume);
diff --git a/drivers/thermal/intel/int340x_thermal/int3400_thermal.c b/drivers/thermal/intel/int340x_thermal/int3400_thermal.c
index 6cad15eb9cf4..ceef89c956bd 100644
--- a/drivers/thermal/intel/int340x_thermal/int3400_thermal.c
+++ b/drivers/thermal/intel/int340x_thermal/int3400_thermal.c
@@ -65,7 +65,7 @@ static ssize_t available_uuids_show(struct device *dev,
for (i = 0; i < INT3400_THERMAL_MAXIMUM_UUID; i++) {
if (priv->uuid_bitmap & (1 << i))
if (PAGE_SIZE - length > 0)
- length += snprintf(&buf[length],
+ length += scnprintf(&buf[length],
PAGE_SIZE - length,
"%s\n",
int3400_thermal_uuids[i]);
diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_device.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_device.c
index b1fd34516e28..297db1d2d960 100644
--- a/drivers/thermal/intel/int340x_thermal/processor_thermal_device.c
+++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_device.c
@@ -45,6 +45,9 @@
/* JasperLake thermal reporting device */
#define PCI_DEVICE_ID_PROC_JSL_THERMAL 0x4503
+/* TigerLake thermal reporting device */
+#define PCI_DEVICE_ID_PROC_TGL_THERMAL 0x9A03
+
#define DRV_NAME "proc_thermal"
struct power_config {
@@ -728,6 +731,8 @@ static const struct pci_device_id proc_thermal_pci_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_ICL_THERMAL),
.driver_data = (kernel_ulong_t)&rapl_mmio_hsw, },
{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_JSL_THERMAL)},
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_TGL_THERMAL),
+ .driver_data = (kernel_ulong_t)&rapl_mmio_hsw, },
{ 0, },
};
diff --git a/drivers/thermal/of-thermal.c b/drivers/thermal/of-thermal.c
index ef0baa954ff0..874a47d6923f 100644
--- a/drivers/thermal/of-thermal.c
+++ b/drivers/thermal/of-thermal.c
@@ -449,6 +449,50 @@ thermal_zone_of_add_sensor(struct device_node *zone,
}
/**
+ * thermal_zone_of_get_sensor_id - get sensor ID from a DT thermal zone
+ * @tz_np: a valid thermal zone device node.
+ * @sensor_np: a sensor node of a valid sensor device.
+ * @id: the sensor ID returned if success.
+ *
+ * This function will get sensor ID from a given thermal zone node and
+ * the sensor node must match the temperature provider @sensor_np.
+ *
+ * Return: 0 on success, proper error code otherwise.
+ */
+
+int thermal_zone_of_get_sensor_id(struct device_node *tz_np,
+ struct device_node *sensor_np,
+ u32 *id)
+{
+ struct of_phandle_args sensor_specs;
+ int ret;
+
+ ret = of_parse_phandle_with_args(tz_np,
+ "thermal-sensors",
+ "#thermal-sensor-cells",
+ 0,
+ &sensor_specs);
+ if (ret)
+ return ret;
+
+ if (sensor_specs.np != sensor_np) {
+ of_node_put(sensor_specs.np);
+ return -ENODEV;
+ }
+
+ if (sensor_specs.args_count > 1)
+ pr_warn("%pOFn: too many cells in sensor specifier %d\n",
+ sensor_specs.np, sensor_specs.args_count);
+
+ *id = sensor_specs.args_count ? sensor_specs.args[0] : 0;
+
+ of_node_put(sensor_specs.np);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(thermal_zone_of_get_sensor_id);
+
+/**
* thermal_zone_of_sensor_register - registers a sensor to a DT thermal zone
* @dev: a valid struct device pointer of a sensor device. Must contain
* a valid .of_node, for the sensor node.
@@ -499,36 +543,22 @@ thermal_zone_of_sensor_register(struct device *dev, int sensor_id, void *data,
sensor_np = of_node_get(dev->of_node);
for_each_available_child_of_node(np, child) {
- struct of_phandle_args sensor_specs;
int ret, id;
/* For now, thermal framework supports only 1 sensor per zone */
- ret = of_parse_phandle_with_args(child, "thermal-sensors",
- "#thermal-sensor-cells",
- 0, &sensor_specs);
+ ret = thermal_zone_of_get_sensor_id(child, sensor_np, &id);
if (ret)
continue;
- if (sensor_specs.args_count >= 1) {
- id = sensor_specs.args[0];
- WARN(sensor_specs.args_count > 1,
- "%pOFn: too many cells in sensor specifier %d\n",
- sensor_specs.np, sensor_specs.args_count);
- } else {
- id = 0;
- }
-
- if (sensor_specs.np == sensor_np && id == sensor_id) {
+ if (id == sensor_id) {
tzd = thermal_zone_of_add_sensor(child, sensor_np,
data, ops);
if (!IS_ERR(tzd))
tzd->ops->set_mode(tzd, THERMAL_DEVICE_ENABLED);
- of_node_put(sensor_specs.np);
of_node_put(child);
goto exit;
}
- of_node_put(sensor_specs.np);
}
exit:
of_node_put(sensor_np);
diff --git a/drivers/thermal/qcom/tsens-8960.c b/drivers/thermal/qcom/tsens-8960.c
index fb77acb8d13b..2a28a5af209e 100644
--- a/drivers/thermal/qcom/tsens-8960.c
+++ b/drivers/thermal/qcom/tsens-8960.c
@@ -245,7 +245,7 @@ static inline int code_to_mdegC(u32 adc_code, const struct tsens_sensor *s)
return adc_code * slope + offset;
}
-static int get_temp_8960(struct tsens_sensor *s, int *temp)
+static int get_temp_8960(const struct tsens_sensor *s, int *temp)
{
int ret;
u32 code, trdy;
@@ -279,7 +279,7 @@ static const struct tsens_ops ops_8960 = {
.resume = resume_8960,
};
-const struct tsens_plat_data data_8960 = {
+struct tsens_plat_data data_8960 = {
.num_sensors = 11,
.ops = &ops_8960,
};
diff --git a/drivers/thermal/qcom/tsens-common.c b/drivers/thermal/qcom/tsens-common.c
index c8d57ee0a5bb..172545366636 100644
--- a/drivers/thermal/qcom/tsens-common.c
+++ b/drivers/thermal/qcom/tsens-common.c
@@ -23,6 +23,10 @@
* @low_thresh: lower threshold temperature value
* @low_irq_mask: mask register for lower threshold irqs
* @low_irq_clear: clear register for lower threshold irqs
+ * @crit_viol: critical threshold violated
+ * @crit_thresh: critical threshold temperature value
+ * @crit_irq_mask: mask register for critical threshold irqs
+ * @crit_irq_clear: clear register for critical threshold irqs
*
* Structure containing data about temperature threshold settings and
* irq status if they were violated.
@@ -36,6 +40,10 @@ struct tsens_irq_data {
int low_thresh;
u32 low_irq_mask;
u32 low_irq_clear;
+ u32 crit_viol;
+ u32 crit_thresh;
+ u32 crit_irq_mask;
+ u32 crit_irq_clear;
};
char *qfprom_read(struct device *dev, const char *cname)
@@ -128,7 +136,7 @@ static inline int code_to_degc(u32 adc_code, const struct tsens_sensor *s)
* Return: Temperature in milliCelsius on success, a negative errno will
* be returned in error cases
*/
-static int tsens_hw_to_mC(struct tsens_sensor *s, int field)
+static int tsens_hw_to_mC(const struct tsens_sensor *s, int field)
{
struct tsens_priv *priv = s->priv;
u32 resolution;
@@ -160,7 +168,7 @@ static int tsens_hw_to_mC(struct tsens_sensor *s, int field)
*
* Return: ADC code or temperature in deciCelsius.
*/
-static int tsens_mC_to_hw(struct tsens_sensor *s, int temp)
+static int tsens_mC_to_hw(const struct tsens_sensor *s, int temp)
{
struct tsens_priv *priv = s->priv;
@@ -189,6 +197,9 @@ static void tsens_set_interrupt_v1(struct tsens_priv *priv, u32 hw_id,
case LOWER:
index = LOW_INT_CLEAR_0 + hw_id;
break;
+ case CRITICAL:
+ /* No critical interrupts before v2 */
+ return;
}
regmap_field_write(priv->rf[index], enable ? 0 : 1);
}
@@ -214,6 +225,10 @@ static void tsens_set_interrupt_v2(struct tsens_priv *priv, u32 hw_id,
index_mask = LOW_INT_MASK_0 + hw_id;
index_clear = LOW_INT_CLEAR_0 + hw_id;
break;
+ case CRITICAL:
+ index_mask = CRIT_INT_MASK_0 + hw_id;
+ index_clear = CRIT_INT_CLEAR_0 + hw_id;
+ break;
}
if (enable) {
@@ -268,14 +283,23 @@ static int tsens_threshold_violated(struct tsens_priv *priv, u32 hw_id,
ret = regmap_field_read(priv->rf[LOWER_STATUS_0 + hw_id], &d->low_viol);
if (ret)
return ret;
- if (d->up_viol || d->low_viol)
+
+ if (priv->feat->crit_int) {
+ ret = regmap_field_read(priv->rf[CRITICAL_STATUS_0 + hw_id],
+ &d->crit_viol);
+ if (ret)
+ return ret;
+ }
+
+ if (d->up_viol || d->low_viol || d->crit_viol)
return 1;
return 0;
}
static int tsens_read_irq_state(struct tsens_priv *priv, u32 hw_id,
- struct tsens_sensor *s, struct tsens_irq_data *d)
+ const struct tsens_sensor *s,
+ struct tsens_irq_data *d)
{
int ret;
@@ -292,22 +316,37 @@ static int tsens_read_irq_state(struct tsens_priv *priv, u32 hw_id,
ret = regmap_field_read(priv->rf[LOW_INT_MASK_0 + hw_id], &d->low_irq_mask);
if (ret)
return ret;
+ ret = regmap_field_read(priv->rf[CRIT_INT_CLEAR_0 + hw_id],
+ &d->crit_irq_clear);
+ if (ret)
+ return ret;
+ ret = regmap_field_read(priv->rf[CRIT_INT_MASK_0 + hw_id],
+ &d->crit_irq_mask);
+ if (ret)
+ return ret;
+
+ d->crit_thresh = tsens_hw_to_mC(s, CRIT_THRESH_0 + hw_id);
} else {
/* No mask register on older TSENS */
d->up_irq_mask = 0;
d->low_irq_mask = 0;
+ d->crit_irq_clear = 0;
+ d->crit_irq_mask = 0;
+ d->crit_thresh = 0;
}
d->up_thresh = tsens_hw_to_mC(s, UP_THRESH_0 + hw_id);
d->low_thresh = tsens_hw_to_mC(s, LOW_THRESH_0 + hw_id);
- dev_dbg(priv->dev, "[%u] %s%s: status(%u|%u) | clr(%u|%u) | mask(%u|%u)\n",
- hw_id, __func__, (d->up_viol || d->low_viol) ? "(V)" : "",
- d->low_viol, d->up_viol, d->low_irq_clear, d->up_irq_clear,
- d->low_irq_mask, d->up_irq_mask);
- dev_dbg(priv->dev, "[%u] %s%s: thresh: (%d:%d)\n", hw_id, __func__,
- (d->up_viol || d->low_viol) ? "(violation)" : "",
- d->low_thresh, d->up_thresh);
+ dev_dbg(priv->dev, "[%u] %s%s: status(%u|%u|%u) | clr(%u|%u|%u) | mask(%u|%u|%u)\n",
+ hw_id, __func__,
+ (d->up_viol || d->low_viol || d->crit_viol) ? "(V)" : "",
+ d->low_viol, d->up_viol, d->crit_viol,
+ d->low_irq_clear, d->up_irq_clear, d->crit_irq_clear,
+ d->low_irq_mask, d->up_irq_mask, d->crit_irq_mask);
+ dev_dbg(priv->dev, "[%u] %s%s: thresh: (%d:%d:%d)\n", hw_id, __func__,
+ (d->up_viol || d->low_viol || d->crit_viol) ? "(V)" : "",
+ d->low_thresh, d->up_thresh, d->crit_thresh);
return 0;
}
@@ -322,6 +361,76 @@ static inline u32 masked_irq(u32 hw_id, u32 mask, enum tsens_ver ver)
}
/**
+ * tsens_critical_irq_thread() - Threaded handler for critical interrupts
+ * @irq: irq number
+ * @data: tsens controller private data
+ *
+ * Check FSM watchdog bark status and clear if needed.
+ * Check all sensors to find ones that violated their critical threshold limits.
+ * Clear and then re-enable the interrupt.
+ *
+ * The level-triggered interrupt might deassert if the temperature returned to
+ * within the threshold limits by the time the handler got scheduled. We
+ * consider the irq to have been handled in that case.
+ *
+ * Return: IRQ_HANDLED
+ */
+irqreturn_t tsens_critical_irq_thread(int irq, void *data)
+{
+ struct tsens_priv *priv = data;
+ struct tsens_irq_data d;
+ int temp, ret, i;
+ u32 wdog_status, wdog_count;
+
+ if (priv->feat->has_watchdog) {
+ ret = regmap_field_read(priv->rf[WDOG_BARK_STATUS],
+ &wdog_status);
+ if (ret)
+ return ret;
+
+ if (wdog_status) {
+ /* Clear WDOG interrupt */
+ regmap_field_write(priv->rf[WDOG_BARK_CLEAR], 1);
+ regmap_field_write(priv->rf[WDOG_BARK_CLEAR], 0);
+ ret = regmap_field_read(priv->rf[WDOG_BARK_COUNT],
+ &wdog_count);
+ if (ret)
+ return ret;
+ if (wdog_count)
+ dev_dbg(priv->dev, "%s: watchdog count: %d\n",
+ __func__, wdog_count);
+
+ /* Fall through to handle critical interrupts if any */
+ }
+ }
+
+ for (i = 0; i < priv->num_sensors; i++) {
+ const struct tsens_sensor *s = &priv->sensor[i];
+ u32 hw_id = s->hw_id;
+
+ if (IS_ERR(s->tzd))
+ continue;
+ if (!tsens_threshold_violated(priv, hw_id, &d))
+ continue;
+ ret = get_temp_tsens_valid(s, &temp);
+ if (ret) {
+ dev_err(priv->dev, "[%u] %s: error reading sensor\n",
+ hw_id, __func__);
+ continue;
+ }
+
+ tsens_read_irq_state(priv, hw_id, s, &d);
+ if (d.crit_viol &&
+ !masked_irq(hw_id, d.crit_irq_mask, tsens_version(priv))) {
+ /* Mask critical interrupts, unused on Linux */
+ tsens_set_interrupt(priv, hw_id, CRITICAL, false);
+ }
+ }
+
+ return IRQ_HANDLED;
+}
+
+/**
* tsens_irq_thread - Threaded interrupt handler for uplow interrupts
* @irq: irq number
* @data: tsens controller private data
@@ -346,10 +455,10 @@ irqreturn_t tsens_irq_thread(int irq, void *data)
for (i = 0; i < priv->num_sensors; i++) {
bool trigger = false;
- struct tsens_sensor *s = &priv->sensor[i];
+ const struct tsens_sensor *s = &priv->sensor[i];
u32 hw_id = s->hw_id;
- if (IS_ERR(priv->sensor[i].tzd))
+ if (IS_ERR(s->tzd))
continue;
if (!tsens_threshold_violated(priv, hw_id, &d))
continue;
@@ -368,7 +477,7 @@ irqreturn_t tsens_irq_thread(int irq, void *data)
tsens_set_interrupt(priv, hw_id, UPPER, disable);
if (d.up_thresh > temp) {
dev_dbg(priv->dev, "[%u] %s: re-arm upper\n",
- priv->sensor[i].hw_id, __func__);
+ hw_id, __func__);
tsens_set_interrupt(priv, hw_id, UPPER, enable);
} else {
trigger = true;
@@ -379,7 +488,7 @@ irqreturn_t tsens_irq_thread(int irq, void *data)
tsens_set_interrupt(priv, hw_id, LOWER, disable);
if (d.low_thresh < temp) {
dev_dbg(priv->dev, "[%u] %s: re-arm low\n",
- priv->sensor[i].hw_id, __func__);
+ hw_id, __func__);
tsens_set_interrupt(priv, hw_id, LOWER, enable);
} else {
trigger = true;
@@ -392,7 +501,7 @@ irqreturn_t tsens_irq_thread(int irq, void *data)
if (trigger) {
dev_dbg(priv->dev, "[%u] %s: TZ update trigger (%d mC)\n",
hw_id, __func__, temp);
- thermal_zone_device_update(priv->sensor[i].tzd,
+ thermal_zone_device_update(s->tzd,
THERMAL_EVENT_UNSPECIFIED);
} else {
dev_dbg(priv->dev, "[%u] %s: no violation: %d\n",
@@ -435,7 +544,7 @@ int tsens_set_trips(void *_sensor, int low, int high)
spin_unlock_irqrestore(&priv->ul_lock, flags);
dev_dbg(dev, "[%u] %s: (%d:%d)->(%d:%d)\n",
- s->hw_id, __func__, d.low_thresh, d.up_thresh, cl_low, cl_high);
+ hw_id, __func__, d.low_thresh, d.up_thresh, cl_low, cl_high);
return 0;
}
@@ -457,7 +566,7 @@ void tsens_disable_irq(struct tsens_priv *priv)
regmap_field_write(priv->rf[INT_EN], 0);
}
-int get_temp_tsens_valid(struct tsens_sensor *s, int *temp)
+int get_temp_tsens_valid(const struct tsens_sensor *s, int *temp)
{
struct tsens_priv *priv = s->priv;
int hw_id = s->hw_id;
@@ -486,7 +595,7 @@ int get_temp_tsens_valid(struct tsens_sensor *s, int *temp)
return 0;
}
-int get_temp_common(struct tsens_sensor *s, int *temp)
+int get_temp_common(const struct tsens_sensor *s, int *temp)
{
struct tsens_priv *priv = s->priv;
int hw_id = s->hw_id;
@@ -590,6 +699,7 @@ int __init init_common(struct tsens_priv *priv)
{
void __iomem *tm_base, *srot_base;
struct device *dev = priv->dev;
+ u32 ver_minor;
struct resource *res;
u32 enabled;
int ret, i, j;
@@ -602,7 +712,7 @@ int __init init_common(struct tsens_priv *priv)
/* DT with separate SROT and TM address space */
priv->tm_offset = 0;
res = platform_get_resource(op, IORESOURCE_MEM, 1);
- srot_base = devm_ioremap_resource(&op->dev, res);
+ srot_base = devm_ioremap_resource(dev, res);
if (IS_ERR(srot_base)) {
ret = PTR_ERR(srot_base);
goto err_put_device;
@@ -620,7 +730,7 @@ int __init init_common(struct tsens_priv *priv)
}
res = platform_get_resource(op, IORESOURCE_MEM, 0);
- tm_base = devm_ioremap_resource(&op->dev, res);
+ tm_base = devm_ioremap_resource(dev, res);
if (IS_ERR(tm_base)) {
ret = PTR_ERR(tm_base);
goto err_put_device;
@@ -639,6 +749,9 @@ int __init init_common(struct tsens_priv *priv)
if (IS_ERR(priv->rf[i]))
return PTR_ERR(priv->rf[i]);
}
+ ret = regmap_field_read(priv->rf[VER_MINOR], &ver_minor);
+ if (ret)
+ goto err_put_device;
}
priv->rf[TSENS_EN] = devm_regmap_field_alloc(dev, priv->srot_map,
@@ -683,12 +796,47 @@ int __init init_common(struct tsens_priv *priv)
}
}
+ if (priv->feat->crit_int) {
+ /* Loop might need changes if enum regfield_ids is reordered */
+ for (j = CRITICAL_STATUS_0; j <= CRIT_THRESH_15; j += 16) {
+ for (i = 0; i < priv->feat->max_sensors; i++) {
+ int idx = j + i;
+
+ priv->rf[idx] =
+ devm_regmap_field_alloc(dev,
+ priv->tm_map,
+ priv->fields[idx]);
+ if (IS_ERR(priv->rf[idx])) {
+ ret = PTR_ERR(priv->rf[idx]);
+ goto err_put_device;
+ }
+ }
+ }
+ }
+
+ if (tsens_version(priv) > VER_1_X && ver_minor > 2) {
+ /* Watchdog is present only on v2.3+ */
+ priv->feat->has_watchdog = 1;
+ for (i = WDOG_BARK_STATUS; i <= CC_MON_MASK; i++) {
+ priv->rf[i] = devm_regmap_field_alloc(dev, priv->tm_map,
+ priv->fields[i]);
+ if (IS_ERR(priv->rf[i])) {
+ ret = PTR_ERR(priv->rf[i]);
+ goto err_put_device;
+ }
+ }
+ /*
+ * Watchdog is already enabled, unmask the bark.
+ * Disable cycle completion monitoring
+ */
+ regmap_field_write(priv->rf[WDOG_BARK_MASK], 0);
+ regmap_field_write(priv->rf[CC_MON_MASK], 1);
+ }
+
spin_lock_init(&priv->ul_lock);
tsens_enable_irq(priv);
tsens_debug_init(op);
- return 0;
-
err_put_device:
put_device(&op->dev);
return ret;
diff --git a/drivers/thermal/qcom/tsens-v0_1.c b/drivers/thermal/qcom/tsens-v0_1.c
index 4b8dd6de02ce..959a9371d205 100644
--- a/drivers/thermal/qcom/tsens-v0_1.c
+++ b/drivers/thermal/qcom/tsens-v0_1.c
@@ -327,7 +327,7 @@ static int calibrate_8974(struct tsens_priv *priv)
/* v0.1: 8916, 8974 */
-static const struct tsens_features tsens_v0_1_feat = {
+static struct tsens_features tsens_v0_1_feat = {
.ver_major = VER_0_1,
.crit_int = 0,
.adc = 1,
@@ -377,7 +377,7 @@ static const struct tsens_ops ops_8916 = {
.get_temp = get_temp_common,
};
-const struct tsens_plat_data data_8916 = {
+struct tsens_plat_data data_8916 = {
.num_sensors = 5,
.ops = &ops_8916,
.hw_ids = (unsigned int []){0, 1, 2, 4, 5 },
@@ -392,7 +392,7 @@ static const struct tsens_ops ops_8974 = {
.get_temp = get_temp_common,
};
-const struct tsens_plat_data data_8974 = {
+struct tsens_plat_data data_8974 = {
.num_sensors = 11,
.ops = &ops_8974,
.feat = &tsens_v0_1_feat,
diff --git a/drivers/thermal/qcom/tsens-v1.c b/drivers/thermal/qcom/tsens-v1.c
index bd2ddb684a45..b682a4df0081 100644
--- a/drivers/thermal/qcom/tsens-v1.c
+++ b/drivers/thermal/qcom/tsens-v1.c
@@ -299,7 +299,7 @@ static int calibrate_8976(struct tsens_priv *priv)
/* v1.x: msm8956,8976,qcs404,405 */
-static const struct tsens_features tsens_v1_feat = {
+static struct tsens_features tsens_v1_feat = {
.ver_major = VER_1_X,
.crit_int = 0,
.adc = 1,
@@ -368,7 +368,7 @@ static const struct tsens_ops ops_generic_v1 = {
.get_temp = get_temp_tsens_valid,
};
-const struct tsens_plat_data data_tsens_v1 = {
+struct tsens_plat_data data_tsens_v1 = {
.ops = &ops_generic_v1,
.feat = &tsens_v1_feat,
.fields = tsens_v1_regfields,
@@ -381,7 +381,7 @@ static const struct tsens_ops ops_8976 = {
};
/* Valid for both MSM8956 and MSM8976. Sensor ID 3 is unused. */
-const struct tsens_plat_data data_8976 = {
+struct tsens_plat_data data_8976 = {
.num_sensors = 11,
.ops = &ops_8976,
.hw_ids = (unsigned int[]){0, 1, 2, 4, 5, 6, 7, 8, 9, 10},
diff --git a/drivers/thermal/qcom/tsens-v2.c b/drivers/thermal/qcom/tsens-v2.c
index a4d15e1abfdd..b293ed32174b 100644
--- a/drivers/thermal/qcom/tsens-v2.c
+++ b/drivers/thermal/qcom/tsens-v2.c
@@ -24,10 +24,11 @@
#define TM_Sn_CRITICAL_THRESHOLD_OFF 0x0060
#define TM_Sn_STATUS_OFF 0x00a0
#define TM_TRDY_OFF 0x00e4
+#define TM_WDOG_LOG_OFF 0x013c
/* v2.x: 8996, 8998, sdm845 */
-static const struct tsens_features tsens_v2_feat = {
+static struct tsens_features tsens_v2_feat = {
.ver_major = VER_2_X,
.crit_int = 1,
.adc = 0,
@@ -51,8 +52,9 @@ static const struct reg_field tsens_v2_regfields[MAX_REGFIELDS] = {
[INT_EN] = REG_FIELD(TM_INT_EN_OFF, 0, 2),
/* TEMPERATURE THRESHOLDS */
- REG_FIELD_FOR_EACH_SENSOR16(LOW_THRESH, TM_Sn_UPPER_LOWER_THRESHOLD_OFF, 0, 11),
- REG_FIELD_FOR_EACH_SENSOR16(UP_THRESH, TM_Sn_UPPER_LOWER_THRESHOLD_OFF, 12, 23),
+ REG_FIELD_FOR_EACH_SENSOR16(LOW_THRESH, TM_Sn_UPPER_LOWER_THRESHOLD_OFF, 0, 11),
+ REG_FIELD_FOR_EACH_SENSOR16(UP_THRESH, TM_Sn_UPPER_LOWER_THRESHOLD_OFF, 12, 23),
+ REG_FIELD_FOR_EACH_SENSOR16(CRIT_THRESH, TM_Sn_CRITICAL_THRESHOLD_OFF, 0, 11),
/* INTERRUPTS [CLEAR/STATUS/MASK] */
REG_FIELD_SPLIT_BITS_0_15(LOW_INT_STATUS, TM_UPPER_LOWER_INT_STATUS_OFF),
@@ -61,6 +63,18 @@ static const struct reg_field tsens_v2_regfields[MAX_REGFIELDS] = {
REG_FIELD_SPLIT_BITS_16_31(UP_INT_STATUS, TM_UPPER_LOWER_INT_STATUS_OFF),
REG_FIELD_SPLIT_BITS_16_31(UP_INT_CLEAR, TM_UPPER_LOWER_INT_CLEAR_OFF),
REG_FIELD_SPLIT_BITS_16_31(UP_INT_MASK, TM_UPPER_LOWER_INT_MASK_OFF),
+ REG_FIELD_SPLIT_BITS_0_15(CRIT_INT_STATUS, TM_CRITICAL_INT_STATUS_OFF),
+ REG_FIELD_SPLIT_BITS_0_15(CRIT_INT_CLEAR, TM_CRITICAL_INT_CLEAR_OFF),
+ REG_FIELD_SPLIT_BITS_0_15(CRIT_INT_MASK, TM_CRITICAL_INT_MASK_OFF),
+
+ /* WATCHDOG on v2.3 or later */
+ [WDOG_BARK_STATUS] = REG_FIELD(TM_CRITICAL_INT_STATUS_OFF, 31, 31),
+ [WDOG_BARK_CLEAR] = REG_FIELD(TM_CRITICAL_INT_CLEAR_OFF, 31, 31),
+ [WDOG_BARK_MASK] = REG_FIELD(TM_CRITICAL_INT_MASK_OFF, 31, 31),
+ [CC_MON_STATUS] = REG_FIELD(TM_CRITICAL_INT_STATUS_OFF, 30, 30),
+ [CC_MON_CLEAR] = REG_FIELD(TM_CRITICAL_INT_CLEAR_OFF, 30, 30),
+ [CC_MON_MASK] = REG_FIELD(TM_CRITICAL_INT_MASK_OFF, 30, 30),
+ [WDOG_BARK_COUNT] = REG_FIELD(TM_WDOG_LOG_OFF, 0, 7),
/* Sn_STATUS */
REG_FIELD_FOR_EACH_SENSOR16(LAST_TEMP, TM_Sn_STATUS_OFF, 0, 11),
@@ -81,14 +95,14 @@ static const struct tsens_ops ops_generic_v2 = {
.get_temp = get_temp_tsens_valid,
};
-const struct tsens_plat_data data_tsens_v2 = {
+struct tsens_plat_data data_tsens_v2 = {
.ops = &ops_generic_v2,
.feat = &tsens_v2_feat,
.fields = tsens_v2_regfields,
};
/* Kept around for backward compatibility with old msm8996.dtsi */
-const struct tsens_plat_data data_8996 = {
+struct tsens_plat_data data_8996 = {
.num_sensors = 13,
.ops = &ops_generic_v2,
.feat = &tsens_v2_feat,
diff --git a/drivers/thermal/qcom/tsens.c b/drivers/thermal/qcom/tsens.c
index 0e7cf5236932..2f77d235cf73 100644
--- a/drivers/thermal/qcom/tsens.c
+++ b/drivers/thermal/qcom/tsens.c
@@ -85,11 +85,42 @@ static const struct thermal_zone_of_device_ops tsens_of_ops = {
.set_trips = tsens_set_trips,
};
+static int tsens_register_irq(struct tsens_priv *priv, char *irqname,
+ irq_handler_t thread_fn)
+{
+ struct platform_device *pdev;
+ int ret, irq;
+
+ pdev = of_find_device_by_node(priv->dev->of_node);
+ if (!pdev)
+ return -ENODEV;
+
+ irq = platform_get_irq_byname(pdev, irqname);
+ if (irq < 0) {
+ ret = irq;
+ /* For old DTs with no IRQ defined */
+ if (irq == -ENXIO)
+ ret = 0;
+ } else {
+ ret = devm_request_threaded_irq(&pdev->dev, irq,
+ NULL, thread_fn,
+ IRQF_ONESHOT,
+ dev_name(&pdev->dev), priv);
+ if (ret)
+ dev_err(&pdev->dev, "%s: failed to get irq\n",
+ __func__);
+ else
+ enable_irq_wake(irq);
+ }
+
+ put_device(&pdev->dev);
+ return ret;
+}
+
static int tsens_register(struct tsens_priv *priv)
{
- int i, ret, irq;
+ int i, ret;
struct thermal_zone_device *tzd;
- struct platform_device *pdev;
for (i = 0; i < priv->num_sensors; i++) {
priv->sensor[i].priv = priv;
@@ -103,32 +134,14 @@ static int tsens_register(struct tsens_priv *priv)
priv->ops->enable(priv, i);
}
- pdev = of_find_device_by_node(priv->dev->of_node);
- if (!pdev)
- return -ENODEV;
-
- irq = platform_get_irq_byname(pdev, "uplow");
- if (irq < 0) {
- ret = irq;
- /* For old DTs with no IRQ defined */
- if (irq == -ENXIO)
- ret = 0;
- goto err_put_device;
- }
-
- ret = devm_request_threaded_irq(&pdev->dev, irq,
- NULL, tsens_irq_thread,
- IRQF_TRIGGER_HIGH | IRQF_ONESHOT,
- dev_name(&pdev->dev), priv);
- if (ret) {
- dev_err(&pdev->dev, "%s: failed to get irq\n", __func__);
- goto err_put_device;
- }
+ ret = tsens_register_irq(priv, "uplow", tsens_irq_thread);
+ if (ret < 0)
+ return ret;
- enable_irq_wake(irq);
+ if (priv->feat->crit_int)
+ ret = tsens_register_irq(priv, "critical",
+ tsens_critical_irq_thread);
-err_put_device:
- put_device(&pdev->dev);
return ret;
}
diff --git a/drivers/thermal/qcom/tsens.h b/drivers/thermal/qcom/tsens.h
index e24a865fbc34..502acf0e6828 100644
--- a/drivers/thermal/qcom/tsens.h
+++ b/drivers/thermal/qcom/tsens.h
@@ -23,6 +23,7 @@
struct tsens_priv;
+/* IP version numbers in ascending order */
enum tsens_ver {
VER_0_1 = 0,
VER_1_X,
@@ -32,6 +33,7 @@ enum tsens_ver {
enum tsens_irq_type {
LOWER,
UPPER,
+ CRITICAL,
};
/**
@@ -67,7 +69,7 @@ struct tsens_ops {
/* mandatory callbacks */
int (*init)(struct tsens_priv *priv);
int (*calibrate)(struct tsens_priv *priv);
- int (*get_temp)(struct tsens_sensor *s, int *temp);
+ int (*get_temp)(const struct tsens_sensor *s, int *temp);
/* optional callbacks */
int (*enable)(struct tsens_priv *priv, int i);
void (*disable)(struct tsens_priv *priv);
@@ -374,6 +376,82 @@ enum regfield_ids {
CRITICAL_STATUS_13,
CRITICAL_STATUS_14,
CRITICAL_STATUS_15,
+ CRIT_INT_STATUS_0, /* CRITICAL interrupt status */
+ CRIT_INT_STATUS_1,
+ CRIT_INT_STATUS_2,
+ CRIT_INT_STATUS_3,
+ CRIT_INT_STATUS_4,
+ CRIT_INT_STATUS_5,
+ CRIT_INT_STATUS_6,
+ CRIT_INT_STATUS_7,
+ CRIT_INT_STATUS_8,
+ CRIT_INT_STATUS_9,
+ CRIT_INT_STATUS_10,
+ CRIT_INT_STATUS_11,
+ CRIT_INT_STATUS_12,
+ CRIT_INT_STATUS_13,
+ CRIT_INT_STATUS_14,
+ CRIT_INT_STATUS_15,
+ CRIT_INT_CLEAR_0, /* CRITICAL interrupt clear */
+ CRIT_INT_CLEAR_1,
+ CRIT_INT_CLEAR_2,
+ CRIT_INT_CLEAR_3,
+ CRIT_INT_CLEAR_4,
+ CRIT_INT_CLEAR_5,
+ CRIT_INT_CLEAR_6,
+ CRIT_INT_CLEAR_7,
+ CRIT_INT_CLEAR_8,
+ CRIT_INT_CLEAR_9,
+ CRIT_INT_CLEAR_10,
+ CRIT_INT_CLEAR_11,
+ CRIT_INT_CLEAR_12,
+ CRIT_INT_CLEAR_13,
+ CRIT_INT_CLEAR_14,
+ CRIT_INT_CLEAR_15,
+ CRIT_INT_MASK_0, /* CRITICAL interrupt mask */
+ CRIT_INT_MASK_1,
+ CRIT_INT_MASK_2,
+ CRIT_INT_MASK_3,
+ CRIT_INT_MASK_4,
+ CRIT_INT_MASK_5,
+ CRIT_INT_MASK_6,
+ CRIT_INT_MASK_7,
+ CRIT_INT_MASK_8,
+ CRIT_INT_MASK_9,
+ CRIT_INT_MASK_10,
+ CRIT_INT_MASK_11,
+ CRIT_INT_MASK_12,
+ CRIT_INT_MASK_13,
+ CRIT_INT_MASK_14,
+ CRIT_INT_MASK_15,
+ CRIT_THRESH_0, /* CRITICAL threshold values */
+ CRIT_THRESH_1,
+ CRIT_THRESH_2,
+ CRIT_THRESH_3,
+ CRIT_THRESH_4,
+ CRIT_THRESH_5,
+ CRIT_THRESH_6,
+ CRIT_THRESH_7,
+ CRIT_THRESH_8,
+ CRIT_THRESH_9,
+ CRIT_THRESH_10,
+ CRIT_THRESH_11,
+ CRIT_THRESH_12,
+ CRIT_THRESH_13,
+ CRIT_THRESH_14,
+ CRIT_THRESH_15,
+
+ /* WATCHDOG */
+ WDOG_BARK_STATUS,
+ WDOG_BARK_CLEAR,
+ WDOG_BARK_MASK,
+ WDOG_BARK_COUNT,
+
+ /* CYCLE COMPLETION MONITOR */
+ CC_MON_STATUS,
+ CC_MON_CLEAR,
+ CC_MON_MASK,
+
MIN_STATUS_0, /* MIN threshold violated */
MIN_STATUS_1,
MIN_STATUS_2,
@@ -418,6 +496,7 @@ enum regfield_ids {
* @adc: do the sensors only output adc code (instead of temperature)?
* @srot_split: does the IP neatly splits the register space into SROT and TM,
* with SROT only being available to secure boot firmware?
+ * @has_watchdog: does this IP support watchdog functionality?
* @max_sensors: maximum sensors supported by this version of the IP
*/
struct tsens_features {
@@ -425,6 +504,7 @@ struct tsens_features {
unsigned int crit_int:1;
unsigned int adc:1;
unsigned int srot_split:1;
+ unsigned int has_watchdog:1;
unsigned int max_sensors;
};
@@ -440,12 +520,14 @@ struct tsens_plat_data {
const u32 num_sensors;
const struct tsens_ops *ops;
unsigned int *hw_ids;
- const struct tsens_features *feat;
+ struct tsens_features *feat;
const struct reg_field *fields;
};
/**
* struct tsens_context - Registers to be saved/restored across a context loss
+ * @threshold: Threshold register value
+ * @control: Control register value
*/
struct tsens_context {
int threshold;
@@ -460,6 +542,8 @@ struct tsens_context {
* @srot_map: pointer to SROT register address space
* @tm_offset: deal with old device trees that don't address TM and SROT
* address space separately
+ * @ul_lock: lock while processing upper/lower threshold interrupts
+ * @crit_lock: lock while processing critical threshold interrupts
* @rf: array of regmap_fields used to store value of the field
* @ctx: registers to be saved and restored during suspend/resume
* @feat: features of the IP
@@ -481,36 +565,37 @@ struct tsens_priv {
struct regmap_field *rf[MAX_REGFIELDS];
struct tsens_context ctx;
- const struct tsens_features *feat;
+ struct tsens_features *feat;
const struct reg_field *fields;
const struct tsens_ops *ops;
struct dentry *debug_root;
struct dentry *debug;
- struct tsens_sensor sensor[0];
+ struct tsens_sensor sensor[];
};
char *qfprom_read(struct device *dev, const char *cname);
void compute_intercept_slope(struct tsens_priv *priv, u32 *pt1, u32 *pt2, u32 mode);
int init_common(struct tsens_priv *priv);
-int get_temp_tsens_valid(struct tsens_sensor *s, int *temp);
-int get_temp_common(struct tsens_sensor *s, int *temp);
+int get_temp_tsens_valid(const struct tsens_sensor *s, int *temp);
+int get_temp_common(const struct tsens_sensor *s, int *temp);
int tsens_enable_irq(struct tsens_priv *priv);
void tsens_disable_irq(struct tsens_priv *priv);
int tsens_set_trips(void *_sensor, int low, int high);
irqreturn_t tsens_irq_thread(int irq, void *data);
+irqreturn_t tsens_critical_irq_thread(int irq, void *data);
/* TSENS target */
-extern const struct tsens_plat_data data_8960;
+extern struct tsens_plat_data data_8960;
/* TSENS v0.1 targets */
-extern const struct tsens_plat_data data_8916, data_8974;
+extern struct tsens_plat_data data_8916, data_8974;
/* TSENS v1 targets */
-extern const struct tsens_plat_data data_tsens_v1, data_8976;
+extern struct tsens_plat_data data_tsens_v1, data_8976;
/* TSENS v2 targets */
-extern const struct tsens_plat_data data_8996, data_tsens_v2;
+extern struct tsens_plat_data data_8996, data_tsens_v2;
#endif /* __QCOM_TSENS_H__ */
diff --git a/drivers/thermal/qoriq_thermal.c b/drivers/thermal/qoriq_thermal.c
index 874bc46e6c73..028a6bbf75dc 100644
--- a/drivers/thermal/qoriq_thermal.c
+++ b/drivers/thermal/qoriq_thermal.c
@@ -3,12 +3,11 @@
// Copyright 2016 Freescale Semiconductor, Inc.
#include <linux/clk.h>
-#include <linux/module.h>
-#include <linux/platform_device.h>
#include <linux/err.h>
#include <linux/io.h>
+#include <linux/module.h>
#include <linux/of.h>
-#include <linux/of_address.h>
+#include <linux/platform_device.h>
#include <linux/regmap.h>
#include <linux/sizes.h>
#include <linux/thermal.h>
@@ -228,6 +227,14 @@ static const struct regmap_access_table qoriq_rd_table = {
.n_yes_ranges = ARRAY_SIZE(qoriq_yes_ranges),
};
+static void qoriq_tmu_action(void *p)
+{
+ struct qoriq_tmu_data *data = p;
+
+ regmap_write(data->regmap, REGS_TMR, TMR_DISABLE);
+ clk_disable_unprepare(data->clk);
+}
+
static int qoriq_tmu_probe(struct platform_device *pdev)
{
int ret;
@@ -278,6 +285,10 @@ static int qoriq_tmu_probe(struct platform_device *pdev)
return ret;
}
+ ret = devm_add_action_or_reset(dev, qoriq_tmu_action, data);
+ if (ret)
+ return ret;
+
/* version register offset at: 0xbf8 on both v1 and v2 */
ret = regmap_read(data->regmap, REGS_IPBRR(0), &ver);
if (ret) {
@@ -290,35 +301,17 @@ static int qoriq_tmu_probe(struct platform_device *pdev)
ret = qoriq_tmu_calibration(dev, data); /* TMU calibration */
if (ret < 0)
- goto err;
+ return ret;
ret = qoriq_tmu_register_tmu_zone(dev, data);
if (ret < 0) {
dev_err(dev, "Failed to register sensors\n");
- ret = -ENODEV;
- goto err;
+ return ret;
}
platform_set_drvdata(pdev, data);
return 0;
-
-err:
- clk_disable_unprepare(data->clk);
-
- return ret;
-}
-
-static int qoriq_tmu_remove(struct platform_device *pdev)
-{
- struct qoriq_tmu_data *data = platform_get_drvdata(pdev);
-
- /* Disable monitoring */
- regmap_write(data->regmap, REGS_TMR, TMR_DISABLE);
-
- clk_disable_unprepare(data->clk);
-
- return 0;
}
static int __maybe_unused qoriq_tmu_suspend(struct device *dev)
@@ -365,7 +358,6 @@ static struct platform_driver qoriq_tmu = {
.of_match_table = qoriq_tmu_match,
},
.probe = qoriq_tmu_probe,
- .remove = qoriq_tmu_remove,
};
module_platform_driver(qoriq_tmu);
diff --git a/drivers/thermal/rcar_gen3_thermal.c b/drivers/thermal/rcar_gen3_thermal.c
index 72877bdc072d..58fe7c1ef00b 100644
--- a/drivers/thermal/rcar_gen3_thermal.c
+++ b/drivers/thermal/rcar_gen3_thermal.c
@@ -81,8 +81,6 @@ struct rcar_gen3_thermal_tsc {
void __iomem *base;
struct thermal_zone_device *zone;
struct equation_coefs coef;
- int low;
- int high;
int tj_t;
int id; /* thermal channel id */
};
@@ -204,12 +202,14 @@ static int rcar_gen3_thermal_mcelsius_to_temp(struct rcar_gen3_thermal_tsc *tsc,
return INT_FIXPT(val);
}
-static int rcar_gen3_thermal_set_trips(void *devdata, int low, int high)
+static int rcar_gen3_thermal_update_range(struct rcar_gen3_thermal_tsc *tsc)
{
- struct rcar_gen3_thermal_tsc *tsc = devdata;
+ int temperature, low, high;
+
+ rcar_gen3_thermal_get_temp(tsc, &temperature);
- low = clamp_val(low, -40000, 120000);
- high = clamp_val(high, -40000, 120000);
+ low = temperature - MCELSIUS(1);
+ high = temperature + MCELSIUS(1);
rcar_gen3_thermal_write(tsc, REG_GEN3_IRQTEMP1,
rcar_gen3_thermal_mcelsius_to_temp(tsc, low));
@@ -217,15 +217,11 @@ static int rcar_gen3_thermal_set_trips(void *devdata, int low, int high)
rcar_gen3_thermal_write(tsc, REG_GEN3_IRQTEMP2,
rcar_gen3_thermal_mcelsius_to_temp(tsc, high));
- tsc->low = low;
- tsc->high = high;
-
return 0;
}
static const struct thermal_zone_of_device_ops rcar_gen3_tz_of_ops = {
.get_temp = rcar_gen3_thermal_get_temp,
- .set_trips = rcar_gen3_thermal_set_trips,
};
static void rcar_thermal_irq_set(struct rcar_gen3_thermal_priv *priv, bool on)
@@ -246,9 +242,11 @@ static irqreturn_t rcar_gen3_thermal_irq(int irq, void *data)
for (i = 0; i < priv->num_tscs; i++) {
status = rcar_gen3_thermal_read(priv->tscs[i], REG_GEN3_IRQSTR);
rcar_gen3_thermal_write(priv->tscs[i], REG_GEN3_IRQSTR, 0);
- if (status)
+ if (status) {
+ rcar_gen3_thermal_update_range(priv->tscs[i]);
thermal_zone_device_update(priv->tscs[i]->zone,
THERMAL_EVENT_UNSPECIFIED);
+ }
}
return IRQ_HANDLED;
@@ -325,6 +323,10 @@ static const struct of_device_id rcar_gen3_thermal_dt_ids[] = {
.data = &rcar_gen3_ths_tj_1_m3_w,
},
{
+ .compatible = "renesas,r8a77961-thermal",
+ .data = &rcar_gen3_ths_tj_1_m3_w,
+ },
+ {
.compatible = "renesas,r8a77965-thermal",
.data = &rcar_gen3_ths_tj_1,
},
@@ -446,14 +448,15 @@ static int rcar_gen3_thermal_probe(struct platform_device *pdev)
goto error_unregister;
ret = devm_add_action_or_reset(dev, rcar_gen3_hwmon_action, zone);
- if (ret) {
+ if (ret)
goto error_unregister;
- }
ret = of_thermal_get_ntrips(tsc->zone);
if (ret < 0)
goto error_unregister;
+ rcar_gen3_thermal_update_range(tsc);
+
dev_info(dev, "TSC%d: Loaded %d trip points\n", i, ret);
}
@@ -492,7 +495,7 @@ static int __maybe_unused rcar_gen3_thermal_resume(struct device *dev)
struct rcar_gen3_thermal_tsc *tsc = priv->tscs[i];
priv->thermal_init(tsc);
- rcar_gen3_thermal_set_trips(tsc, tsc->low, tsc->high);
+ rcar_gen3_thermal_update_range(tsc);
}
rcar_thermal_irq_set(priv, true);
diff --git a/drivers/thermal/rcar_thermal.c b/drivers/thermal/rcar_thermal.c
index 8f1aafa2044e..e0c1f2409035 100644
--- a/drivers/thermal/rcar_thermal.c
+++ b/drivers/thermal/rcar_thermal.c
@@ -95,7 +95,6 @@ struct rcar_thermal_priv {
struct mutex lock;
struct list_head list;
int id;
- u32 ctemp;
};
#define rcar_thermal_for_each_priv(pos, common) \
@@ -201,7 +200,6 @@ static int rcar_thermal_update_temp(struct rcar_thermal_priv *priv)
struct device *dev = rcar_priv_to_dev(priv);
int i;
u32 ctemp, old, new;
- int ret = -EINVAL;
mutex_lock(&priv->lock);
@@ -247,37 +245,29 @@ static int rcar_thermal_update_temp(struct rcar_thermal_priv *priv)
((ctemp - 1) << 0)));
}
- dev_dbg(dev, "thermal%d %d -> %d\n", priv->id, priv->ctemp, ctemp);
-
- priv->ctemp = ctemp;
- ret = 0;
err_out_unlock:
mutex_unlock(&priv->lock);
- return ret;
+
+ return ctemp ? ctemp : -EINVAL;
}
static int rcar_thermal_get_current_temp(struct rcar_thermal_priv *priv,
int *temp)
{
- int tmp;
- int ret;
-
- ret = rcar_thermal_update_temp(priv);
- if (ret < 0)
- return ret;
+ int ctemp;
- mutex_lock(&priv->lock);
- if (priv->chip->ctemp_bands == 1)
- tmp = MCELSIUS((priv->ctemp * 5) - 65);
- else if (priv->ctemp < 24)
- tmp = MCELSIUS(((priv->ctemp * 55) - 720) / 10);
- else
- tmp = MCELSIUS((priv->ctemp * 5) - 60);
- mutex_unlock(&priv->lock);
+ ctemp = rcar_thermal_update_temp(priv);
+ if (ctemp < 0)
+ return ctemp;
/* Guaranteed operating range is -45C to 125C. */
- *temp = tmp;
+ if (priv->chip->ctemp_bands == 1)
+ *temp = MCELSIUS((ctemp * 5) - 65);
+ else if (ctemp < 24)
+ *temp = MCELSIUS(((ctemp * 55) - 720) / 10);
+ else
+ *temp = MCELSIUS((ctemp * 5) - 60);
return 0;
}
@@ -387,28 +377,17 @@ static void _rcar_thermal_irq_ctrl(struct rcar_thermal_priv *priv, int enable)
static void rcar_thermal_work(struct work_struct *work)
{
struct rcar_thermal_priv *priv;
- int cctemp, nctemp;
int ret;
priv = container_of(work, struct rcar_thermal_priv, work.work);
- ret = rcar_thermal_get_current_temp(priv, &cctemp);
- if (ret < 0)
- return;
-
ret = rcar_thermal_update_temp(priv);
if (ret < 0)
return;
rcar_thermal_irq_enable(priv);
- ret = rcar_thermal_get_current_temp(priv, &nctemp);
- if (ret < 0)
- return;
-
- if (nctemp != cctemp)
- thermal_zone_device_update(priv->zone,
- THERMAL_EVENT_UNSPECIFIED);
+ thermal_zone_device_update(priv->zone, THERMAL_EVENT_UNSPECIFIED);
}
static u32 rcar_thermal_had_changed(struct rcar_thermal_priv *priv, u32 status)
@@ -521,8 +500,10 @@ static int rcar_thermal_probe(struct platform_device *pdev)
res = platform_get_resource(pdev, IORESOURCE_MEM,
mres++);
common->base = devm_ioremap_resource(dev, res);
- if (IS_ERR(common->base))
- return PTR_ERR(common->base);
+ if (IS_ERR(common->base)) {
+ ret = PTR_ERR(common->base);
+ goto error_unregister;
+ }
idle = 0; /* polling delay is not needed */
}
diff --git a/drivers/thermal/samsung/exynos_tmu.c b/drivers/thermal/samsung/exynos_tmu.c
index fd4a17812f33..e9a90bc23b11 100644
--- a/drivers/thermal/samsung/exynos_tmu.c
+++ b/drivers/thermal/samsung/exynos_tmu.c
@@ -1094,7 +1094,9 @@ static int exynos_tmu_probe(struct platform_device *pdev)
&exynos_sensor_ops);
if (IS_ERR(data->tzd)) {
ret = PTR_ERR(data->tzd);
- dev_err(&pdev->dev, "Failed to register sensor: %d\n", ret);
+ if (ret != -EPROBE_DEFER)
+ dev_err(&pdev->dev, "Failed to register sensor: %d\n",
+ ret);
goto err_sclk;
}
diff --git a/drivers/thermal/sprd_thermal.c b/drivers/thermal/sprd_thermal.c
new file mode 100644
index 000000000000..a340374e8c51
--- /dev/null
+++ b/drivers/thermal/sprd_thermal.c
@@ -0,0 +1,552 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2020 Spreadtrum Communications Inc.
+
+#include <linux/clk.h>
+#include <linux/io.h>
+#include <linux/iopoll.h>
+#include <linux/module.h>
+#include <linux/nvmem-consumer.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/thermal.h>
+
+#define SPRD_THM_CTL 0x0
+#define SPRD_THM_INT_EN 0x4
+#define SPRD_THM_INT_STS 0x8
+#define SPRD_THM_INT_RAW_STS 0xc
+#define SPRD_THM_DET_PERIOD 0x10
+#define SPRD_THM_INT_CLR 0x14
+#define SPRD_THM_INT_CLR_ST 0x18
+#define SPRD_THM_MON_PERIOD 0x4c
+#define SPRD_THM_MON_CTL 0x50
+#define SPRD_THM_INTERNAL_STS1 0x54
+#define SPRD_THM_RAW_READ_MSK 0x3ff
+
+#define SPRD_THM_OFFSET(id) ((id) * 0x4)
+#define SPRD_THM_TEMP(id) (SPRD_THM_OFFSET(id) + 0x5c)
+#define SPRD_THM_THRES(id) (SPRD_THM_OFFSET(id) + 0x2c)
+
+#define SPRD_THM_SEN(id) BIT((id) + 2)
+#define SPRD_THM_SEN_OVERHEAT_EN(id) BIT((id) + 8)
+#define SPRD_THM_SEN_OVERHEAT_ALARM_EN(id) BIT((id) + 0)
+
+/* bits definitions for register THM_CTL */
+#define SPRD_THM_SET_RDY_ST BIT(13)
+#define SPRD_THM_SET_RDY BIT(12)
+#define SPRD_THM_MON_EN BIT(1)
+#define SPRD_THM_EN BIT(0)
+
+/* bits definitions for register THM_INT_CTL */
+#define SPRD_THM_BIT_INT_EN BIT(26)
+#define SPRD_THM_OVERHEAT_EN BIT(25)
+#define SPRD_THM_OTP_TRIP_SHIFT 10
+
+/* bits definitions for register SPRD_THM_INTERNAL_STS1 */
+#define SPRD_THM_TEMPER_RDY BIT(0)
+
+#define SPRD_THM_DET_PERIOD_DATA 0x800
+#define SPRD_THM_DET_PERIOD_MASK GENMASK(19, 0)
+#define SPRD_THM_MON_MODE 0x7
+#define SPRD_THM_MON_MODE_MASK GENMASK(3, 0)
+#define SPRD_THM_MON_PERIOD_DATA 0x10
+#define SPRD_THM_MON_PERIOD_MASK GENMASK(15, 0)
+#define SPRD_THM_THRES_MASK GENMASK(19, 0)
+#define SPRD_THM_INT_CLR_MASK GENMASK(24, 0)
+
+/* thermal sensor calibration parameters */
+#define SPRD_THM_TEMP_LOW -40000
+#define SPRD_THM_TEMP_HIGH 120000
+#define SPRD_THM_OTP_TEMP 120000
+#define SPRD_THM_HOT_TEMP 75000
+#define SPRD_THM_RAW_DATA_LOW 0
+#define SPRD_THM_RAW_DATA_HIGH 1000
+#define SPRD_THM_SEN_NUM 8
+#define SPRD_THM_DT_OFFSET 24
+#define SPRD_THM_RATION_OFFSET 17
+#define SPRD_THM_RATION_SIGN 16
+
+#define SPRD_THM_RDYST_POLLING_TIME 10
+#define SPRD_THM_RDYST_TIMEOUT 700
+#define SPRD_THM_TEMP_READY_POLL_TIME 10000
+#define SPRD_THM_TEMP_READY_TIMEOUT 600000
+#define SPRD_THM_MAX_SENSOR 8
+
+struct sprd_thermal_sensor {
+ struct thermal_zone_device *tzd;
+ struct sprd_thermal_data *data;
+ struct device *dev;
+ int cal_slope;
+ int cal_offset;
+ int id;
+};
+
+struct sprd_thermal_data {
+ const struct sprd_thm_variant_data *var_data;
+ struct sprd_thermal_sensor *sensor[SPRD_THM_MAX_SENSOR];
+ struct clk *clk;
+ void __iomem *base;
+ u32 ratio_off;
+ int ratio_sign;
+ int nr_sensors;
+};
+
+/*
+ * The conversion between ADC and temperature is based on linear relationship,
+ * and use idea_k to specify the slope and ideal_b to specify the offset.
+ *
+ * Since different Spreadtrum SoCs have different ideal_k and ideal_b,
+ * we should save ideal_k and ideal_b in the device data structure.
+ */
+struct sprd_thm_variant_data {
+ u32 ideal_k;
+ u32 ideal_b;
+};
+
+static const struct sprd_thm_variant_data ums512_data = {
+ .ideal_k = 262,
+ .ideal_b = 66400,
+};
+
+static inline void sprd_thm_update_bits(void __iomem *reg, u32 mask, u32 val)
+{
+ u32 tmp, orig;
+
+ orig = readl(reg);
+ tmp = orig & ~mask;
+ tmp |= val & mask;
+ writel(tmp, reg);
+}
+
+static int sprd_thm_cal_read(struct device_node *np, const char *cell_id,
+ u32 *val)
+{
+ struct nvmem_cell *cell;
+ void *buf;
+ size_t len;
+
+ cell = of_nvmem_cell_get(np, cell_id);
+ if (IS_ERR(cell))
+ return PTR_ERR(cell);
+
+ buf = nvmem_cell_read(cell, &len);
+ nvmem_cell_put(cell);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
+
+ if (len > sizeof(u32)) {
+ kfree(buf);
+ return -EINVAL;
+ }
+
+ memcpy(val, buf, len);
+
+ kfree(buf);
+ return 0;
+}
+
+static int sprd_thm_sensor_calibration(struct device_node *np,
+ struct sprd_thermal_data *thm,
+ struct sprd_thermal_sensor *sen)
+{
+ int ret;
+ /*
+ * According to thermal datasheet, the default calibration offset is 64,
+ * and the default ratio is 1000.
+ */
+ int dt_offset = 64, ratio = 1000;
+
+ ret = sprd_thm_cal_read(np, "sen_delta_cal", &dt_offset);
+ if (ret)
+ return ret;
+
+ ratio += thm->ratio_sign * thm->ratio_off;
+
+ /*
+ * According to the ideal slope K and ideal offset B, combined with
+ * calibration value of thermal from efuse, then calibrate the real
+ * slope k and offset b:
+ * k_cal = (k * ratio) / 1000.
+ * b_cal = b + (dt_offset - 64) * 500.
+ */
+ sen->cal_slope = (thm->var_data->ideal_k * ratio) / 1000;
+ sen->cal_offset = thm->var_data->ideal_b + (dt_offset - 128) * 250;
+
+ return 0;
+}
+
+static int sprd_thm_rawdata_to_temp(struct sprd_thermal_sensor *sen,
+ u32 rawdata)
+{
+ clamp(rawdata, (u32)SPRD_THM_RAW_DATA_LOW, (u32)SPRD_THM_RAW_DATA_HIGH);
+
+ /*
+ * According to the thermal datasheet, the formula of converting
+ * adc value to the temperature value should be:
+ * T_final = k_cal * x - b_cal.
+ */
+ return sen->cal_slope * rawdata - sen->cal_offset;
+}
+
+static int sprd_thm_temp_to_rawdata(int temp, struct sprd_thermal_sensor *sen)
+{
+ u32 val;
+
+ clamp(temp, (int)SPRD_THM_TEMP_LOW, (int)SPRD_THM_TEMP_HIGH);
+
+ /*
+ * According to the thermal datasheet, the formula of converting
+ * adc value to the temperature value should be:
+ * T_final = k_cal * x - b_cal.
+ */
+ val = (temp + sen->cal_offset) / sen->cal_slope;
+
+ return clamp(val, val, (u32)(SPRD_THM_RAW_DATA_HIGH - 1));
+}
+
+static int sprd_thm_read_temp(void *devdata, int *temp)
+{
+ struct sprd_thermal_sensor *sen = devdata;
+ u32 data;
+
+ data = readl(sen->data->base + SPRD_THM_TEMP(sen->id)) &
+ SPRD_THM_RAW_READ_MSK;
+
+ *temp = sprd_thm_rawdata_to_temp(sen, data);
+
+ return 0;
+}
+
+static const struct thermal_zone_of_device_ops sprd_thm_ops = {
+ .get_temp = sprd_thm_read_temp,
+};
+
+static int sprd_thm_poll_ready_status(struct sprd_thermal_data *thm)
+{
+ u32 val;
+ int ret;
+
+ /*
+ * Wait for thermal ready status before configuring thermal parameters.
+ */
+ ret = readl_poll_timeout(thm->base + SPRD_THM_CTL, val,
+ !(val & SPRD_THM_SET_RDY_ST),
+ SPRD_THM_RDYST_POLLING_TIME,
+ SPRD_THM_RDYST_TIMEOUT);
+ if (ret)
+ return ret;
+
+ sprd_thm_update_bits(thm->base + SPRD_THM_CTL, SPRD_THM_MON_EN,
+ SPRD_THM_MON_EN);
+ sprd_thm_update_bits(thm->base + SPRD_THM_CTL, SPRD_THM_SET_RDY,
+ SPRD_THM_SET_RDY);
+ return 0;
+}
+
+static int sprd_thm_wait_temp_ready(struct sprd_thermal_data *thm)
+{
+ u32 val;
+
+ /* Wait for first temperature data ready before reading temperature */
+ return readl_poll_timeout(thm->base + SPRD_THM_INTERNAL_STS1, val,
+ !(val & SPRD_THM_TEMPER_RDY),
+ SPRD_THM_TEMP_READY_POLL_TIME,
+ SPRD_THM_TEMP_READY_TIMEOUT);
+}
+
+static int sprd_thm_set_ready(struct sprd_thermal_data *thm)
+{
+ int ret;
+
+ ret = sprd_thm_poll_ready_status(thm);
+ if (ret)
+ return ret;
+
+ /*
+ * Clear interrupt status, enable thermal interrupt and enable thermal.
+ *
+ * The SPRD thermal controller integrates a hardware interrupt signal,
+ * which means if the temperature is overheat, it will generate an
+ * interrupt and notify the event to PMIC automatically to shutdown the
+ * system. So here we should enable the interrupt bits, though we have
+ * not registered an irq handler.
+ */
+ writel(SPRD_THM_INT_CLR_MASK, thm->base + SPRD_THM_INT_CLR);
+ sprd_thm_update_bits(thm->base + SPRD_THM_INT_EN,
+ SPRD_THM_BIT_INT_EN, SPRD_THM_BIT_INT_EN);
+ sprd_thm_update_bits(thm->base + SPRD_THM_CTL,
+ SPRD_THM_EN, SPRD_THM_EN);
+ return 0;
+}
+
+static void sprd_thm_sensor_init(struct sprd_thermal_data *thm,
+ struct sprd_thermal_sensor *sen)
+{
+ u32 otp_rawdata, hot_rawdata;
+
+ otp_rawdata = sprd_thm_temp_to_rawdata(SPRD_THM_OTP_TEMP, sen);
+ hot_rawdata = sprd_thm_temp_to_rawdata(SPRD_THM_HOT_TEMP, sen);
+
+ /* Enable the sensor' overheat temperature protection interrupt */
+ sprd_thm_update_bits(thm->base + SPRD_THM_INT_EN,
+ SPRD_THM_SEN_OVERHEAT_ALARM_EN(sen->id),
+ SPRD_THM_SEN_OVERHEAT_ALARM_EN(sen->id));
+
+ /* Set the sensor' overheat and hot threshold temperature */
+ sprd_thm_update_bits(thm->base + SPRD_THM_THRES(sen->id),
+ SPRD_THM_THRES_MASK,
+ (otp_rawdata << SPRD_THM_OTP_TRIP_SHIFT) |
+ hot_rawdata);
+
+ /* Enable the corresponding sensor */
+ sprd_thm_update_bits(thm->base + SPRD_THM_CTL, SPRD_THM_SEN(sen->id),
+ SPRD_THM_SEN(sen->id));
+}
+
+static void sprd_thm_para_config(struct sprd_thermal_data *thm)
+{
+ /* Set the period of two valid temperature detection action */
+ sprd_thm_update_bits(thm->base + SPRD_THM_DET_PERIOD,
+ SPRD_THM_DET_PERIOD_MASK, SPRD_THM_DET_PERIOD);
+
+ /* Set the sensors' monitor mode */
+ sprd_thm_update_bits(thm->base + SPRD_THM_MON_CTL,
+ SPRD_THM_MON_MODE_MASK, SPRD_THM_MON_MODE);
+
+ /* Set the sensors' monitor period */
+ sprd_thm_update_bits(thm->base + SPRD_THM_MON_PERIOD,
+ SPRD_THM_MON_PERIOD_MASK, SPRD_THM_MON_PERIOD);
+}
+
+static void sprd_thm_toggle_sensor(struct sprd_thermal_sensor *sen, bool on)
+{
+ struct thermal_zone_device *tzd = sen->tzd;
+
+ tzd->ops->set_mode(tzd,
+ on ? THERMAL_DEVICE_ENABLED : THERMAL_DEVICE_DISABLED);
+}
+
+static int sprd_thm_probe(struct platform_device *pdev)
+{
+ struct device_node *np = pdev->dev.of_node;
+ struct device_node *sen_child;
+ struct sprd_thermal_data *thm;
+ struct sprd_thermal_sensor *sen;
+ const struct sprd_thm_variant_data *pdata;
+ int ret, i;
+ u32 val;
+
+ pdata = of_device_get_match_data(&pdev->dev);
+ if (!pdata) {
+ dev_err(&pdev->dev, "No matching driver data found\n");
+ return -EINVAL;
+ }
+
+ thm = devm_kzalloc(&pdev->dev, sizeof(*thm), GFP_KERNEL);
+ if (!thm)
+ return -ENOMEM;
+
+ thm->var_data = pdata;
+ thm->base = devm_platform_ioremap_resource(pdev, 0);
+ if (!thm->base)
+ return -ENOMEM;
+
+ thm->nr_sensors = of_get_child_count(np);
+ if (thm->nr_sensors == 0 || thm->nr_sensors > SPRD_THM_MAX_SENSOR) {
+ dev_err(&pdev->dev, "incorrect sensor count\n");
+ return -EINVAL;
+ }
+
+ thm->clk = devm_clk_get(&pdev->dev, "enable");
+ if (IS_ERR(thm->clk)) {
+ dev_err(&pdev->dev, "failed to get enable clock\n");
+ return PTR_ERR(thm->clk);
+ }
+
+ ret = clk_prepare_enable(thm->clk);
+ if (ret)
+ return ret;
+
+ sprd_thm_para_config(thm);
+
+ ret = sprd_thm_cal_read(np, "thm_sign_cal", &val);
+ if (ret)
+ goto disable_clk;
+
+ if (val > 0)
+ thm->ratio_sign = -1;
+ else
+ thm->ratio_sign = 1;
+
+ ret = sprd_thm_cal_read(np, "thm_ratio_cal", &thm->ratio_off);
+ if (ret)
+ goto disable_clk;
+
+ for_each_child_of_node(np, sen_child) {
+ sen = devm_kzalloc(&pdev->dev, sizeof(*sen), GFP_KERNEL);
+ if (!sen) {
+ ret = -ENOMEM;
+ goto disable_clk;
+ }
+
+ sen->data = thm;
+ sen->dev = &pdev->dev;
+
+ ret = of_property_read_u32(sen_child, "reg", &sen->id);
+ if (ret) {
+ dev_err(&pdev->dev, "get sensor reg failed");
+ goto disable_clk;
+ }
+
+ ret = sprd_thm_sensor_calibration(sen_child, thm, sen);
+ if (ret) {
+ dev_err(&pdev->dev, "efuse cal analysis failed");
+ goto disable_clk;
+ }
+
+ sprd_thm_sensor_init(thm, sen);
+
+ sen->tzd = devm_thermal_zone_of_sensor_register(sen->dev,
+ sen->id,
+ sen,
+ &sprd_thm_ops);
+ if (IS_ERR(sen->tzd)) {
+ dev_err(&pdev->dev, "register thermal zone failed %d\n",
+ sen->id);
+ ret = PTR_ERR(sen->tzd);
+ goto disable_clk;
+ }
+
+ thm->sensor[sen->id] = sen;
+ }
+
+ ret = sprd_thm_set_ready(thm);
+ if (ret)
+ goto disable_clk;
+
+ ret = sprd_thm_wait_temp_ready(thm);
+ if (ret)
+ goto disable_clk;
+
+ for (i = 0; i < thm->nr_sensors; i++)
+ sprd_thm_toggle_sensor(thm->sensor[i], true);
+
+ platform_set_drvdata(pdev, thm);
+ return 0;
+
+disable_clk:
+ clk_disable_unprepare(thm->clk);
+ return ret;
+}
+
+#ifdef CONFIG_PM_SLEEP
+static void sprd_thm_hw_suspend(struct sprd_thermal_data *thm)
+{
+ int i;
+
+ for (i = 0; i < thm->nr_sensors; i++) {
+ sprd_thm_update_bits(thm->base + SPRD_THM_CTL,
+ SPRD_THM_SEN(thm->sensor[i]->id), 0);
+ }
+
+ sprd_thm_update_bits(thm->base + SPRD_THM_CTL,
+ SPRD_THM_EN, 0x0);
+}
+
+static int sprd_thm_suspend(struct device *dev)
+{
+ struct sprd_thermal_data *thm = dev_get_drvdata(dev);
+ int i;
+
+ for (i = 0; i < thm->nr_sensors; i++)
+ sprd_thm_toggle_sensor(thm->sensor[i], false);
+
+ sprd_thm_hw_suspend(thm);
+ clk_disable_unprepare(thm->clk);
+
+ return 0;
+}
+
+static int sprd_thm_hw_resume(struct sprd_thermal_data *thm)
+{
+ int ret, i;
+
+ for (i = 0; i < thm->nr_sensors; i++) {
+ sprd_thm_update_bits(thm->base + SPRD_THM_CTL,
+ SPRD_THM_SEN(thm->sensor[i]->id),
+ SPRD_THM_SEN(thm->sensor[i]->id));
+ }
+
+ ret = sprd_thm_poll_ready_status(thm);
+ if (ret)
+ return ret;
+
+ writel(SPRD_THM_INT_CLR_MASK, thm->base + SPRD_THM_INT_CLR);
+ sprd_thm_update_bits(thm->base + SPRD_THM_CTL,
+ SPRD_THM_EN, SPRD_THM_EN);
+ return sprd_thm_wait_temp_ready(thm);
+}
+
+static int sprd_thm_resume(struct device *dev)
+{
+ struct sprd_thermal_data *thm = dev_get_drvdata(dev);
+ int ret, i;
+
+ ret = clk_prepare_enable(thm->clk);
+ if (ret)
+ return ret;
+
+ ret = sprd_thm_hw_resume(thm);
+ if (ret)
+ goto disable_clk;
+
+ for (i = 0; i < thm->nr_sensors; i++)
+ sprd_thm_toggle_sensor(thm->sensor[i], true);
+
+ return 0;
+
+disable_clk:
+ clk_disable_unprepare(thm->clk);
+ return ret;
+}
+#endif
+
+static int sprd_thm_remove(struct platform_device *pdev)
+{
+ struct sprd_thermal_data *thm = platform_get_drvdata(pdev);
+ int i;
+
+ for (i = 0; i < thm->nr_sensors; i++) {
+ sprd_thm_toggle_sensor(thm->sensor[i], false);
+ devm_thermal_zone_of_sensor_unregister(&pdev->dev,
+ thm->sensor[i]->tzd);
+ }
+
+ clk_disable_unprepare(thm->clk);
+ return 0;
+}
+
+static const struct of_device_id sprd_thermal_of_match[] = {
+ { .compatible = "sprd,ums512-thermal", .data = &ums512_data },
+ { },
+};
+
+static const struct dev_pm_ops sprd_thermal_pm_ops = {
+ SET_SYSTEM_SLEEP_PM_OPS(sprd_thm_suspend, sprd_thm_resume)
+};
+
+static struct platform_driver sprd_thermal_driver = {
+ .probe = sprd_thm_probe,
+ .remove = sprd_thm_remove,
+ .driver = {
+ .name = "sprd-thermal",
+ .pm = &sprd_thermal_pm_ops,
+ .of_match_table = sprd_thermal_of_match,
+ },
+};
+
+module_platform_driver(sprd_thermal_driver);
+
+MODULE_AUTHOR("Freeman Liu <freeman.liu@unisoc.com>");
+MODULE_DESCRIPTION("Spreadtrum thermal driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/thermal/st/stm_thermal.c b/drivers/thermal/st/stm_thermal.c
index ad9e3bf8fdf6..9314e3df6a42 100644
--- a/drivers/thermal/st/stm_thermal.c
+++ b/drivers/thermal/st/stm_thermal.c
@@ -478,7 +478,8 @@ static int stm_thermal_resume(struct device *dev)
}
#endif /* CONFIG_PM_SLEEP */
-SIMPLE_DEV_PM_OPS(stm_thermal_pm_ops, stm_thermal_suspend, stm_thermal_resume);
+static SIMPLE_DEV_PM_OPS(stm_thermal_pm_ops,
+ stm_thermal_suspend, stm_thermal_resume);
static const struct thermal_zone_of_device_ops stm_tz_ops = {
.get_temp = stm_thermal_get_temp,
diff --git a/drivers/thermal/ti-soc-thermal/ti-bandgap.c b/drivers/thermal/ti-soc-thermal/ti-bandgap.c
index 2fa78f738568..263b0420fbe4 100644
--- a/drivers/thermal/ti-soc-thermal/ti-bandgap.c
+++ b/drivers/thermal/ti-soc-thermal/ti-bandgap.c
@@ -15,7 +15,7 @@
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/clk.h>
-#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
#include <linux/platform_device.h>
#include <linux/err.h>
#include <linux/types.h>
@@ -24,7 +24,6 @@
#include <linux/of_device.h>
#include <linux/of_platform.h>
#include <linux/of_irq.h>
-#include <linux/of_gpio.h>
#include <linux/io.h>
#include "ti-bandgap.h"
@@ -743,27 +742,13 @@ exit:
static int ti_bandgap_tshut_init(struct ti_bandgap *bgp,
struct platform_device *pdev)
{
- int gpio_nr = bgp->tshut_gpio;
int status;
- /* Request for gpio_86 line */
- status = gpio_request(gpio_nr, "tshut");
- if (status < 0) {
- dev_err(bgp->dev, "Could not request for TSHUT GPIO:%i\n", 86);
- return status;
- }
- status = gpio_direction_input(gpio_nr);
- if (status) {
- dev_err(bgp->dev, "Cannot set input TSHUT GPIO %d\n", gpio_nr);
- return status;
- }
-
- status = request_irq(gpio_to_irq(gpio_nr), ti_bandgap_tshut_irq_handler,
+ status = request_irq(gpiod_to_irq(bgp->tshut_gpiod),
+ ti_bandgap_tshut_irq_handler,
IRQF_TRIGGER_RISING, "tshut", NULL);
- if (status) {
- gpio_free(gpio_nr);
+ if (status)
dev_err(bgp->dev, "request irq failed for TSHUT");
- }
return 0;
}
@@ -860,11 +845,10 @@ static struct ti_bandgap *ti_bandgap_build(struct platform_device *pdev)
} while (res);
if (TI_BANDGAP_HAS(bgp, TSHUT)) {
- bgp->tshut_gpio = of_get_gpio(node, 0);
- if (!gpio_is_valid(bgp->tshut_gpio)) {
- dev_err(&pdev->dev, "invalid gpio for tshut (%d)\n",
- bgp->tshut_gpio);
- return ERR_PTR(-EINVAL);
+ bgp->tshut_gpiod = devm_gpiod_get(&pdev->dev, NULL, GPIOD_IN);
+ if (IS_ERR(bgp->tshut_gpiod)) {
+ dev_err(&pdev->dev, "invalid gpio for tshut\n");
+ return ERR_CAST(bgp->tshut_gpiod);
}
}
@@ -1046,10 +1030,8 @@ put_clks:
put_fclock:
clk_put(bgp->fclock);
free_irqs:
- if (TI_BANDGAP_HAS(bgp, TSHUT)) {
- free_irq(gpio_to_irq(bgp->tshut_gpio), NULL);
- gpio_free(bgp->tshut_gpio);
- }
+ if (TI_BANDGAP_HAS(bgp, TSHUT))
+ free_irq(gpiod_to_irq(bgp->tshut_gpiod), NULL);
return ret;
}
@@ -1079,10 +1061,8 @@ int ti_bandgap_remove(struct platform_device *pdev)
if (TI_BANDGAP_HAS(bgp, TALERT))
free_irq(bgp->irq, bgp);
- if (TI_BANDGAP_HAS(bgp, TSHUT)) {
- free_irq(gpio_to_irq(bgp->tshut_gpio), NULL);
- gpio_free(bgp->tshut_gpio);
- }
+ if (TI_BANDGAP_HAS(bgp, TSHUT))
+ free_irq(gpiod_to_irq(bgp->tshut_gpiod), NULL);
return 0;
}
diff --git a/drivers/thermal/ti-soc-thermal/ti-bandgap.h b/drivers/thermal/ti-soc-thermal/ti-bandgap.h
index bb9b0f7faf99..fce4657e9486 100644
--- a/drivers/thermal/ti-soc-thermal/ti-bandgap.h
+++ b/drivers/thermal/ti-soc-thermal/ti-bandgap.h
@@ -13,6 +13,8 @@
#include <linux/types.h>
#include <linux/err.h>
+struct gpio_desc;
+
/**
* DOC: bandgap driver data structure
* ==================================
@@ -199,7 +201,7 @@ struct ti_bandgap {
struct clk *div_clk;
spinlock_t lock; /* shields this struct */
int irq;
- int tshut_gpio;
+ struct gpio_desc *tshut_gpiod;
u32 clk_rate;
};
diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig
new file mode 100644
index 000000000000..7db1460104b7
--- /dev/null
+++ b/drivers/vdpa/Kconfig
@@ -0,0 +1,37 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config VDPA
+ tristate
+ help
+ Enable this module to support vDPA device that uses a
+ datapath which complies with virtio specifications with
+ vendor specific control path.
+
+menuconfig VDPA_MENU
+ bool "VDPA drivers"
+ default n
+
+if VDPA_MENU
+
+config VDPA_SIM
+ tristate "vDPA device simulator"
+ depends on RUNTIME_TESTING_MENU
+ select VDPA
+ select VHOST_RING
+ default n
+ help
+ vDPA networking device simulator which loop TX traffic back
+ to RX. This device is used for testing, prototyping and
+ development of vDPA.
+
+config IFCVF
+ tristate "Intel IFC VF VDPA driver"
+ depends on PCI_MSI
+ select VDPA
+ default n
+ help
+ This kernel module can drive Intel IFC VF NIC to offload
+ virtio dataplane traffic to hardware.
+ To compile this driver as a module, choose M here: the module will
+ be called ifcvf.
+
+endif # VDPA_MENU
diff --git a/drivers/vdpa/Makefile b/drivers/vdpa/Makefile
new file mode 100644
index 000000000000..8bbb686ca7a2
--- /dev/null
+++ b/drivers/vdpa/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_VDPA) += vdpa.o
+obj-$(CONFIG_VDPA_SIM) += vdpa_sim/
+obj-$(CONFIG_IFCVF) += ifcvf/
diff --git a/drivers/vdpa/ifcvf/Makefile b/drivers/vdpa/ifcvf/Makefile
new file mode 100644
index 000000000000..d709915995ab
--- /dev/null
+++ b/drivers/vdpa/ifcvf/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_IFCVF) += ifcvf.o
+ifcvf-$(CONFIG_IFCVF) += ifcvf_main.o ifcvf_base.o
diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c b/drivers/vdpa/ifcvf/ifcvf_base.c
new file mode 100644
index 000000000000..b61b06ea26d3
--- /dev/null
+++ b/drivers/vdpa/ifcvf/ifcvf_base.c
@@ -0,0 +1,389 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Intel IFC VF NIC driver for virtio dataplane offloading
+ *
+ * Copyright (C) 2020 Intel Corporation.
+ *
+ * Author: Zhu Lingshan <lingshan.zhu@intel.com>
+ *
+ */
+
+#include "ifcvf_base.h"
+
+static inline u8 ifc_ioread8(u8 __iomem *addr)
+{
+ return ioread8(addr);
+}
+static inline u16 ifc_ioread16 (__le16 __iomem *addr)
+{
+ return ioread16(addr);
+}
+
+static inline u32 ifc_ioread32(__le32 __iomem *addr)
+{
+ return ioread32(addr);
+}
+
+static inline void ifc_iowrite8(u8 value, u8 __iomem *addr)
+{
+ iowrite8(value, addr);
+}
+
+static inline void ifc_iowrite16(u16 value, __le16 __iomem *addr)
+{
+ iowrite16(value, addr);
+}
+
+static inline void ifc_iowrite32(u32 value, __le32 __iomem *addr)
+{
+ iowrite32(value, addr);
+}
+
+static void ifc_iowrite64_twopart(u64 val,
+ __le32 __iomem *lo, __le32 __iomem *hi)
+{
+ ifc_iowrite32((u32)val, lo);
+ ifc_iowrite32(val >> 32, hi);
+}
+
+struct ifcvf_adapter *vf_to_adapter(struct ifcvf_hw *hw)
+{
+ return container_of(hw, struct ifcvf_adapter, vf);
+}
+
+static void __iomem *get_cap_addr(struct ifcvf_hw *hw,
+ struct virtio_pci_cap *cap)
+{
+ struct ifcvf_adapter *ifcvf;
+ struct pci_dev *pdev;
+ u32 length, offset;
+ u8 bar;
+
+ length = le32_to_cpu(cap->length);
+ offset = le32_to_cpu(cap->offset);
+ bar = cap->bar;
+
+ ifcvf= vf_to_adapter(hw);
+ pdev = ifcvf->pdev;
+
+ if (bar >= IFCVF_PCI_MAX_RESOURCE) {
+ IFCVF_DBG(pdev,
+ "Invalid bar number %u to get capabilities\n", bar);
+ return NULL;
+ }
+
+ if (offset + length > pci_resource_len(pdev, bar)) {
+ IFCVF_DBG(pdev,
+ "offset(%u) + len(%u) overflows bar%u's capability\n",
+ offset, length, bar);
+ return NULL;
+ }
+
+ return hw->base[bar] + offset;
+}
+
+static int ifcvf_read_config_range(struct pci_dev *dev,
+ uint32_t *val, int size, int where)
+{
+ int ret, i;
+
+ for (i = 0; i < size; i += 4) {
+ ret = pci_read_config_dword(dev, where + i, val + i / 4);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+int ifcvf_init_hw(struct ifcvf_hw *hw, struct pci_dev *pdev)
+{
+ struct virtio_pci_cap cap;
+ u16 notify_off;
+ int ret;
+ u8 pos;
+ u32 i;
+
+ ret = pci_read_config_byte(pdev, PCI_CAPABILITY_LIST, &pos);
+ if (ret < 0) {
+ IFCVF_ERR(pdev, "Failed to read PCI capability list\n");
+ return -EIO;
+ }
+
+ while (pos) {
+ ret = ifcvf_read_config_range(pdev, (u32 *)&cap,
+ sizeof(cap), pos);
+ if (ret < 0) {
+ IFCVF_ERR(pdev,
+ "Failed to get PCI capability at %x\n", pos);
+ break;
+ }
+
+ if (cap.cap_vndr != PCI_CAP_ID_VNDR)
+ goto next;
+
+ switch (cap.cfg_type) {
+ case VIRTIO_PCI_CAP_COMMON_CFG:
+ hw->common_cfg = get_cap_addr(hw, &cap);
+ IFCVF_DBG(pdev, "hw->common_cfg = %p\n",
+ hw->common_cfg);
+ break;
+ case VIRTIO_PCI_CAP_NOTIFY_CFG:
+ pci_read_config_dword(pdev, pos + sizeof(cap),
+ &hw->notify_off_multiplier);
+ hw->notify_bar = cap.bar;
+ hw->notify_base = get_cap_addr(hw, &cap);
+ IFCVF_DBG(pdev, "hw->notify_base = %p\n",
+ hw->notify_base);
+ break;
+ case VIRTIO_PCI_CAP_ISR_CFG:
+ hw->isr = get_cap_addr(hw, &cap);
+ IFCVF_DBG(pdev, "hw->isr = %p\n", hw->isr);
+ break;
+ case VIRTIO_PCI_CAP_DEVICE_CFG:
+ hw->net_cfg = get_cap_addr(hw, &cap);
+ IFCVF_DBG(pdev, "hw->net_cfg = %p\n", hw->net_cfg);
+ break;
+ }
+
+next:
+ pos = cap.cap_next;
+ }
+
+ if (hw->common_cfg == NULL || hw->notify_base == NULL ||
+ hw->isr == NULL || hw->net_cfg == NULL) {
+ IFCVF_ERR(pdev, "Incomplete PCI capabilities\n");
+ return -EIO;
+ }
+
+ for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++) {
+ ifc_iowrite16(i, &hw->common_cfg->queue_select);
+ notify_off = ifc_ioread16(&hw->common_cfg->queue_notify_off);
+ hw->vring[i].notify_addr = hw->notify_base +
+ notify_off * hw->notify_off_multiplier;
+ }
+
+ hw->lm_cfg = hw->base[IFCVF_LM_BAR];
+
+ IFCVF_DBG(pdev,
+ "PCI capability mapping: common cfg: %p, notify base: %p\n, isr cfg: %p, device cfg: %p, multiplier: %u\n",
+ hw->common_cfg, hw->notify_base, hw->isr,
+ hw->net_cfg, hw->notify_off_multiplier);
+
+ return 0;
+}
+
+u8 ifcvf_get_status(struct ifcvf_hw *hw)
+{
+ return ifc_ioread8(&hw->common_cfg->device_status);
+}
+
+void ifcvf_set_status(struct ifcvf_hw *hw, u8 status)
+{
+ ifc_iowrite8(status, &hw->common_cfg->device_status);
+}
+
+void ifcvf_reset(struct ifcvf_hw *hw)
+{
+ ifcvf_set_status(hw, 0);
+ /* flush set_status, make sure VF is stopped, reset */
+ ifcvf_get_status(hw);
+}
+
+static void ifcvf_add_status(struct ifcvf_hw *hw, u8 status)
+{
+ if (status != 0)
+ status |= ifcvf_get_status(hw);
+
+ ifcvf_set_status(hw, status);
+ ifcvf_get_status(hw);
+}
+
+u64 ifcvf_get_features(struct ifcvf_hw *hw)
+{
+ struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg;
+ u32 features_lo, features_hi;
+
+ ifc_iowrite32(0, &cfg->device_feature_select);
+ features_lo = ifc_ioread32(&cfg->device_feature);
+
+ ifc_iowrite32(1, &cfg->device_feature_select);
+ features_hi = ifc_ioread32(&cfg->device_feature);
+
+ return ((u64)features_hi << 32) | features_lo;
+}
+
+void ifcvf_read_net_config(struct ifcvf_hw *hw, u64 offset,
+ void *dst, int length)
+{
+ u8 old_gen, new_gen, *p;
+ int i;
+
+ WARN_ON(offset + length > sizeof(struct virtio_net_config));
+ do {
+ old_gen = ifc_ioread8(&hw->common_cfg->config_generation);
+ p = dst;
+ for (i = 0; i < length; i++)
+ *p++ = ifc_ioread8(hw->net_cfg + offset + i);
+
+ new_gen = ifc_ioread8(&hw->common_cfg->config_generation);
+ } while (old_gen != new_gen);
+}
+
+void ifcvf_write_net_config(struct ifcvf_hw *hw, u64 offset,
+ const void *src, int length)
+{
+ const u8 *p;
+ int i;
+
+ p = src;
+ WARN_ON(offset + length > sizeof(struct virtio_net_config));
+ for (i = 0; i < length; i++)
+ ifc_iowrite8(*p++, hw->net_cfg + offset + i);
+}
+
+static void ifcvf_set_features(struct ifcvf_hw *hw, u64 features)
+{
+ struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg;
+
+ ifc_iowrite32(0, &cfg->guest_feature_select);
+ ifc_iowrite32((u32)features, &cfg->guest_feature);
+
+ ifc_iowrite32(1, &cfg->guest_feature_select);
+ ifc_iowrite32(features >> 32, &cfg->guest_feature);
+}
+
+static int ifcvf_config_features(struct ifcvf_hw *hw)
+{
+ struct ifcvf_adapter *ifcvf;
+
+ ifcvf = vf_to_adapter(hw);
+ ifcvf_set_features(hw, hw->req_features);
+ ifcvf_add_status(hw, VIRTIO_CONFIG_S_FEATURES_OK);
+
+ if (!(ifcvf_get_status(hw) & VIRTIO_CONFIG_S_FEATURES_OK)) {
+ IFCVF_ERR(ifcvf->pdev, "Failed to set FEATURES_OK status\n");
+ return -EIO;
+ }
+
+ return 0;
+}
+
+u64 ifcvf_get_vq_state(struct ifcvf_hw *hw, u16 qid)
+{
+ struct ifcvf_lm_cfg __iomem *ifcvf_lm;
+ void __iomem *avail_idx_addr;
+ u16 last_avail_idx;
+ u32 q_pair_id;
+
+ ifcvf_lm = (struct ifcvf_lm_cfg __iomem *)hw->lm_cfg;
+ q_pair_id = qid / (IFCVF_MAX_QUEUE_PAIRS * 2);
+ avail_idx_addr = &ifcvf_lm->vring_lm_cfg[q_pair_id].idx_addr[qid % 2];
+ last_avail_idx = ifc_ioread16(avail_idx_addr);
+
+ return last_avail_idx;
+}
+
+int ifcvf_set_vq_state(struct ifcvf_hw *hw, u16 qid, u64 num)
+{
+ struct ifcvf_lm_cfg __iomem *ifcvf_lm;
+ void __iomem *avail_idx_addr;
+ u32 q_pair_id;
+
+ ifcvf_lm = (struct ifcvf_lm_cfg __iomem *)hw->lm_cfg;
+ q_pair_id = qid / (IFCVF_MAX_QUEUE_PAIRS * 2);
+ avail_idx_addr = &ifcvf_lm->vring_lm_cfg[q_pair_id].idx_addr[qid % 2];
+ hw->vring[qid].last_avail_idx = num;
+ ifc_iowrite16(num, avail_idx_addr);
+
+ return 0;
+}
+
+static int ifcvf_hw_enable(struct ifcvf_hw *hw)
+{
+ struct ifcvf_lm_cfg __iomem *ifcvf_lm;
+ struct virtio_pci_common_cfg __iomem *cfg;
+ struct ifcvf_adapter *ifcvf;
+ u32 i;
+
+ ifcvf_lm = (struct ifcvf_lm_cfg __iomem *)hw->lm_cfg;
+ ifcvf = vf_to_adapter(hw);
+ cfg = hw->common_cfg;
+ ifc_iowrite16(IFCVF_MSI_CONFIG_OFF, &cfg->msix_config);
+
+ if (ifc_ioread16(&cfg->msix_config) == VIRTIO_MSI_NO_VECTOR) {
+ IFCVF_ERR(ifcvf->pdev, "No msix vector for device config\n");
+ return -EINVAL;
+ }
+
+ for (i = 0; i < hw->nr_vring; i++) {
+ if (!hw->vring[i].ready)
+ break;
+
+ ifc_iowrite16(i, &cfg->queue_select);
+ ifc_iowrite64_twopart(hw->vring[i].desc, &cfg->queue_desc_lo,
+ &cfg->queue_desc_hi);
+ ifc_iowrite64_twopart(hw->vring[i].avail, &cfg->queue_avail_lo,
+ &cfg->queue_avail_hi);
+ ifc_iowrite64_twopart(hw->vring[i].used, &cfg->queue_used_lo,
+ &cfg->queue_used_hi);
+ ifc_iowrite16(hw->vring[i].size, &cfg->queue_size);
+ ifc_iowrite16(i + IFCVF_MSI_QUEUE_OFF, &cfg->queue_msix_vector);
+
+ if (ifc_ioread16(&cfg->queue_msix_vector) ==
+ VIRTIO_MSI_NO_VECTOR) {
+ IFCVF_ERR(ifcvf->pdev,
+ "No msix vector for queue %u\n", i);
+ return -EINVAL;
+ }
+
+ ifcvf_set_vq_state(hw, i, hw->vring[i].last_avail_idx);
+ ifc_iowrite16(1, &cfg->queue_enable);
+ }
+
+ return 0;
+}
+
+static void ifcvf_hw_disable(struct ifcvf_hw *hw)
+{
+ struct virtio_pci_common_cfg __iomem *cfg;
+ u32 i;
+
+ cfg = hw->common_cfg;
+ ifc_iowrite16(VIRTIO_MSI_NO_VECTOR, &cfg->msix_config);
+
+ for (i = 0; i < hw->nr_vring; i++) {
+ ifc_iowrite16(i, &cfg->queue_select);
+ ifc_iowrite16(VIRTIO_MSI_NO_VECTOR, &cfg->queue_msix_vector);
+ }
+
+ ifc_ioread16(&cfg->queue_msix_vector);
+}
+
+int ifcvf_start_hw(struct ifcvf_hw *hw)
+{
+ ifcvf_reset(hw);
+ ifcvf_add_status(hw, VIRTIO_CONFIG_S_ACKNOWLEDGE);
+ ifcvf_add_status(hw, VIRTIO_CONFIG_S_DRIVER);
+
+ if (ifcvf_config_features(hw) < 0)
+ return -EINVAL;
+
+ if (ifcvf_hw_enable(hw) < 0)
+ return -EINVAL;
+
+ ifcvf_add_status(hw, VIRTIO_CONFIG_S_DRIVER_OK);
+
+ return 0;
+}
+
+void ifcvf_stop_hw(struct ifcvf_hw *hw)
+{
+ ifcvf_hw_disable(hw);
+ ifcvf_reset(hw);
+}
+
+void ifcvf_notify_queue(struct ifcvf_hw *hw, u16 qid)
+{
+ ifc_iowrite16(qid, hw->vring[qid].notify_addr);
+}
diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h
new file mode 100644
index 000000000000..e80307092351
--- /dev/null
+++ b/drivers/vdpa/ifcvf/ifcvf_base.h
@@ -0,0 +1,118 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Intel IFC VF NIC driver for virtio dataplane offloading
+ *
+ * Copyright (C) 2020 Intel Corporation.
+ *
+ * Author: Zhu Lingshan <lingshan.zhu@intel.com>
+ *
+ */
+
+#ifndef _IFCVF_H_
+#define _IFCVF_H_
+
+#include <linux/pci.h>
+#include <linux/pci_regs.h>
+#include <linux/vdpa.h>
+#include <uapi/linux/virtio_net.h>
+#include <uapi/linux/virtio_config.h>
+#include <uapi/linux/virtio_pci.h>
+
+#define IFCVF_VENDOR_ID 0x1AF4
+#define IFCVF_DEVICE_ID 0x1041
+#define IFCVF_SUBSYS_VENDOR_ID 0x8086
+#define IFCVF_SUBSYS_DEVICE_ID 0x001A
+
+#define IFCVF_SUPPORTED_FEATURES \
+ ((1ULL << VIRTIO_NET_F_MAC) | \
+ (1ULL << VIRTIO_F_ANY_LAYOUT) | \
+ (1ULL << VIRTIO_F_VERSION_1) | \
+ (1ULL << VIRTIO_F_ORDER_PLATFORM) | \
+ (1ULL << VIRTIO_F_IOMMU_PLATFORM) | \
+ (1ULL << VIRTIO_NET_F_MRG_RXBUF))
+
+/* Only one queue pair for now. */
+#define IFCVF_MAX_QUEUE_PAIRS 1
+
+#define IFCVF_QUEUE_ALIGNMENT PAGE_SIZE
+#define IFCVF_QUEUE_MAX 32768
+#define IFCVF_MSI_CONFIG_OFF 0
+#define IFCVF_MSI_QUEUE_OFF 1
+#define IFCVF_PCI_MAX_RESOURCE 6
+
+#define IFCVF_LM_CFG_SIZE 0x40
+#define IFCVF_LM_RING_STATE_OFFSET 0x20
+#define IFCVF_LM_BAR 4
+
+#define IFCVF_ERR(pdev, fmt, ...) dev_err(&pdev->dev, fmt, ##__VA_ARGS__)
+#define IFCVF_DBG(pdev, fmt, ...) dev_dbg(&pdev->dev, fmt, ##__VA_ARGS__)
+#define IFCVF_INFO(pdev, fmt, ...) dev_info(&pdev->dev, fmt, ##__VA_ARGS__)
+
+#define ifcvf_private_to_vf(adapter) \
+ (&((struct ifcvf_adapter *)adapter)->vf)
+
+#define IFCVF_MAX_INTR (IFCVF_MAX_QUEUE_PAIRS * 2 + 1)
+
+struct vring_info {
+ u64 desc;
+ u64 avail;
+ u64 used;
+ u16 size;
+ u16 last_avail_idx;
+ bool ready;
+ void __iomem *notify_addr;
+ u32 irq;
+ struct vdpa_callback cb;
+ char msix_name[256];
+};
+
+struct ifcvf_hw {
+ u8 __iomem *isr;
+ /* Live migration */
+ u8 __iomem *lm_cfg;
+ u16 nr_vring;
+ /* Notification bar number */
+ u8 notify_bar;
+ /* Notificaiton bar address */
+ void __iomem *notify_base;
+ u32 notify_off_multiplier;
+ u64 req_features;
+ struct virtio_pci_common_cfg __iomem *common_cfg;
+ void __iomem *net_cfg;
+ struct vring_info vring[IFCVF_MAX_QUEUE_PAIRS * 2];
+ void __iomem * const *base;
+};
+
+struct ifcvf_adapter {
+ struct vdpa_device vdpa;
+ struct pci_dev *pdev;
+ struct ifcvf_hw vf;
+};
+
+struct ifcvf_vring_lm_cfg {
+ u32 idx_addr[2];
+ u8 reserved[IFCVF_LM_CFG_SIZE - 8];
+};
+
+struct ifcvf_lm_cfg {
+ u8 reserved[IFCVF_LM_RING_STATE_OFFSET];
+ struct ifcvf_vring_lm_cfg vring_lm_cfg[IFCVF_MAX_QUEUE_PAIRS];
+};
+
+int ifcvf_init_hw(struct ifcvf_hw *hw, struct pci_dev *dev);
+int ifcvf_start_hw(struct ifcvf_hw *hw);
+void ifcvf_stop_hw(struct ifcvf_hw *hw);
+void ifcvf_notify_queue(struct ifcvf_hw *hw, u16 qid);
+void ifcvf_read_net_config(struct ifcvf_hw *hw, u64 offset,
+ void *dst, int length);
+void ifcvf_write_net_config(struct ifcvf_hw *hw, u64 offset,
+ const void *src, int length);
+u8 ifcvf_get_status(struct ifcvf_hw *hw);
+void ifcvf_set_status(struct ifcvf_hw *hw, u8 status);
+void io_write64_twopart(u64 val, u32 *lo, u32 *hi);
+void ifcvf_reset(struct ifcvf_hw *hw);
+u64 ifcvf_get_features(struct ifcvf_hw *hw);
+u64 ifcvf_get_vq_state(struct ifcvf_hw *hw, u16 qid);
+int ifcvf_set_vq_state(struct ifcvf_hw *hw, u16 qid, u64 num);
+struct ifcvf_adapter *vf_to_adapter(struct ifcvf_hw *hw);
+#endif /* _IFCVF_H_ */
diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c
new file mode 100644
index 000000000000..8d54dc5b08d2
--- /dev/null
+++ b/drivers/vdpa/ifcvf/ifcvf_main.c
@@ -0,0 +1,435 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Intel IFC VF NIC driver for virtio dataplane offloading
+ *
+ * Copyright (C) 2020 Intel Corporation.
+ *
+ * Author: Zhu Lingshan <lingshan.zhu@intel.com>
+ *
+ */
+
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/sysfs.h>
+#include "ifcvf_base.h"
+
+#define VERSION_STRING "0.1"
+#define DRIVER_AUTHOR "Intel Corporation"
+#define IFCVF_DRIVER_NAME "ifcvf"
+
+static irqreturn_t ifcvf_intr_handler(int irq, void *arg)
+{
+ struct vring_info *vring = arg;
+
+ if (vring->cb.callback)
+ return vring->cb.callback(vring->cb.private);
+
+ return IRQ_HANDLED;
+}
+
+static int ifcvf_start_datapath(void *private)
+{
+ struct ifcvf_hw *vf = ifcvf_private_to_vf(private);
+ struct ifcvf_adapter *ifcvf;
+ u8 status;
+ int ret;
+
+ ifcvf = vf_to_adapter(vf);
+ vf->nr_vring = IFCVF_MAX_QUEUE_PAIRS * 2;
+ ret = ifcvf_start_hw(vf);
+ if (ret < 0) {
+ status = ifcvf_get_status(vf);
+ status |= VIRTIO_CONFIG_S_FAILED;
+ ifcvf_set_status(vf, status);
+ }
+
+ return ret;
+}
+
+static int ifcvf_stop_datapath(void *private)
+{
+ struct ifcvf_hw *vf = ifcvf_private_to_vf(private);
+ int i;
+
+ for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++)
+ vf->vring[i].cb.callback = NULL;
+
+ ifcvf_stop_hw(vf);
+
+ return 0;
+}
+
+static void ifcvf_reset_vring(struct ifcvf_adapter *adapter)
+{
+ struct ifcvf_hw *vf = ifcvf_private_to_vf(adapter);
+ int i;
+
+ for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++) {
+ vf->vring[i].last_avail_idx = 0;
+ vf->vring[i].desc = 0;
+ vf->vring[i].avail = 0;
+ vf->vring[i].used = 0;
+ vf->vring[i].ready = 0;
+ vf->vring[i].cb.callback = NULL;
+ vf->vring[i].cb.private = NULL;
+ }
+
+ ifcvf_reset(vf);
+}
+
+static struct ifcvf_adapter *vdpa_to_adapter(struct vdpa_device *vdpa_dev)
+{
+ return container_of(vdpa_dev, struct ifcvf_adapter, vdpa);
+}
+
+static struct ifcvf_hw *vdpa_to_vf(struct vdpa_device *vdpa_dev)
+{
+ struct ifcvf_adapter *adapter = vdpa_to_adapter(vdpa_dev);
+
+ return &adapter->vf;
+}
+
+static u64 ifcvf_vdpa_get_features(struct vdpa_device *vdpa_dev)
+{
+ struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+ u64 features;
+
+ features = ifcvf_get_features(vf) & IFCVF_SUPPORTED_FEATURES;
+
+ return features;
+}
+
+static int ifcvf_vdpa_set_features(struct vdpa_device *vdpa_dev, u64 features)
+{
+ struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+ vf->req_features = features;
+
+ return 0;
+}
+
+static u8 ifcvf_vdpa_get_status(struct vdpa_device *vdpa_dev)
+{
+ struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+ return ifcvf_get_status(vf);
+}
+
+static void ifcvf_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status)
+{
+ struct ifcvf_adapter *adapter;
+ struct ifcvf_hw *vf;
+
+ vf = vdpa_to_vf(vdpa_dev);
+ adapter = dev_get_drvdata(vdpa_dev->dev.parent);
+
+ if (status == 0) {
+ ifcvf_stop_datapath(adapter);
+ ifcvf_reset_vring(adapter);
+ return;
+ }
+
+ if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
+ if (ifcvf_start_datapath(adapter) < 0)
+ IFCVF_ERR(adapter->pdev,
+ "Failed to set ifcvf vdpa status %u\n",
+ status);
+ }
+
+ ifcvf_set_status(vf, status);
+}
+
+static u16 ifcvf_vdpa_get_vq_num_max(struct vdpa_device *vdpa_dev)
+{
+ return IFCVF_QUEUE_MAX;
+}
+
+static u64 ifcvf_vdpa_get_vq_state(struct vdpa_device *vdpa_dev, u16 qid)
+{
+ struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+ return ifcvf_get_vq_state(vf, qid);
+}
+
+static int ifcvf_vdpa_set_vq_state(struct vdpa_device *vdpa_dev, u16 qid,
+ u64 num)
+{
+ struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+ return ifcvf_set_vq_state(vf, qid, num);
+}
+
+static void ifcvf_vdpa_set_vq_cb(struct vdpa_device *vdpa_dev, u16 qid,
+ struct vdpa_callback *cb)
+{
+ struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+ vf->vring[qid].cb = *cb;
+}
+
+static void ifcvf_vdpa_set_vq_ready(struct vdpa_device *vdpa_dev,
+ u16 qid, bool ready)
+{
+ struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+ vf->vring[qid].ready = ready;
+}
+
+static bool ifcvf_vdpa_get_vq_ready(struct vdpa_device *vdpa_dev, u16 qid)
+{
+ struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+ return vf->vring[qid].ready;
+}
+
+static void ifcvf_vdpa_set_vq_num(struct vdpa_device *vdpa_dev, u16 qid,
+ u32 num)
+{
+ struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+ vf->vring[qid].size = num;
+}
+
+static int ifcvf_vdpa_set_vq_address(struct vdpa_device *vdpa_dev, u16 qid,
+ u64 desc_area, u64 driver_area,
+ u64 device_area)
+{
+ struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+ vf->vring[qid].desc = desc_area;
+ vf->vring[qid].avail = driver_area;
+ vf->vring[qid].used = device_area;
+
+ return 0;
+}
+
+static void ifcvf_vdpa_kick_vq(struct vdpa_device *vdpa_dev, u16 qid)
+{
+ struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+ ifcvf_notify_queue(vf, qid);
+}
+
+static u32 ifcvf_vdpa_get_generation(struct vdpa_device *vdpa_dev)
+{
+ struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+ return ioread8(&vf->common_cfg->config_generation);
+}
+
+static u32 ifcvf_vdpa_get_device_id(struct vdpa_device *vdpa_dev)
+{
+ return VIRTIO_ID_NET;
+}
+
+static u32 ifcvf_vdpa_get_vendor_id(struct vdpa_device *vdpa_dev)
+{
+ return IFCVF_SUBSYS_VENDOR_ID;
+}
+
+static u16 ifcvf_vdpa_get_vq_align(struct vdpa_device *vdpa_dev)
+{
+ return IFCVF_QUEUE_ALIGNMENT;
+}
+
+static void ifcvf_vdpa_get_config(struct vdpa_device *vdpa_dev,
+ unsigned int offset,
+ void *buf, unsigned int len)
+{
+ struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+ WARN_ON(offset + len > sizeof(struct virtio_net_config));
+ ifcvf_read_net_config(vf, offset, buf, len);
+}
+
+static void ifcvf_vdpa_set_config(struct vdpa_device *vdpa_dev,
+ unsigned int offset, const void *buf,
+ unsigned int len)
+{
+ struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+ WARN_ON(offset + len > sizeof(struct virtio_net_config));
+ ifcvf_write_net_config(vf, offset, buf, len);
+}
+
+static void ifcvf_vdpa_set_config_cb(struct vdpa_device *vdpa_dev,
+ struct vdpa_callback *cb)
+{
+ /* We don't support config interrupt */
+}
+
+/*
+ * IFCVF currently does't have on-chip IOMMU, so not
+ * implemented set_map()/dma_map()/dma_unmap()
+ */
+static const struct vdpa_config_ops ifc_vdpa_ops = {
+ .get_features = ifcvf_vdpa_get_features,
+ .set_features = ifcvf_vdpa_set_features,
+ .get_status = ifcvf_vdpa_get_status,
+ .set_status = ifcvf_vdpa_set_status,
+ .get_vq_num_max = ifcvf_vdpa_get_vq_num_max,
+ .get_vq_state = ifcvf_vdpa_get_vq_state,
+ .set_vq_state = ifcvf_vdpa_set_vq_state,
+ .set_vq_cb = ifcvf_vdpa_set_vq_cb,
+ .set_vq_ready = ifcvf_vdpa_set_vq_ready,
+ .get_vq_ready = ifcvf_vdpa_get_vq_ready,
+ .set_vq_num = ifcvf_vdpa_set_vq_num,
+ .set_vq_address = ifcvf_vdpa_set_vq_address,
+ .kick_vq = ifcvf_vdpa_kick_vq,
+ .get_generation = ifcvf_vdpa_get_generation,
+ .get_device_id = ifcvf_vdpa_get_device_id,
+ .get_vendor_id = ifcvf_vdpa_get_vendor_id,
+ .get_vq_align = ifcvf_vdpa_get_vq_align,
+ .get_config = ifcvf_vdpa_get_config,
+ .set_config = ifcvf_vdpa_set_config,
+ .set_config_cb = ifcvf_vdpa_set_config_cb,
+};
+
+static int ifcvf_request_irq(struct ifcvf_adapter *adapter)
+{
+ struct pci_dev *pdev = adapter->pdev;
+ struct ifcvf_hw *vf = &adapter->vf;
+ int vector, i, ret, irq;
+
+
+ for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++) {
+ snprintf(vf->vring[i].msix_name, 256, "ifcvf[%s]-%d\n",
+ pci_name(pdev), i);
+ vector = i + IFCVF_MSI_QUEUE_OFF;
+ irq = pci_irq_vector(pdev, vector);
+ ret = devm_request_irq(&pdev->dev, irq,
+ ifcvf_intr_handler, 0,
+ vf->vring[i].msix_name,
+ &vf->vring[i]);
+ if (ret) {
+ IFCVF_ERR(pdev,
+ "Failed to request irq for vq %d\n", i);
+ return ret;
+ }
+ vf->vring[i].irq = irq;
+ }
+
+ return 0;
+}
+
+static void ifcvf_free_irq_vectors(void *data)
+{
+ pci_free_irq_vectors(data);
+}
+
+static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+ struct device *dev = &pdev->dev;
+ struct ifcvf_adapter *adapter;
+ struct ifcvf_hw *vf;
+ int ret;
+
+ ret = pcim_enable_device(pdev);
+ if (ret) {
+ IFCVF_ERR(pdev, "Failed to enable device\n");
+ return ret;
+ }
+
+ ret = pcim_iomap_regions(pdev, BIT(0) | BIT(2) | BIT(4),
+ IFCVF_DRIVER_NAME);
+ if (ret) {
+ IFCVF_ERR(pdev, "Failed to request MMIO region\n");
+ return ret;
+ }
+
+ ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+ if (ret) {
+ IFCVF_ERR(pdev, "No usable DMA confiugration\n");
+ return ret;
+ }
+
+ ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+ if (ret) {
+ IFCVF_ERR(pdev,
+ "No usable coherent DMA confiugration\n");
+ return ret;
+ }
+
+ ret = pci_alloc_irq_vectors(pdev, IFCVF_MAX_INTR,
+ IFCVF_MAX_INTR, PCI_IRQ_MSIX);
+ if (ret < 0) {
+ IFCVF_ERR(pdev, "Failed to alloc irq vectors\n");
+ return ret;
+ }
+
+ ret = devm_add_action_or_reset(dev, ifcvf_free_irq_vectors, pdev);
+ if (ret) {
+ IFCVF_ERR(pdev,
+ "Failed for adding devres for freeing irq vectors\n");
+ return ret;
+ }
+
+ adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa,
+ dev, &ifc_vdpa_ops);
+ if (adapter == NULL) {
+ IFCVF_ERR(pdev, "Failed to allocate vDPA structure");
+ return -ENOMEM;
+ }
+
+ pci_set_master(pdev);
+ pci_set_drvdata(pdev, adapter);
+
+ vf = &adapter->vf;
+ vf->base = pcim_iomap_table(pdev);
+
+ adapter->pdev = pdev;
+ adapter->vdpa.dma_dev = &pdev->dev;
+
+ ret = ifcvf_request_irq(adapter);
+ if (ret) {
+ IFCVF_ERR(pdev, "Failed to request MSI-X irq\n");
+ goto err;
+ }
+
+ ret = ifcvf_init_hw(vf, pdev);
+ if (ret) {
+ IFCVF_ERR(pdev, "Failed to init IFCVF hw\n");
+ goto err;
+ }
+
+ ret = vdpa_register_device(&adapter->vdpa);
+ if (ret) {
+ IFCVF_ERR(pdev, "Failed to register ifcvf to vdpa bus");
+ goto err;
+ }
+
+ return 0;
+
+err:
+ put_device(&adapter->vdpa.dev);
+ return ret;
+}
+
+static void ifcvf_remove(struct pci_dev *pdev)
+{
+ struct ifcvf_adapter *adapter = pci_get_drvdata(pdev);
+
+ vdpa_unregister_device(&adapter->vdpa);
+}
+
+static struct pci_device_id ifcvf_pci_ids[] = {
+ { PCI_DEVICE_SUB(IFCVF_VENDOR_ID,
+ IFCVF_DEVICE_ID,
+ IFCVF_SUBSYS_VENDOR_ID,
+ IFCVF_SUBSYS_DEVICE_ID) },
+ { 0 },
+};
+MODULE_DEVICE_TABLE(pci, ifcvf_pci_ids);
+
+static struct pci_driver ifcvf_driver = {
+ .name = IFCVF_DRIVER_NAME,
+ .id_table = ifcvf_pci_ids,
+ .probe = ifcvf_probe,
+ .remove = ifcvf_remove,
+};
+
+module_pci_driver(ifcvf_driver);
+
+MODULE_LICENSE("GPL v2");
+MODULE_VERSION(VERSION_STRING);
diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
new file mode 100644
index 000000000000..e9ed6a2b635b
--- /dev/null
+++ b/drivers/vdpa/vdpa.c
@@ -0,0 +1,180 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vDPA bus.
+ *
+ * Copyright (c) 2020, Red Hat. All rights reserved.
+ * Author: Jason Wang <jasowang@redhat.com>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/idr.h>
+#include <linux/slab.h>
+#include <linux/vdpa.h>
+
+static DEFINE_IDA(vdpa_index_ida);
+
+static int vdpa_dev_probe(struct device *d)
+{
+ struct vdpa_device *vdev = dev_to_vdpa(d);
+ struct vdpa_driver *drv = drv_to_vdpa(vdev->dev.driver);
+ int ret = 0;
+
+ if (drv && drv->probe)
+ ret = drv->probe(vdev);
+
+ return ret;
+}
+
+static int vdpa_dev_remove(struct device *d)
+{
+ struct vdpa_device *vdev = dev_to_vdpa(d);
+ struct vdpa_driver *drv = drv_to_vdpa(vdev->dev.driver);
+
+ if (drv && drv->remove)
+ drv->remove(vdev);
+
+ return 0;
+}
+
+static struct bus_type vdpa_bus = {
+ .name = "vdpa",
+ .probe = vdpa_dev_probe,
+ .remove = vdpa_dev_remove,
+};
+
+static void vdpa_release_dev(struct device *d)
+{
+ struct vdpa_device *vdev = dev_to_vdpa(d);
+ const struct vdpa_config_ops *ops = vdev->config;
+
+ if (ops->free)
+ ops->free(vdev);
+
+ ida_simple_remove(&vdpa_index_ida, vdev->index);
+ kfree(vdev);
+}
+
+/**
+ * __vdpa_alloc_device - allocate and initilaize a vDPA device
+ * This allows driver to some prepartion after device is
+ * initialized but before registered.
+ * @parent: the parent device
+ * @config: the bus operations that is supported by this device
+ * @size: size of the parent structure that contains private data
+ *
+ * Drvier should use vdap_alloc_device() wrapper macro instead of
+ * using this directly.
+ *
+ * Returns an error when parent/config/dma_dev is not set or fail to get
+ * ida.
+ */
+struct vdpa_device *__vdpa_alloc_device(struct device *parent,
+ const struct vdpa_config_ops *config,
+ size_t size)
+{
+ struct vdpa_device *vdev;
+ int err = -EINVAL;
+
+ if (!config)
+ goto err;
+
+ if (!!config->dma_map != !!config->dma_unmap)
+ goto err;
+
+ err = -ENOMEM;
+ vdev = kzalloc(size, GFP_KERNEL);
+ if (!vdev)
+ goto err;
+
+ err = ida_simple_get(&vdpa_index_ida, 0, 0, GFP_KERNEL);
+ if (err < 0)
+ goto err_ida;
+
+ vdev->dev.bus = &vdpa_bus;
+ vdev->dev.parent = parent;
+ vdev->dev.release = vdpa_release_dev;
+ vdev->index = err;
+ vdev->config = config;
+
+ err = dev_set_name(&vdev->dev, "vdpa%u", vdev->index);
+ if (err)
+ goto err_name;
+
+ device_initialize(&vdev->dev);
+
+ return vdev;
+
+err_name:
+ ida_simple_remove(&vdpa_index_ida, vdev->index);
+err_ida:
+ kfree(vdev);
+err:
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(__vdpa_alloc_device);
+
+/**
+ * vdpa_register_device - register a vDPA device
+ * Callers must have a succeed call of vdpa_init_device() before.
+ * @vdev: the vdpa device to be registered to vDPA bus
+ *
+ * Returns an error when fail to add to vDPA bus
+ */
+int vdpa_register_device(struct vdpa_device *vdev)
+{
+ return device_add(&vdev->dev);
+}
+EXPORT_SYMBOL_GPL(vdpa_register_device);
+
+/**
+ * vdpa_unregister_device - unregister a vDPA device
+ * @vdev: the vdpa device to be unregisted from vDPA bus
+ */
+void vdpa_unregister_device(struct vdpa_device *vdev)
+{
+ device_unregister(&vdev->dev);
+}
+EXPORT_SYMBOL_GPL(vdpa_unregister_device);
+
+/**
+ * __vdpa_register_driver - register a vDPA device driver
+ * @drv: the vdpa device driver to be registered
+ * @owner: module owner of the driver
+ *
+ * Returns an err when fail to do the registration
+ */
+int __vdpa_register_driver(struct vdpa_driver *drv, struct module *owner)
+{
+ drv->driver.bus = &vdpa_bus;
+ drv->driver.owner = owner;
+
+ return driver_register(&drv->driver);
+}
+EXPORT_SYMBOL_GPL(__vdpa_register_driver);
+
+/**
+ * vdpa_unregister_driver - unregister a vDPA device driver
+ * @drv: the vdpa device driver to be unregistered
+ */
+void vdpa_unregister_driver(struct vdpa_driver *drv)
+{
+ driver_unregister(&drv->driver);
+}
+EXPORT_SYMBOL_GPL(vdpa_unregister_driver);
+
+static int vdpa_init(void)
+{
+ return bus_register(&vdpa_bus);
+}
+
+static void __exit vdpa_exit(void)
+{
+ bus_unregister(&vdpa_bus);
+ ida_destroy(&vdpa_index_ida);
+}
+core_initcall(vdpa_init);
+module_exit(vdpa_exit);
+
+MODULE_AUTHOR("Jason Wang <jasowang@redhat.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/vdpa/vdpa_sim/Makefile b/drivers/vdpa/vdpa_sim/Makefile
new file mode 100644
index 000000000000..b40278f65e04
--- /dev/null
+++ b/drivers/vdpa/vdpa_sim/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_VDPA_SIM) += vdpa_sim.o
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
new file mode 100644
index 000000000000..6e8a0cf2fdeb
--- /dev/null
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -0,0 +1,629 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VDPA networking device simulator.
+ *
+ * Copyright (c) 2020, Red Hat Inc. All rights reserved.
+ * Author: Jason Wang <jasowang@redhat.com>
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/uuid.h>
+#include <linux/iommu.h>
+#include <linux/dma-mapping.h>
+#include <linux/sysfs.h>
+#include <linux/file.h>
+#include <linux/etherdevice.h>
+#include <linux/vringh.h>
+#include <linux/vdpa.h>
+#include <linux/vhost_iotlb.h>
+#include <uapi/linux/virtio_config.h>
+#include <uapi/linux/virtio_net.h>
+
+#define DRV_VERSION "0.1"
+#define DRV_AUTHOR "Jason Wang <jasowang@redhat.com>"
+#define DRV_DESC "vDPA Device Simulator"
+#define DRV_LICENSE "GPL v2"
+
+struct vdpasim_virtqueue {
+ struct vringh vring;
+ struct vringh_kiov iov;
+ unsigned short head;
+ bool ready;
+ u64 desc_addr;
+ u64 device_addr;
+ u64 driver_addr;
+ u32 num;
+ void *private;
+ irqreturn_t (*cb)(void *data);
+};
+
+#define VDPASIM_QUEUE_ALIGN PAGE_SIZE
+#define VDPASIM_QUEUE_MAX 256
+#define VDPASIM_DEVICE_ID 0x1
+#define VDPASIM_VENDOR_ID 0
+#define VDPASIM_VQ_NUM 0x2
+#define VDPASIM_NAME "vdpasim-netdev"
+
+static u64 vdpasim_features = (1ULL << VIRTIO_F_ANY_LAYOUT) |
+ (1ULL << VIRTIO_F_VERSION_1) |
+ (1ULL << VIRTIO_F_IOMMU_PLATFORM);
+
+/* State of each vdpasim device */
+struct vdpasim {
+ struct vdpa_device vdpa;
+ struct vdpasim_virtqueue vqs[2];
+ struct work_struct work;
+ /* spinlock to synchronize virtqueue state */
+ spinlock_t lock;
+ struct virtio_net_config config;
+ struct vhost_iotlb *iommu;
+ void *buffer;
+ u32 status;
+ u32 generation;
+ u64 features;
+};
+
+static struct vdpasim *vdpasim_dev;
+
+static struct vdpasim *vdpa_to_sim(struct vdpa_device *vdpa)
+{
+ return container_of(vdpa, struct vdpasim, vdpa);
+}
+
+static struct vdpasim *dev_to_sim(struct device *dev)
+{
+ struct vdpa_device *vdpa = dev_to_vdpa(dev);
+
+ return vdpa_to_sim(vdpa);
+}
+
+static void vdpasim_queue_ready(struct vdpasim *vdpasim, unsigned int idx)
+{
+ struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
+ int ret;
+
+ ret = vringh_init_iotlb(&vq->vring, vdpasim_features,
+ VDPASIM_QUEUE_MAX, false,
+ (struct vring_desc *)(uintptr_t)vq->desc_addr,
+ (struct vring_avail *)
+ (uintptr_t)vq->driver_addr,
+ (struct vring_used *)
+ (uintptr_t)vq->device_addr);
+}
+
+static void vdpasim_vq_reset(struct vdpasim_virtqueue *vq)
+{
+ vq->ready = 0;
+ vq->desc_addr = 0;
+ vq->driver_addr = 0;
+ vq->device_addr = 0;
+ vq->cb = NULL;
+ vq->private = NULL;
+ vringh_init_iotlb(&vq->vring, vdpasim_features, VDPASIM_QUEUE_MAX,
+ false, NULL, NULL, NULL);
+}
+
+static void vdpasim_reset(struct vdpasim *vdpasim)
+{
+ int i;
+
+ for (i = 0; i < VDPASIM_VQ_NUM; i++)
+ vdpasim_vq_reset(&vdpasim->vqs[i]);
+
+ vhost_iotlb_reset(vdpasim->iommu);
+
+ vdpasim->features = 0;
+ vdpasim->status = 0;
+ ++vdpasim->generation;
+}
+
+static void vdpasim_work(struct work_struct *work)
+{
+ struct vdpasim *vdpasim = container_of(work, struct
+ vdpasim, work);
+ struct vdpasim_virtqueue *txq = &vdpasim->vqs[1];
+ struct vdpasim_virtqueue *rxq = &vdpasim->vqs[0];
+ size_t read, write, total_write;
+ int err;
+ int pkts = 0;
+
+ spin_lock(&vdpasim->lock);
+
+ if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK))
+ goto out;
+
+ if (!txq->ready || !rxq->ready)
+ goto out;
+
+ while (true) {
+ total_write = 0;
+ err = vringh_getdesc_iotlb(&txq->vring, &txq->iov, NULL,
+ &txq->head, GFP_ATOMIC);
+ if (err <= 0)
+ break;
+
+ err = vringh_getdesc_iotlb(&rxq->vring, NULL, &rxq->iov,
+ &rxq->head, GFP_ATOMIC);
+ if (err <= 0) {
+ vringh_complete_iotlb(&txq->vring, txq->head, 0);
+ break;
+ }
+
+ while (true) {
+ read = vringh_iov_pull_iotlb(&txq->vring, &txq->iov,
+ vdpasim->buffer,
+ PAGE_SIZE);
+ if (read <= 0)
+ break;
+
+ write = vringh_iov_push_iotlb(&rxq->vring, &rxq->iov,
+ vdpasim->buffer, read);
+ if (write <= 0)
+ break;
+
+ total_write += write;
+ }
+
+ /* Make sure data is wrote before advancing index */
+ smp_wmb();
+
+ vringh_complete_iotlb(&txq->vring, txq->head, 0);
+ vringh_complete_iotlb(&rxq->vring, rxq->head, total_write);
+
+ /* Make sure used is visible before rasing the interrupt. */
+ smp_wmb();
+
+ local_bh_disable();
+ if (txq->cb)
+ txq->cb(txq->private);
+ if (rxq->cb)
+ rxq->cb(rxq->private);
+ local_bh_enable();
+
+ if (++pkts > 4) {
+ schedule_work(&vdpasim->work);
+ goto out;
+ }
+ }
+
+out:
+ spin_unlock(&vdpasim->lock);
+}
+
+static int dir_to_perm(enum dma_data_direction dir)
+{
+ int perm = -EFAULT;
+
+ switch (dir) {
+ case DMA_FROM_DEVICE:
+ perm = VHOST_MAP_WO;
+ break;
+ case DMA_TO_DEVICE:
+ perm = VHOST_MAP_RO;
+ break;
+ case DMA_BIDIRECTIONAL:
+ perm = VHOST_MAP_RW;
+ break;
+ default:
+ break;
+ }
+
+ return perm;
+}
+
+static dma_addr_t vdpasim_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size,
+ enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ struct vdpasim *vdpasim = dev_to_sim(dev);
+ struct vhost_iotlb *iommu = vdpasim->iommu;
+ u64 pa = (page_to_pfn(page) << PAGE_SHIFT) + offset;
+ int ret, perm = dir_to_perm(dir);
+
+ if (perm < 0)
+ return DMA_MAPPING_ERROR;
+
+ /* For simplicity, use identical mapping to avoid e.g iova
+ * allocator.
+ */
+ ret = vhost_iotlb_add_range(iommu, pa, pa + size - 1,
+ pa, dir_to_perm(dir));
+ if (ret)
+ return DMA_MAPPING_ERROR;
+
+ return (dma_addr_t)(pa);
+}
+
+static void vdpasim_unmap_page(struct device *dev, dma_addr_t dma_addr,
+ size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ struct vdpasim *vdpasim = dev_to_sim(dev);
+ struct vhost_iotlb *iommu = vdpasim->iommu;
+
+ vhost_iotlb_del_range(iommu, (u64)dma_addr,
+ (u64)dma_addr + size - 1);
+}
+
+static void *vdpasim_alloc_coherent(struct device *dev, size_t size,
+ dma_addr_t *dma_addr, gfp_t flag,
+ unsigned long attrs)
+{
+ struct vdpasim *vdpasim = dev_to_sim(dev);
+ struct vhost_iotlb *iommu = vdpasim->iommu;
+ void *addr = kmalloc(size, flag);
+ int ret;
+
+ if (!addr)
+ *dma_addr = DMA_MAPPING_ERROR;
+ else {
+ u64 pa = virt_to_phys(addr);
+
+ ret = vhost_iotlb_add_range(iommu, (u64)pa,
+ (u64)pa + size - 1,
+ pa, VHOST_MAP_RW);
+ if (ret) {
+ *dma_addr = DMA_MAPPING_ERROR;
+ kfree(addr);
+ addr = NULL;
+ } else
+ *dma_addr = (dma_addr_t)pa;
+ }
+
+ return addr;
+}
+
+static void vdpasim_free_coherent(struct device *dev, size_t size,
+ void *vaddr, dma_addr_t dma_addr,
+ unsigned long attrs)
+{
+ struct vdpasim *vdpasim = dev_to_sim(dev);
+ struct vhost_iotlb *iommu = vdpasim->iommu;
+
+ vhost_iotlb_del_range(iommu, (u64)dma_addr,
+ (u64)dma_addr + size - 1);
+ kfree(phys_to_virt((uintptr_t)dma_addr));
+}
+
+static const struct dma_map_ops vdpasim_dma_ops = {
+ .map_page = vdpasim_map_page,
+ .unmap_page = vdpasim_unmap_page,
+ .alloc = vdpasim_alloc_coherent,
+ .free = vdpasim_free_coherent,
+};
+
+static const struct vdpa_config_ops vdpasim_net_config_ops;
+
+static struct vdpasim *vdpasim_create(void)
+{
+ struct virtio_net_config *config;
+ struct vdpasim *vdpasim;
+ struct device *dev;
+ int ret = -ENOMEM;
+
+ vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL,
+ &vdpasim_net_config_ops);
+ if (!vdpasim)
+ goto err_alloc;
+
+ INIT_WORK(&vdpasim->work, vdpasim_work);
+ spin_lock_init(&vdpasim->lock);
+
+ dev = &vdpasim->vdpa.dev;
+ dev->coherent_dma_mask = DMA_BIT_MASK(64);
+ set_dma_ops(dev, &vdpasim_dma_ops);
+
+ vdpasim->iommu = vhost_iotlb_alloc(2048, 0);
+ if (!vdpasim->iommu)
+ goto err_iommu;
+
+ vdpasim->buffer = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!vdpasim->buffer)
+ goto err_iommu;
+
+ config = &vdpasim->config;
+ config->mtu = 1500;
+ config->status = VIRTIO_NET_S_LINK_UP;
+ eth_random_addr(config->mac);
+
+ vringh_set_iotlb(&vdpasim->vqs[0].vring, vdpasim->iommu);
+ vringh_set_iotlb(&vdpasim->vqs[1].vring, vdpasim->iommu);
+
+ vdpasim->vdpa.dma_dev = dev;
+ ret = vdpa_register_device(&vdpasim->vdpa);
+ if (ret)
+ goto err_iommu;
+
+ return vdpasim;
+
+err_iommu:
+ put_device(dev);
+err_alloc:
+ return ERR_PTR(ret);
+}
+
+static int vdpasim_set_vq_address(struct vdpa_device *vdpa, u16 idx,
+ u64 desc_area, u64 driver_area,
+ u64 device_area)
+{
+ struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+ struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
+
+ vq->desc_addr = desc_area;
+ vq->driver_addr = driver_area;
+ vq->device_addr = device_area;
+
+ return 0;
+}
+
+static void vdpasim_set_vq_num(struct vdpa_device *vdpa, u16 idx, u32 num)
+{
+ struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+ struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
+
+ vq->num = num;
+}
+
+static void vdpasim_kick_vq(struct vdpa_device *vdpa, u16 idx)
+{
+ struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+ struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
+
+ if (vq->ready)
+ schedule_work(&vdpasim->work);
+}
+
+static void vdpasim_set_vq_cb(struct vdpa_device *vdpa, u16 idx,
+ struct vdpa_callback *cb)
+{
+ struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+ struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
+
+ vq->cb = cb->callback;
+ vq->private = cb->private;
+}
+
+static void vdpasim_set_vq_ready(struct vdpa_device *vdpa, u16 idx, bool ready)
+{
+ struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+ struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
+
+ spin_lock(&vdpasim->lock);
+ vq->ready = ready;
+ if (vq->ready)
+ vdpasim_queue_ready(vdpasim, idx);
+ spin_unlock(&vdpasim->lock);
+}
+
+static bool vdpasim_get_vq_ready(struct vdpa_device *vdpa, u16 idx)
+{
+ struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+ struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
+
+ return vq->ready;
+}
+
+static int vdpasim_set_vq_state(struct vdpa_device *vdpa, u16 idx, u64 state)
+{
+ struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+ struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
+ struct vringh *vrh = &vq->vring;
+
+ spin_lock(&vdpasim->lock);
+ vrh->last_avail_idx = state;
+ spin_unlock(&vdpasim->lock);
+
+ return 0;
+}
+
+static u64 vdpasim_get_vq_state(struct vdpa_device *vdpa, u16 idx)
+{
+ struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+ struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
+ struct vringh *vrh = &vq->vring;
+
+ return vrh->last_avail_idx;
+}
+
+static u16 vdpasim_get_vq_align(struct vdpa_device *vdpa)
+{
+ return VDPASIM_QUEUE_ALIGN;
+}
+
+static u64 vdpasim_get_features(struct vdpa_device *vdpa)
+{
+ return vdpasim_features;
+}
+
+static int vdpasim_set_features(struct vdpa_device *vdpa, u64 features)
+{
+ struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+ /* DMA mapping must be done by driver */
+ if (!(features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)))
+ return -EINVAL;
+
+ vdpasim->features = features & vdpasim_features;
+
+ return 0;
+}
+
+static void vdpasim_set_config_cb(struct vdpa_device *vdpa,
+ struct vdpa_callback *cb)
+{
+ /* We don't support config interrupt */
+}
+
+static u16 vdpasim_get_vq_num_max(struct vdpa_device *vdpa)
+{
+ return VDPASIM_QUEUE_MAX;
+}
+
+static u32 vdpasim_get_device_id(struct vdpa_device *vdpa)
+{
+ return VDPASIM_DEVICE_ID;
+}
+
+static u32 vdpasim_get_vendor_id(struct vdpa_device *vdpa)
+{
+ return VDPASIM_VENDOR_ID;
+}
+
+static u8 vdpasim_get_status(struct vdpa_device *vdpa)
+{
+ struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+ u8 status;
+
+ spin_lock(&vdpasim->lock);
+ status = vdpasim->status;
+ spin_unlock(&vdpasim->lock);
+
+ return vdpasim->status;
+}
+
+static void vdpasim_set_status(struct vdpa_device *vdpa, u8 status)
+{
+ struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+ spin_lock(&vdpasim->lock);
+ vdpasim->status = status;
+ if (status == 0)
+ vdpasim_reset(vdpasim);
+ spin_unlock(&vdpasim->lock);
+}
+
+static void vdpasim_get_config(struct vdpa_device *vdpa, unsigned int offset,
+ void *buf, unsigned int len)
+{
+ struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+ if (offset + len < sizeof(struct virtio_net_config))
+ memcpy(buf, &vdpasim->config + offset, len);
+}
+
+static void vdpasim_set_config(struct vdpa_device *vdpa, unsigned int offset,
+ const void *buf, unsigned int len)
+{
+ /* No writable config supportted by vdpasim */
+}
+
+static u32 vdpasim_get_generation(struct vdpa_device *vdpa)
+{
+ struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+ return vdpasim->generation;
+}
+
+static int vdpasim_set_map(struct vdpa_device *vdpa,
+ struct vhost_iotlb *iotlb)
+{
+ struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+ struct vhost_iotlb_map *map;
+ u64 start = 0ULL, last = 0ULL - 1;
+ int ret;
+
+ vhost_iotlb_reset(vdpasim->iommu);
+
+ for (map = vhost_iotlb_itree_first(iotlb, start, last); map;
+ map = vhost_iotlb_itree_next(map, start, last)) {
+ ret = vhost_iotlb_add_range(vdpasim->iommu, map->start,
+ map->last, map->addr, map->perm);
+ if (ret)
+ goto err;
+ }
+ return 0;
+
+err:
+ vhost_iotlb_reset(vdpasim->iommu);
+ return ret;
+}
+
+static int vdpasim_dma_map(struct vdpa_device *vdpa, u64 iova, u64 size,
+ u64 pa, u32 perm)
+{
+ struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+ return vhost_iotlb_add_range(vdpasim->iommu, iova,
+ iova + size - 1, pa, perm);
+}
+
+static int vdpasim_dma_unmap(struct vdpa_device *vdpa, u64 iova, u64 size)
+{
+ struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+ vhost_iotlb_del_range(vdpasim->iommu, iova, iova + size - 1);
+
+ return 0;
+}
+
+static void vdpasim_free(struct vdpa_device *vdpa)
+{
+ struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+ cancel_work_sync(&vdpasim->work);
+ kfree(vdpasim->buffer);
+ if (vdpasim->iommu)
+ vhost_iotlb_free(vdpasim->iommu);
+}
+
+static const struct vdpa_config_ops vdpasim_net_config_ops = {
+ .set_vq_address = vdpasim_set_vq_address,
+ .set_vq_num = vdpasim_set_vq_num,
+ .kick_vq = vdpasim_kick_vq,
+ .set_vq_cb = vdpasim_set_vq_cb,
+ .set_vq_ready = vdpasim_set_vq_ready,
+ .get_vq_ready = vdpasim_get_vq_ready,
+ .set_vq_state = vdpasim_set_vq_state,
+ .get_vq_state = vdpasim_get_vq_state,
+ .get_vq_align = vdpasim_get_vq_align,
+ .get_features = vdpasim_get_features,
+ .set_features = vdpasim_set_features,
+ .set_config_cb = vdpasim_set_config_cb,
+ .get_vq_num_max = vdpasim_get_vq_num_max,
+ .get_device_id = vdpasim_get_device_id,
+ .get_vendor_id = vdpasim_get_vendor_id,
+ .get_status = vdpasim_get_status,
+ .set_status = vdpasim_set_status,
+ .get_config = vdpasim_get_config,
+ .set_config = vdpasim_set_config,
+ .get_generation = vdpasim_get_generation,
+ .set_map = vdpasim_set_map,
+ .dma_map = vdpasim_dma_map,
+ .dma_unmap = vdpasim_dma_unmap,
+ .free = vdpasim_free,
+};
+
+static int __init vdpasim_dev_init(void)
+{
+ vdpasim_dev = vdpasim_create();
+
+ if (!IS_ERR(vdpasim_dev))
+ return 0;
+
+ return PTR_ERR(vdpasim_dev);
+}
+
+static void __exit vdpasim_dev_exit(void)
+{
+ struct vdpa_device *vdpa = &vdpasim_dev->vdpa;
+
+ vdpa_unregister_device(vdpa);
+}
+
+module_init(vdpasim_dev_init)
+module_exit(vdpasim_dev_exit)
+
+MODULE_VERSION(DRV_VERSION);
+MODULE_LICENSE(DRV_LICENSE);
+MODULE_AUTHOR(DRV_AUTHOR);
+MODULE_DESCRIPTION(DRV_DESC);
diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 3d03ccbd1adc..362b832f5338 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -1,4 +1,29 @@
# SPDX-License-Identifier: GPL-2.0-only
+config VHOST_IOTLB
+ tristate
+ help
+ Generic IOTLB implementation for vhost and vringh.
+
+config VHOST_RING
+ tristate
+ select VHOST_IOTLB
+ help
+ This option is selected by any driver which needs to access
+ the host side of a virtio ring.
+
+config VHOST
+ tristate
+ select VHOST_IOTLB
+ help
+ This option is selected by any driver which needs to access
+ the core of vhost.
+
+menuconfig VHOST_MENU
+ bool "VHOST drivers"
+ default y
+
+if VHOST_MENU
+
config VHOST_NET
tristate "Host kernel accelerator for virtio net"
depends on NET && EVENTFD && (TUN || !TUN) && (TAP || !TAP)
@@ -23,8 +48,8 @@ config VHOST_SCSI
config VHOST_VSOCK
tristate "vhost virtio-vsock driver"
depends on VSOCKETS && EVENTFD
- select VIRTIO_VSOCKETS_COMMON
select VHOST
+ select VIRTIO_VSOCKETS_COMMON
default n
---help---
This kernel module can be loaded in the host kernel to provide AF_VSOCK
@@ -34,11 +59,17 @@ config VHOST_VSOCK
To compile this driver as a module, choose M here: the module will be called
vhost_vsock.
-config VHOST
- tristate
- ---help---
- This option is selected by any driver which needs to access
- the core of vhost.
+config VHOST_VDPA
+ tristate "Vhost driver for vDPA-based backend"
+ depends on EVENTFD
+ select VHOST
+ select VDPA
+ help
+ This kernel module can be loaded in host kernel to accelerate
+ guest virtio devices with the vDPA-based backends.
+
+ To compile this driver as a module, choose M here: the module
+ will be called vhost_vdpa.
config VHOST_CROSS_ENDIAN_LEGACY
bool "Cross-endian support for vhost"
@@ -54,3 +85,5 @@ config VHOST_CROSS_ENDIAN_LEGACY
adds some overhead, it is disabled by default.
If unsure, say "N".
+
+endif
diff --git a/drivers/vhost/Kconfig.vringh b/drivers/vhost/Kconfig.vringh
deleted file mode 100644
index c1fe36a9b8d4..000000000000
--- a/drivers/vhost/Kconfig.vringh
+++ /dev/null
@@ -1,6 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-config VHOST_RING
- tristate
- ---help---
- This option is selected by any driver which needs to access
- the host side of a virtio ring.
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index 6c6df24f770c..f3e1897cce85 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -10,4 +10,10 @@ vhost_vsock-y := vsock.o
obj-$(CONFIG_VHOST_RING) += vringh.o
+obj-$(CONFIG_VHOST_VDPA) += vhost_vdpa.o
+vhost_vdpa-y := vdpa.o
+
obj-$(CONFIG_VHOST) += vhost.o
+
+obj-$(CONFIG_VHOST_IOTLB) += vhost_iotlb.o
+vhost_iotlb-y := iotlb.o
diff --git a/drivers/vhost/iotlb.c b/drivers/vhost/iotlb.c
new file mode 100644
index 000000000000..1f0ca6e44410
--- /dev/null
+++ b/drivers/vhost/iotlb.c
@@ -0,0 +1,177 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2020 Red Hat, Inc.
+ * Author: Jason Wang <jasowang@redhat.com>
+ *
+ * IOTLB implementation for vhost.
+ */
+#include <linux/slab.h>
+#include <linux/vhost_iotlb.h>
+#include <linux/module.h>
+
+#define MOD_VERSION "0.1"
+#define MOD_DESC "VHOST IOTLB"
+#define MOD_AUTHOR "Jason Wang <jasowang@redhat.com>"
+#define MOD_LICENSE "GPL v2"
+
+#define START(map) ((map)->start)
+#define LAST(map) ((map)->last)
+
+INTERVAL_TREE_DEFINE(struct vhost_iotlb_map,
+ rb, __u64, __subtree_last,
+ START, LAST, static inline, vhost_iotlb_itree);
+
+/**
+ * vhost_iotlb_map_free - remove a map node and free it
+ * @iotlb: the IOTLB
+ * @map: the map that want to be remove and freed
+ */
+void vhost_iotlb_map_free(struct vhost_iotlb *iotlb,
+ struct vhost_iotlb_map *map)
+{
+ vhost_iotlb_itree_remove(map, &iotlb->root);
+ list_del(&map->link);
+ kfree(map);
+ iotlb->nmaps--;
+}
+EXPORT_SYMBOL_GPL(vhost_iotlb_map_free);
+
+/**
+ * vhost_iotlb_add_range - add a new range to vhost IOTLB
+ * @iotlb: the IOTLB
+ * @start: start of the IOVA range
+ * @last: last of IOVA range
+ * @addr: the address that is mapped to @start
+ * @perm: access permission of this range
+ *
+ * Returns an error last is smaller than start or memory allocation
+ * fails
+ */
+int vhost_iotlb_add_range(struct vhost_iotlb *iotlb,
+ u64 start, u64 last,
+ u64 addr, unsigned int perm)
+{
+ struct vhost_iotlb_map *map;
+
+ if (last < start)
+ return -EFAULT;
+
+ if (iotlb->limit &&
+ iotlb->nmaps == iotlb->limit &&
+ iotlb->flags & VHOST_IOTLB_FLAG_RETIRE) {
+ map = list_first_entry(&iotlb->list, typeof(*map), link);
+ vhost_iotlb_map_free(iotlb, map);
+ }
+
+ map = kmalloc(sizeof(*map), GFP_ATOMIC);
+ if (!map)
+ return -ENOMEM;
+
+ map->start = start;
+ map->size = last - start + 1;
+ map->last = last;
+ map->addr = addr;
+ map->perm = perm;
+
+ iotlb->nmaps++;
+ vhost_iotlb_itree_insert(map, &iotlb->root);
+
+ INIT_LIST_HEAD(&map->link);
+ list_add_tail(&map->link, &iotlb->list);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(vhost_iotlb_add_range);
+
+/**
+ * vring_iotlb_del_range - delete overlapped ranges from vhost IOTLB
+ * @iotlb: the IOTLB
+ * @start: start of the IOVA range
+ * @last: last of IOVA range
+ */
+void vhost_iotlb_del_range(struct vhost_iotlb *iotlb, u64 start, u64 last)
+{
+ struct vhost_iotlb_map *map;
+
+ while ((map = vhost_iotlb_itree_iter_first(&iotlb->root,
+ start, last)))
+ vhost_iotlb_map_free(iotlb, map);
+}
+EXPORT_SYMBOL_GPL(vhost_iotlb_del_range);
+
+/**
+ * vhost_iotlb_alloc - add a new vhost IOTLB
+ * @limit: maximum number of IOTLB entries
+ * @flags: VHOST_IOTLB_FLAG_XXX
+ *
+ * Returns an error is memory allocation fails
+ */
+struct vhost_iotlb *vhost_iotlb_alloc(unsigned int limit, unsigned int flags)
+{
+ struct vhost_iotlb *iotlb = kzalloc(sizeof(*iotlb), GFP_KERNEL);
+
+ if (!iotlb)
+ return NULL;
+
+ iotlb->root = RB_ROOT_CACHED;
+ iotlb->limit = limit;
+ iotlb->nmaps = 0;
+ iotlb->flags = flags;
+ INIT_LIST_HEAD(&iotlb->list);
+
+ return iotlb;
+}
+EXPORT_SYMBOL_GPL(vhost_iotlb_alloc);
+
+/**
+ * vhost_iotlb_reset - reset vhost IOTLB (free all IOTLB entries)
+ * @iotlb: the IOTLB to be reset
+ */
+void vhost_iotlb_reset(struct vhost_iotlb *iotlb)
+{
+ vhost_iotlb_del_range(iotlb, 0ULL, 0ULL - 1);
+}
+EXPORT_SYMBOL_GPL(vhost_iotlb_reset);
+
+/**
+ * vhost_iotlb_free - reset and free vhost IOTLB
+ * @iotlb: the IOTLB to be freed
+ */
+void vhost_iotlb_free(struct vhost_iotlb *iotlb)
+{
+ if (iotlb) {
+ vhost_iotlb_reset(iotlb);
+ kfree(iotlb);
+ }
+}
+EXPORT_SYMBOL_GPL(vhost_iotlb_free);
+
+/**
+ * vhost_iotlb_itree_first - return the first overlapped range
+ * @iotlb: the IOTLB
+ * @start: start of IOVA range
+ * @end: end of IOVA range
+ */
+struct vhost_iotlb_map *
+vhost_iotlb_itree_first(struct vhost_iotlb *iotlb, u64 start, u64 last)
+{
+ return vhost_iotlb_itree_iter_first(&iotlb->root, start, last);
+}
+EXPORT_SYMBOL_GPL(vhost_iotlb_itree_first);
+
+/**
+ * vhost_iotlb_itree_first - return the next overlapped range
+ * @iotlb: the IOTLB
+ * @start: start of IOVA range
+ * @end: end of IOVA range
+ */
+struct vhost_iotlb_map *
+vhost_iotlb_itree_next(struct vhost_iotlb_map *map, u64 start, u64 last)
+{
+ return vhost_iotlb_itree_iter_next(map, start, last);
+}
+EXPORT_SYMBOL_GPL(vhost_iotlb_itree_next);
+
+MODULE_VERSION(MOD_VERSION);
+MODULE_DESCRIPTION(MOD_DESC);
+MODULE_AUTHOR(MOD_AUTHOR);
+MODULE_LICENSE(MOD_LICENSE);
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 18e205eeb9af..87469d67ede8 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1324,7 +1324,8 @@ static int vhost_net_open(struct inode *inode, struct file *f)
}
vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX,
UIO_MAXIOV + VHOST_NET_BATCH,
- VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT);
+ VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT,
+ NULL);
vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev);
vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev);
@@ -1586,7 +1587,7 @@ static long vhost_net_reset_owner(struct vhost_net *n)
struct socket *tx_sock = NULL;
struct socket *rx_sock = NULL;
long err;
- struct vhost_umem *umem;
+ struct vhost_iotlb *umem;
mutex_lock(&n->dev.mutex);
err = vhost_dev_check_owner(&n->dev);
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 0b949a14bce3..7653667a8cdc 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -1628,7 +1628,7 @@ static int vhost_scsi_open(struct inode *inode, struct file *f)
vs->vqs[i].vq.handle_kick = vhost_scsi_handle_kick;
}
vhost_dev_init(&vs->dev, vqs, VHOST_SCSI_MAX_VQ, UIO_MAXIOV,
- VHOST_SCSI_WEIGHT, 0);
+ VHOST_SCSI_WEIGHT, 0, NULL);
vhost_scsi_init_inflight(vs, NULL);
diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
new file mode 100644
index 000000000000..421f02a8530a
--- /dev/null
+++ b/drivers/vhost/vdpa.c
@@ -0,0 +1,883 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2018-2020 Intel Corporation.
+ * Copyright (C) 2020 Red Hat, Inc.
+ *
+ * Author: Tiwei Bie <tiwei.bie@intel.com>
+ * Jason Wang <jasowang@redhat.com>
+ *
+ * Thanks Michael S. Tsirkin for the valuable comments and
+ * suggestions. And thanks to Cunming Liang and Zhihong Wang for all
+ * their supports.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/iommu.h>
+#include <linux/uuid.h>
+#include <linux/vdpa.h>
+#include <linux/nospec.h>
+#include <linux/vhost.h>
+#include <linux/virtio_net.h>
+
+#include "vhost.h"
+
+enum {
+ VHOST_VDPA_FEATURES =
+ (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) |
+ (1ULL << VIRTIO_F_ANY_LAYOUT) |
+ (1ULL << VIRTIO_F_VERSION_1) |
+ (1ULL << VIRTIO_F_IOMMU_PLATFORM) |
+ (1ULL << VIRTIO_F_RING_PACKED) |
+ (1ULL << VIRTIO_F_ORDER_PLATFORM) |
+ (1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
+ (1ULL << VIRTIO_RING_F_EVENT_IDX),
+
+ VHOST_VDPA_NET_FEATURES = VHOST_VDPA_FEATURES |
+ (1ULL << VIRTIO_NET_F_CSUM) |
+ (1ULL << VIRTIO_NET_F_GUEST_CSUM) |
+ (1ULL << VIRTIO_NET_F_MTU) |
+ (1ULL << VIRTIO_NET_F_MAC) |
+ (1ULL << VIRTIO_NET_F_GUEST_TSO4) |
+ (1ULL << VIRTIO_NET_F_GUEST_TSO6) |
+ (1ULL << VIRTIO_NET_F_GUEST_ECN) |
+ (1ULL << VIRTIO_NET_F_GUEST_UFO) |
+ (1ULL << VIRTIO_NET_F_HOST_TSO4) |
+ (1ULL << VIRTIO_NET_F_HOST_TSO6) |
+ (1ULL << VIRTIO_NET_F_HOST_ECN) |
+ (1ULL << VIRTIO_NET_F_HOST_UFO) |
+ (1ULL << VIRTIO_NET_F_MRG_RXBUF) |
+ (1ULL << VIRTIO_NET_F_STATUS) |
+ (1ULL << VIRTIO_NET_F_SPEED_DUPLEX),
+};
+
+/* Currently, only network backend w/o multiqueue is supported. */
+#define VHOST_VDPA_VQ_MAX 2
+
+#define VHOST_VDPA_DEV_MAX (1U << MINORBITS)
+
+struct vhost_vdpa {
+ struct vhost_dev vdev;
+ struct iommu_domain *domain;
+ struct vhost_virtqueue *vqs;
+ struct completion completion;
+ struct vdpa_device *vdpa;
+ struct device dev;
+ struct cdev cdev;
+ atomic_t opened;
+ int nvqs;
+ int virtio_id;
+ int minor;
+};
+
+static DEFINE_IDA(vhost_vdpa_ida);
+
+static dev_t vhost_vdpa_major;
+
+static const u64 vhost_vdpa_features[] = {
+ [VIRTIO_ID_NET] = VHOST_VDPA_NET_FEATURES,
+};
+
+static void handle_vq_kick(struct vhost_work *work)
+{
+ struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
+ poll.work);
+ struct vhost_vdpa *v = container_of(vq->dev, struct vhost_vdpa, vdev);
+ const struct vdpa_config_ops *ops = v->vdpa->config;
+
+ ops->kick_vq(v->vdpa, vq - v->vqs);
+}
+
+static irqreturn_t vhost_vdpa_virtqueue_cb(void *private)
+{
+ struct vhost_virtqueue *vq = private;
+ struct eventfd_ctx *call_ctx = vq->call_ctx;
+
+ if (call_ctx)
+ eventfd_signal(call_ctx, 1);
+
+ return IRQ_HANDLED;
+}
+
+static void vhost_vdpa_reset(struct vhost_vdpa *v)
+{
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+
+ ops->set_status(vdpa, 0);
+}
+
+static long vhost_vdpa_get_device_id(struct vhost_vdpa *v, u8 __user *argp)
+{
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+ u32 device_id;
+
+ device_id = ops->get_device_id(vdpa);
+
+ if (copy_to_user(argp, &device_id, sizeof(device_id)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static long vhost_vdpa_get_status(struct vhost_vdpa *v, u8 __user *statusp)
+{
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+ u8 status;
+
+ status = ops->get_status(vdpa);
+
+ if (copy_to_user(statusp, &status, sizeof(status)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static long vhost_vdpa_set_status(struct vhost_vdpa *v, u8 __user *statusp)
+{
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+ u8 status;
+
+ if (copy_from_user(&status, statusp, sizeof(status)))
+ return -EFAULT;
+
+ /*
+ * Userspace shouldn't remove status bits unless reset the
+ * status to 0.
+ */
+ if (status != 0 && (ops->get_status(vdpa) & ~status) != 0)
+ return -EINVAL;
+
+ ops->set_status(vdpa, status);
+
+ return 0;
+}
+
+static int vhost_vdpa_config_validate(struct vhost_vdpa *v,
+ struct vhost_vdpa_config *c)
+{
+ long size = 0;
+
+ switch (v->virtio_id) {
+ case VIRTIO_ID_NET:
+ size = sizeof(struct virtio_net_config);
+ break;
+ }
+
+ if (c->len == 0)
+ return -EINVAL;
+
+ if (c->len > size - c->off)
+ return -E2BIG;
+
+ return 0;
+}
+
+static long vhost_vdpa_get_config(struct vhost_vdpa *v,
+ struct vhost_vdpa_config __user *c)
+{
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+ struct vhost_vdpa_config config;
+ unsigned long size = offsetof(struct vhost_vdpa_config, buf);
+ u8 *buf;
+
+ if (copy_from_user(&config, c, size))
+ return -EFAULT;
+ if (vhost_vdpa_config_validate(v, &config))
+ return -EINVAL;
+ buf = kvzalloc(config.len, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ ops->get_config(vdpa, config.off, buf, config.len);
+
+ if (copy_to_user(c->buf, buf, config.len)) {
+ kvfree(buf);
+ return -EFAULT;
+ }
+
+ kvfree(buf);
+ return 0;
+}
+
+static long vhost_vdpa_set_config(struct vhost_vdpa *v,
+ struct vhost_vdpa_config __user *c)
+{
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+ struct vhost_vdpa_config config;
+ unsigned long size = offsetof(struct vhost_vdpa_config, buf);
+ u8 *buf;
+
+ if (copy_from_user(&config, c, size))
+ return -EFAULT;
+ if (vhost_vdpa_config_validate(v, &config))
+ return -EINVAL;
+ buf = kvzalloc(config.len, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ if (copy_from_user(buf, c->buf, config.len)) {
+ kvfree(buf);
+ return -EFAULT;
+ }
+
+ ops->set_config(vdpa, config.off, buf, config.len);
+
+ kvfree(buf);
+ return 0;
+}
+
+static long vhost_vdpa_get_features(struct vhost_vdpa *v, u64 __user *featurep)
+{
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+ u64 features;
+
+ features = ops->get_features(vdpa);
+ features &= vhost_vdpa_features[v->virtio_id];
+
+ if (copy_to_user(featurep, &features, sizeof(features)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static long vhost_vdpa_set_features(struct vhost_vdpa *v, u64 __user *featurep)
+{
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+ u64 features;
+
+ /*
+ * It's not allowed to change the features after they have
+ * been negotiated.
+ */
+ if (ops->get_status(vdpa) & VIRTIO_CONFIG_S_FEATURES_OK)
+ return -EBUSY;
+
+ if (copy_from_user(&features, featurep, sizeof(features)))
+ return -EFAULT;
+
+ if (features & ~vhost_vdpa_features[v->virtio_id])
+ return -EINVAL;
+
+ if (ops->set_features(vdpa, features))
+ return -EINVAL;
+
+ return 0;
+}
+
+static long vhost_vdpa_get_vring_num(struct vhost_vdpa *v, u16 __user *argp)
+{
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+ u16 num;
+
+ num = ops->get_vq_num_max(vdpa);
+
+ if (copy_to_user(argp, &num, sizeof(num)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
+ void __user *argp)
+{
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+ struct vdpa_callback cb;
+ struct vhost_virtqueue *vq;
+ struct vhost_vring_state s;
+ u8 status;
+ u32 idx;
+ long r;
+
+ r = get_user(idx, (u32 __user *)argp);
+ if (r < 0)
+ return r;
+
+ if (idx >= v->nvqs)
+ return -ENOBUFS;
+
+ idx = array_index_nospec(idx, v->nvqs);
+ vq = &v->vqs[idx];
+
+ status = ops->get_status(vdpa);
+
+ if (cmd == VHOST_VDPA_SET_VRING_ENABLE) {
+ if (copy_from_user(&s, argp, sizeof(s)))
+ return -EFAULT;
+ ops->set_vq_ready(vdpa, idx, s.num);
+ return 0;
+ }
+
+ if (cmd == VHOST_GET_VRING_BASE)
+ vq->last_avail_idx = ops->get_vq_state(v->vdpa, idx);
+
+ r = vhost_vring_ioctl(&v->vdev, cmd, argp);
+ if (r)
+ return r;
+
+ switch (cmd) {
+ case VHOST_SET_VRING_ADDR:
+ if (ops->set_vq_address(vdpa, idx,
+ (u64)(uintptr_t)vq->desc,
+ (u64)(uintptr_t)vq->avail,
+ (u64)(uintptr_t)vq->used))
+ r = -EINVAL;
+ break;
+
+ case VHOST_SET_VRING_BASE:
+ if (ops->set_vq_state(vdpa, idx, vq->last_avail_idx))
+ r = -EINVAL;
+ break;
+
+ case VHOST_SET_VRING_CALL:
+ if (vq->call_ctx) {
+ cb.callback = vhost_vdpa_virtqueue_cb;
+ cb.private = vq;
+ } else {
+ cb.callback = NULL;
+ cb.private = NULL;
+ }
+ ops->set_vq_cb(vdpa, idx, &cb);
+ break;
+
+ case VHOST_SET_VRING_NUM:
+ ops->set_vq_num(vdpa, idx, vq->num);
+ break;
+ }
+
+ return r;
+}
+
+static long vhost_vdpa_unlocked_ioctl(struct file *filep,
+ unsigned int cmd, unsigned long arg)
+{
+ struct vhost_vdpa *v = filep->private_data;
+ struct vhost_dev *d = &v->vdev;
+ void __user *argp = (void __user *)arg;
+ long r;
+
+ mutex_lock(&d->mutex);
+
+ switch (cmd) {
+ case VHOST_VDPA_GET_DEVICE_ID:
+ r = vhost_vdpa_get_device_id(v, argp);
+ break;
+ case VHOST_VDPA_GET_STATUS:
+ r = vhost_vdpa_get_status(v, argp);
+ break;
+ case VHOST_VDPA_SET_STATUS:
+ r = vhost_vdpa_set_status(v, argp);
+ break;
+ case VHOST_VDPA_GET_CONFIG:
+ r = vhost_vdpa_get_config(v, argp);
+ break;
+ case VHOST_VDPA_SET_CONFIG:
+ r = vhost_vdpa_set_config(v, argp);
+ break;
+ case VHOST_GET_FEATURES:
+ r = vhost_vdpa_get_features(v, argp);
+ break;
+ case VHOST_SET_FEATURES:
+ r = vhost_vdpa_set_features(v, argp);
+ break;
+ case VHOST_VDPA_GET_VRING_NUM:
+ r = vhost_vdpa_get_vring_num(v, argp);
+ break;
+ case VHOST_SET_LOG_BASE:
+ case VHOST_SET_LOG_FD:
+ r = -ENOIOCTLCMD;
+ break;
+ default:
+ r = vhost_dev_ioctl(&v->vdev, cmd, argp);
+ if (r == -ENOIOCTLCMD)
+ r = vhost_vdpa_vring_ioctl(v, cmd, argp);
+ break;
+ }
+
+ mutex_unlock(&d->mutex);
+ return r;
+}
+
+static void vhost_vdpa_iotlb_unmap(struct vhost_vdpa *v, u64 start, u64 last)
+{
+ struct vhost_dev *dev = &v->vdev;
+ struct vhost_iotlb *iotlb = dev->iotlb;
+ struct vhost_iotlb_map *map;
+ struct page *page;
+ unsigned long pfn, pinned;
+
+ while ((map = vhost_iotlb_itree_first(iotlb, start, last)) != NULL) {
+ pinned = map->size >> PAGE_SHIFT;
+ for (pfn = map->addr >> PAGE_SHIFT;
+ pinned > 0; pfn++, pinned--) {
+ page = pfn_to_page(pfn);
+ if (map->perm & VHOST_ACCESS_WO)
+ set_page_dirty_lock(page);
+ unpin_user_page(page);
+ }
+ atomic64_sub(map->size >> PAGE_SHIFT, &dev->mm->pinned_vm);
+ vhost_iotlb_map_free(iotlb, map);
+ }
+}
+
+static void vhost_vdpa_iotlb_free(struct vhost_vdpa *v)
+{
+ struct vhost_dev *dev = &v->vdev;
+
+ vhost_vdpa_iotlb_unmap(v, 0ULL, 0ULL - 1);
+ kfree(dev->iotlb);
+ dev->iotlb = NULL;
+}
+
+static int perm_to_iommu_flags(u32 perm)
+{
+ int flags = 0;
+
+ switch (perm) {
+ case VHOST_ACCESS_WO:
+ flags |= IOMMU_WRITE;
+ break;
+ case VHOST_ACCESS_RO:
+ flags |= IOMMU_READ;
+ break;
+ case VHOST_ACCESS_RW:
+ flags |= (IOMMU_WRITE | IOMMU_READ);
+ break;
+ default:
+ WARN(1, "invalidate vhost IOTLB permission\n");
+ break;
+ }
+
+ return flags | IOMMU_CACHE;
+}
+
+static int vhost_vdpa_map(struct vhost_vdpa *v,
+ u64 iova, u64 size, u64 pa, u32 perm)
+{
+ struct vhost_dev *dev = &v->vdev;
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+ int r = 0;
+
+ r = vhost_iotlb_add_range(dev->iotlb, iova, iova + size - 1,
+ pa, perm);
+ if (r)
+ return r;
+
+ if (ops->dma_map)
+ r = ops->dma_map(vdpa, iova, size, pa, perm);
+ else if (ops->set_map)
+ r = ops->set_map(vdpa, dev->iotlb);
+ else
+ r = iommu_map(v->domain, iova, pa, size,
+ perm_to_iommu_flags(perm));
+
+ return r;
+}
+
+static void vhost_vdpa_unmap(struct vhost_vdpa *v, u64 iova, u64 size)
+{
+ struct vhost_dev *dev = &v->vdev;
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+
+ vhost_vdpa_iotlb_unmap(v, iova, iova + size - 1);
+
+ if (ops->dma_map)
+ ops->dma_unmap(vdpa, iova, size);
+ else if (ops->set_map)
+ ops->set_map(vdpa, dev->iotlb);
+ else
+ iommu_unmap(v->domain, iova, size);
+}
+
+static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,
+ struct vhost_iotlb_msg *msg)
+{
+ struct vhost_dev *dev = &v->vdev;
+ struct vhost_iotlb *iotlb = dev->iotlb;
+ struct page **page_list;
+ unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
+ unsigned int gup_flags = FOLL_LONGTERM;
+ unsigned long npages, cur_base, map_pfn, last_pfn = 0;
+ unsigned long locked, lock_limit, pinned, i;
+ u64 iova = msg->iova;
+ int ret = 0;
+
+ if (vhost_iotlb_itree_first(iotlb, msg->iova,
+ msg->iova + msg->size - 1))
+ return -EEXIST;
+
+ page_list = (struct page **) __get_free_page(GFP_KERNEL);
+ if (!page_list)
+ return -ENOMEM;
+
+ if (msg->perm & VHOST_ACCESS_WO)
+ gup_flags |= FOLL_WRITE;
+
+ npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> PAGE_SHIFT;
+ if (!npages)
+ return -EINVAL;
+
+ down_read(&dev->mm->mmap_sem);
+
+ locked = atomic64_add_return(npages, &dev->mm->pinned_vm);
+ lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+ if (locked > lock_limit) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ cur_base = msg->uaddr & PAGE_MASK;
+ iova &= PAGE_MASK;
+
+ while (npages) {
+ pinned = min_t(unsigned long, npages, list_size);
+ ret = pin_user_pages(cur_base, pinned,
+ gup_flags, page_list, NULL);
+ if (ret != pinned)
+ goto out;
+
+ if (!last_pfn)
+ map_pfn = page_to_pfn(page_list[0]);
+
+ for (i = 0; i < ret; i++) {
+ unsigned long this_pfn = page_to_pfn(page_list[i]);
+ u64 csize;
+
+ if (last_pfn && (this_pfn != last_pfn + 1)) {
+ /* Pin a contiguous chunk of memory */
+ csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
+ if (vhost_vdpa_map(v, iova, csize,
+ map_pfn << PAGE_SHIFT,
+ msg->perm))
+ goto out;
+ map_pfn = this_pfn;
+ iova += csize;
+ }
+
+ last_pfn = this_pfn;
+ }
+
+ cur_base += ret << PAGE_SHIFT;
+ npages -= ret;
+ }
+
+ /* Pin the rest chunk */
+ ret = vhost_vdpa_map(v, iova, (last_pfn - map_pfn + 1) << PAGE_SHIFT,
+ map_pfn << PAGE_SHIFT, msg->perm);
+out:
+ if (ret) {
+ vhost_vdpa_unmap(v, msg->iova, msg->size);
+ atomic64_sub(npages, &dev->mm->pinned_vm);
+ }
+ up_read(&dev->mm->mmap_sem);
+ free_page((unsigned long)page_list);
+ return ret;
+}
+
+static int vhost_vdpa_process_iotlb_msg(struct vhost_dev *dev,
+ struct vhost_iotlb_msg *msg)
+{
+ struct vhost_vdpa *v = container_of(dev, struct vhost_vdpa, vdev);
+ int r = 0;
+
+ r = vhost_dev_check_owner(dev);
+ if (r)
+ return r;
+
+ switch (msg->type) {
+ case VHOST_IOTLB_UPDATE:
+ r = vhost_vdpa_process_iotlb_update(v, msg);
+ break;
+ case VHOST_IOTLB_INVALIDATE:
+ vhost_vdpa_unmap(v, msg->iova, msg->size);
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+
+ return r;
+}
+
+static ssize_t vhost_vdpa_chr_write_iter(struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct vhost_vdpa *v = file->private_data;
+ struct vhost_dev *dev = &v->vdev;
+
+ return vhost_chr_write_iter(dev, from);
+}
+
+static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v)
+{
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+ struct device *dma_dev = vdpa_get_dma_dev(vdpa);
+ struct bus_type *bus;
+ int ret;
+
+ /* Device want to do DMA by itself */
+ if (ops->set_map || ops->dma_map)
+ return 0;
+
+ bus = dma_dev->bus;
+ if (!bus)
+ return -EFAULT;
+
+ if (!iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY))
+ return -ENOTSUPP;
+
+ v->domain = iommu_domain_alloc(bus);
+ if (!v->domain)
+ return -EIO;
+
+ ret = iommu_attach_device(v->domain, dma_dev);
+ if (ret)
+ goto err_attach;
+
+ return 0;
+
+err_attach:
+ iommu_domain_free(v->domain);
+ return ret;
+}
+
+static void vhost_vdpa_free_domain(struct vhost_vdpa *v)
+{
+ struct vdpa_device *vdpa = v->vdpa;
+ struct device *dma_dev = vdpa_get_dma_dev(vdpa);
+
+ if (v->domain) {
+ iommu_detach_device(v->domain, dma_dev);
+ iommu_domain_free(v->domain);
+ }
+
+ v->domain = NULL;
+}
+
+static int vhost_vdpa_open(struct inode *inode, struct file *filep)
+{
+ struct vhost_vdpa *v;
+ struct vhost_dev *dev;
+ struct vhost_virtqueue **vqs;
+ int nvqs, i, r, opened;
+
+ v = container_of(inode->i_cdev, struct vhost_vdpa, cdev);
+ if (!v)
+ return -ENODEV;
+
+ opened = atomic_cmpxchg(&v->opened, 0, 1);
+ if (opened)
+ return -EBUSY;
+
+ nvqs = v->nvqs;
+ vhost_vdpa_reset(v);
+
+ vqs = kmalloc_array(nvqs, sizeof(*vqs), GFP_KERNEL);
+ if (!vqs) {
+ r = -ENOMEM;
+ goto err;
+ }
+
+ dev = &v->vdev;
+ for (i = 0; i < nvqs; i++) {
+ vqs[i] = &v->vqs[i];
+ vqs[i]->handle_kick = handle_vq_kick;
+ }
+ vhost_dev_init(dev, vqs, nvqs, 0, 0, 0,
+ vhost_vdpa_process_iotlb_msg);
+
+ dev->iotlb = vhost_iotlb_alloc(0, 0);
+ if (!dev->iotlb) {
+ r = -ENOMEM;
+ goto err_init_iotlb;
+ }
+
+ r = vhost_vdpa_alloc_domain(v);
+ if (r)
+ goto err_init_iotlb;
+
+ filep->private_data = v;
+
+ return 0;
+
+err_init_iotlb:
+ vhost_dev_cleanup(&v->vdev);
+err:
+ atomic_dec(&v->opened);
+ return r;
+}
+
+static int vhost_vdpa_release(struct inode *inode, struct file *filep)
+{
+ struct vhost_vdpa *v = filep->private_data;
+ struct vhost_dev *d = &v->vdev;
+
+ mutex_lock(&d->mutex);
+ filep->private_data = NULL;
+ vhost_vdpa_reset(v);
+ vhost_dev_stop(&v->vdev);
+ vhost_vdpa_iotlb_free(v);
+ vhost_vdpa_free_domain(v);
+ vhost_dev_cleanup(&v->vdev);
+ kfree(v->vdev.vqs);
+ mutex_unlock(&d->mutex);
+
+ atomic_dec(&v->opened);
+ complete(&v->completion);
+
+ return 0;
+}
+
+static const struct file_operations vhost_vdpa_fops = {
+ .owner = THIS_MODULE,
+ .open = vhost_vdpa_open,
+ .release = vhost_vdpa_release,
+ .write_iter = vhost_vdpa_chr_write_iter,
+ .unlocked_ioctl = vhost_vdpa_unlocked_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
+};
+
+static void vhost_vdpa_release_dev(struct device *device)
+{
+ struct vhost_vdpa *v =
+ container_of(device, struct vhost_vdpa, dev);
+
+ ida_simple_remove(&vhost_vdpa_ida, v->minor);
+ kfree(v->vqs);
+ kfree(v);
+}
+
+static int vhost_vdpa_probe(struct vdpa_device *vdpa)
+{
+ const struct vdpa_config_ops *ops = vdpa->config;
+ struct vhost_vdpa *v;
+ int minor, nvqs = VHOST_VDPA_VQ_MAX;
+ int r;
+
+ /* Currently, we only accept the network devices. */
+ if (ops->get_device_id(vdpa) != VIRTIO_ID_NET)
+ return -ENOTSUPP;
+
+ v = kzalloc(sizeof(*v), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ if (!v)
+ return -ENOMEM;
+
+ minor = ida_simple_get(&vhost_vdpa_ida, 0,
+ VHOST_VDPA_DEV_MAX, GFP_KERNEL);
+ if (minor < 0) {
+ kfree(v);
+ return minor;
+ }
+
+ atomic_set(&v->opened, 0);
+ v->minor = minor;
+ v->vdpa = vdpa;
+ v->nvqs = nvqs;
+ v->virtio_id = ops->get_device_id(vdpa);
+
+ device_initialize(&v->dev);
+ v->dev.release = vhost_vdpa_release_dev;
+ v->dev.parent = &vdpa->dev;
+ v->dev.devt = MKDEV(MAJOR(vhost_vdpa_major), minor);
+ v->vqs = kmalloc_array(nvqs, sizeof(struct vhost_virtqueue),
+ GFP_KERNEL);
+ if (!v->vqs) {
+ r = -ENOMEM;
+ goto err;
+ }
+
+ r = dev_set_name(&v->dev, "vhost-vdpa-%u", minor);
+ if (r)
+ goto err;
+
+ cdev_init(&v->cdev, &vhost_vdpa_fops);
+ v->cdev.owner = THIS_MODULE;
+
+ r = cdev_device_add(&v->cdev, &v->dev);
+ if (r)
+ goto err;
+
+ init_completion(&v->completion);
+ vdpa_set_drvdata(vdpa, v);
+
+ return 0;
+
+err:
+ put_device(&v->dev);
+ return r;
+}
+
+static void vhost_vdpa_remove(struct vdpa_device *vdpa)
+{
+ struct vhost_vdpa *v = vdpa_get_drvdata(vdpa);
+ int opened;
+
+ cdev_device_del(&v->cdev, &v->dev);
+
+ do {
+ opened = atomic_cmpxchg(&v->opened, 0, 1);
+ if (!opened)
+ break;
+ wait_for_completion(&v->completion);
+ } while (1);
+
+ put_device(&v->dev);
+}
+
+static struct vdpa_driver vhost_vdpa_driver = {
+ .driver = {
+ .name = "vhost_vdpa",
+ },
+ .probe = vhost_vdpa_probe,
+ .remove = vhost_vdpa_remove,
+};
+
+static int __init vhost_vdpa_init(void)
+{
+ int r;
+
+ r = alloc_chrdev_region(&vhost_vdpa_major, 0, VHOST_VDPA_DEV_MAX,
+ "vhost-vdpa");
+ if (r)
+ goto err_alloc_chrdev;
+
+ r = vdpa_register_driver(&vhost_vdpa_driver);
+ if (r)
+ goto err_vdpa_register_driver;
+
+ return 0;
+
+err_vdpa_register_driver:
+ unregister_chrdev_region(vhost_vdpa_major, VHOST_VDPA_DEV_MAX);
+err_alloc_chrdev:
+ return r;
+}
+module_init(vhost_vdpa_init);
+
+static void __exit vhost_vdpa_exit(void)
+{
+ vdpa_unregister_driver(&vhost_vdpa_driver);
+ unregister_chrdev_region(vhost_vdpa_major, VHOST_VDPA_DEV_MAX);
+}
+module_exit(vhost_vdpa_exit);
+
+MODULE_VERSION("0.0.1");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Intel Corporation");
+MODULE_DESCRIPTION("vDPA-based vhost backend for virtio");
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index f44340b41494..d450e16c5c25 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -50,10 +50,6 @@ enum {
#define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
#define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
-INTERVAL_TREE_DEFINE(struct vhost_umem_node,
- rb, __u64, __subtree_last,
- START, LAST, static inline, vhost_umem_interval_tree);
-
#ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
{
@@ -457,7 +453,9 @@ static size_t vhost_get_desc_size(struct vhost_virtqueue *vq,
void vhost_dev_init(struct vhost_dev *dev,
struct vhost_virtqueue **vqs, int nvqs,
- int iov_limit, int weight, int byte_weight)
+ int iov_limit, int weight, int byte_weight,
+ int (*msg_handler)(struct vhost_dev *dev,
+ struct vhost_iotlb_msg *msg))
{
struct vhost_virtqueue *vq;
int i;
@@ -473,6 +471,7 @@ void vhost_dev_init(struct vhost_dev *dev,
dev->iov_limit = iov_limit;
dev->weight = weight;
dev->byte_weight = byte_weight;
+ dev->msg_handler = msg_handler;
init_llist_head(&dev->work_list);
init_waitqueue_head(&dev->wait);
INIT_LIST_HEAD(&dev->read_list);
@@ -581,21 +580,25 @@ err_mm:
}
EXPORT_SYMBOL_GPL(vhost_dev_set_owner);
-struct vhost_umem *vhost_dev_reset_owner_prepare(void)
+static struct vhost_iotlb *iotlb_alloc(void)
+{
+ return vhost_iotlb_alloc(max_iotlb_entries,
+ VHOST_IOTLB_FLAG_RETIRE);
+}
+
+struct vhost_iotlb *vhost_dev_reset_owner_prepare(void)
{
- return kvzalloc(sizeof(struct vhost_umem), GFP_KERNEL);
+ return iotlb_alloc();
}
EXPORT_SYMBOL_GPL(vhost_dev_reset_owner_prepare);
/* Caller should have device mutex */
-void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_umem *umem)
+void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_iotlb *umem)
{
int i;
vhost_dev_cleanup(dev);
- /* Restore memory to default empty mapping. */
- INIT_LIST_HEAD(&umem->umem_list);
dev->umem = umem;
/* We don't need VQ locks below since vhost_dev_cleanup makes sure
* VQs aren't running.
@@ -618,28 +621,6 @@ void vhost_dev_stop(struct vhost_dev *dev)
}
EXPORT_SYMBOL_GPL(vhost_dev_stop);
-static void vhost_umem_free(struct vhost_umem *umem,
- struct vhost_umem_node *node)
-{
- vhost_umem_interval_tree_remove(node, &umem->umem_tree);
- list_del(&node->link);
- kfree(node);
- umem->numem--;
-}
-
-static void vhost_umem_clean(struct vhost_umem *umem)
-{
- struct vhost_umem_node *node, *tmp;
-
- if (!umem)
- return;
-
- list_for_each_entry_safe(node, tmp, &umem->umem_list, link)
- vhost_umem_free(umem, node);
-
- kvfree(umem);
-}
-
static void vhost_clear_msg(struct vhost_dev *dev)
{
struct vhost_msg_node *node, *n;
@@ -677,9 +658,9 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
eventfd_ctx_put(dev->log_ctx);
dev->log_ctx = NULL;
/* No one will access memory at this point */
- vhost_umem_clean(dev->umem);
+ vhost_iotlb_free(dev->umem);
dev->umem = NULL;
- vhost_umem_clean(dev->iotlb);
+ vhost_iotlb_free(dev->iotlb);
dev->iotlb = NULL;
vhost_clear_msg(dev);
wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
@@ -715,27 +696,26 @@ static bool vhost_overflow(u64 uaddr, u64 size)
}
/* Caller should have vq mutex and device mutex. */
-static bool vq_memory_access_ok(void __user *log_base, struct vhost_umem *umem,
+static bool vq_memory_access_ok(void __user *log_base, struct vhost_iotlb *umem,
int log_all)
{
- struct vhost_umem_node *node;
+ struct vhost_iotlb_map *map;
if (!umem)
return false;
- list_for_each_entry(node, &umem->umem_list, link) {
- unsigned long a = node->userspace_addr;
+ list_for_each_entry(map, &umem->list, link) {
+ unsigned long a = map->addr;
- if (vhost_overflow(node->userspace_addr, node->size))
+ if (vhost_overflow(map->addr, map->size))
return false;
- if (!access_ok((void __user *)a,
- node->size))
+ if (!access_ok((void __user *)a, map->size))
return false;
else if (log_all && !log_access_ok(log_base,
- node->start,
- node->size))
+ map->start,
+ map->size))
return false;
}
return true;
@@ -745,17 +725,17 @@ static inline void __user *vhost_vq_meta_fetch(struct vhost_virtqueue *vq,
u64 addr, unsigned int size,
int type)
{
- const struct vhost_umem_node *node = vq->meta_iotlb[type];
+ const struct vhost_iotlb_map *map = vq->meta_iotlb[type];
- if (!node)
+ if (!map)
return NULL;
- return (void *)(uintptr_t)(node->userspace_addr + addr - node->start);
+ return (void *)(uintptr_t)(map->addr + addr - map->start);
}
/* Can we switch to this memory table? */
/* Caller should have device mutex but not vq mutex */
-static bool memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem,
+static bool memory_access_ok(struct vhost_dev *d, struct vhost_iotlb *umem,
int log_all)
{
int i;
@@ -1020,47 +1000,6 @@ static inline int vhost_get_desc(struct vhost_virtqueue *vq,
return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc));
}
-static int vhost_new_umem_range(struct vhost_umem *umem,
- u64 start, u64 size, u64 end,
- u64 userspace_addr, int perm)
-{
- struct vhost_umem_node *tmp, *node;
-
- if (!size)
- return -EFAULT;
-
- node = kmalloc(sizeof(*node), GFP_ATOMIC);
- if (!node)
- return -ENOMEM;
-
- if (umem->numem == max_iotlb_entries) {
- tmp = list_first_entry(&umem->umem_list, typeof(*tmp), link);
- vhost_umem_free(umem, tmp);
- }
-
- node->start = start;
- node->size = size;
- node->last = end;
- node->userspace_addr = userspace_addr;
- node->perm = perm;
- INIT_LIST_HEAD(&node->link);
- list_add_tail(&node->link, &umem->umem_list);
- vhost_umem_interval_tree_insert(node, &umem->umem_tree);
- umem->numem++;
-
- return 0;
-}
-
-static void vhost_del_umem_range(struct vhost_umem *umem,
- u64 start, u64 end)
-{
- struct vhost_umem_node *node;
-
- while ((node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
- start, end)))
- vhost_umem_free(umem, node);
-}
-
static void vhost_iotlb_notify_vq(struct vhost_dev *d,
struct vhost_iotlb_msg *msg)
{
@@ -1117,9 +1056,9 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
break;
}
vhost_vq_meta_reset(dev);
- if (vhost_new_umem_range(dev->iotlb, msg->iova, msg->size,
- msg->iova + msg->size - 1,
- msg->uaddr, msg->perm)) {
+ if (vhost_iotlb_add_range(dev->iotlb, msg->iova,
+ msg->iova + msg->size - 1,
+ msg->uaddr, msg->perm)) {
ret = -ENOMEM;
break;
}
@@ -1131,8 +1070,8 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
break;
}
vhost_vq_meta_reset(dev);
- vhost_del_umem_range(dev->iotlb, msg->iova,
- msg->iova + msg->size - 1);
+ vhost_iotlb_del_range(dev->iotlb, msg->iova,
+ msg->iova + msg->size - 1);
break;
default:
ret = -EINVAL;
@@ -1178,7 +1117,12 @@ ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
ret = -EINVAL;
goto done;
}
- if (vhost_process_iotlb_msg(dev, &msg)) {
+
+ if (dev->msg_handler)
+ ret = dev->msg_handler(dev, &msg);
+ else
+ ret = vhost_process_iotlb_msg(dev, &msg);
+ if (ret) {
ret = -EFAULT;
goto done;
}
@@ -1311,44 +1255,42 @@ static bool vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
}
static void vhost_vq_meta_update(struct vhost_virtqueue *vq,
- const struct vhost_umem_node *node,
+ const struct vhost_iotlb_map *map,
int type)
{
int access = (type == VHOST_ADDR_USED) ?
VHOST_ACCESS_WO : VHOST_ACCESS_RO;
- if (likely(node->perm & access))
- vq->meta_iotlb[type] = node;
+ if (likely(map->perm & access))
+ vq->meta_iotlb[type] = map;
}
static bool iotlb_access_ok(struct vhost_virtqueue *vq,
int access, u64 addr, u64 len, int type)
{
- const struct vhost_umem_node *node;
- struct vhost_umem *umem = vq->iotlb;
+ const struct vhost_iotlb_map *map;
+ struct vhost_iotlb *umem = vq->iotlb;
u64 s = 0, size, orig_addr = addr, last = addr + len - 1;
if (vhost_vq_meta_fetch(vq, addr, len, type))
return true;
while (len > s) {
- node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
- addr,
- last);
- if (node == NULL || node->start > addr) {
+ map = vhost_iotlb_itree_first(umem, addr, last);
+ if (map == NULL || map->start > addr) {
vhost_iotlb_miss(vq, addr, access);
return false;
- } else if (!(node->perm & access)) {
+ } else if (!(map->perm & access)) {
/* Report the possible access violation by
* request another translation from userspace.
*/
return false;
}
- size = node->size - addr + node->start;
+ size = map->size - addr + map->start;
if (orig_addr == addr && size >= len)
- vhost_vq_meta_update(vq, node, type);
+ vhost_vq_meta_update(vq, map, type);
s += size;
addr += size;
@@ -1364,12 +1306,12 @@ int vq_meta_prefetch(struct vhost_virtqueue *vq)
if (!vq->iotlb)
return 1;
- return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,
+ return iotlb_access_ok(vq, VHOST_MAP_RO, (u64)(uintptr_t)vq->desc,
vhost_get_desc_size(vq, num), VHOST_ADDR_DESC) &&
- iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->avail,
+ iotlb_access_ok(vq, VHOST_MAP_RO, (u64)(uintptr_t)vq->avail,
vhost_get_avail_size(vq, num),
VHOST_ADDR_AVAIL) &&
- iotlb_access_ok(vq, VHOST_ACCESS_WO, (u64)(uintptr_t)vq->used,
+ iotlb_access_ok(vq, VHOST_MAP_WO, (u64)(uintptr_t)vq->used,
vhost_get_used_size(vq, num), VHOST_ADDR_USED);
}
EXPORT_SYMBOL_GPL(vq_meta_prefetch);
@@ -1408,25 +1350,11 @@ bool vhost_vq_access_ok(struct vhost_virtqueue *vq)
}
EXPORT_SYMBOL_GPL(vhost_vq_access_ok);
-static struct vhost_umem *vhost_umem_alloc(void)
-{
- struct vhost_umem *umem = kvzalloc(sizeof(*umem), GFP_KERNEL);
-
- if (!umem)
- return NULL;
-
- umem->umem_tree = RB_ROOT_CACHED;
- umem->numem = 0;
- INIT_LIST_HEAD(&umem->umem_list);
-
- return umem;
-}
-
static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
{
struct vhost_memory mem, *newmem;
struct vhost_memory_region *region;
- struct vhost_umem *newumem, *oldumem;
+ struct vhost_iotlb *newumem, *oldumem;
unsigned long size = offsetof(struct vhost_memory, regions);
int i;
@@ -1448,7 +1376,7 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
return -EFAULT;
}
- newumem = vhost_umem_alloc();
+ newumem = iotlb_alloc();
if (!newumem) {
kvfree(newmem);
return -ENOMEM;
@@ -1457,13 +1385,12 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
for (region = newmem->regions;
region < newmem->regions + mem.nregions;
region++) {
- if (vhost_new_umem_range(newumem,
- region->guest_phys_addr,
- region->memory_size,
- region->guest_phys_addr +
- region->memory_size - 1,
- region->userspace_addr,
- VHOST_ACCESS_RW))
+ if (vhost_iotlb_add_range(newumem,
+ region->guest_phys_addr,
+ region->guest_phys_addr +
+ region->memory_size - 1,
+ region->userspace_addr,
+ VHOST_MAP_RW))
goto err;
}
@@ -1481,11 +1408,11 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
}
kvfree(newmem);
- vhost_umem_clean(oldumem);
+ vhost_iotlb_free(oldumem);
return 0;
err:
- vhost_umem_clean(newumem);
+ vhost_iotlb_free(newumem);
kvfree(newmem);
return -EFAULT;
}
@@ -1726,10 +1653,10 @@ EXPORT_SYMBOL_GPL(vhost_vring_ioctl);
int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled)
{
- struct vhost_umem *niotlb, *oiotlb;
+ struct vhost_iotlb *niotlb, *oiotlb;
int i;
- niotlb = vhost_umem_alloc();
+ niotlb = iotlb_alloc();
if (!niotlb)
return -ENOMEM;
@@ -1745,7 +1672,7 @@ int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled)
mutex_unlock(&vq->mutex);
}
- vhost_umem_clean(oiotlb);
+ vhost_iotlb_free(oiotlb);
return 0;
}
@@ -1875,8 +1802,8 @@ static int log_write(void __user *log_base,
static int log_write_hva(struct vhost_virtqueue *vq, u64 hva, u64 len)
{
- struct vhost_umem *umem = vq->umem;
- struct vhost_umem_node *u;
+ struct vhost_iotlb *umem = vq->umem;
+ struct vhost_iotlb_map *u;
u64 start, end, l, min;
int r;
bool hit = false;
@@ -1886,16 +1813,15 @@ static int log_write_hva(struct vhost_virtqueue *vq, u64 hva, u64 len)
/* More than one GPAs can be mapped into a single HVA. So
* iterate all possible umems here to be safe.
*/
- list_for_each_entry(u, &umem->umem_list, link) {
- if (u->userspace_addr > hva - 1 + len ||
- u->userspace_addr - 1 + u->size < hva)
+ list_for_each_entry(u, &umem->list, link) {
+ if (u->addr > hva - 1 + len ||
+ u->addr - 1 + u->size < hva)
continue;
- start = max(u->userspace_addr, hva);
- end = min(u->userspace_addr - 1 + u->size,
- hva - 1 + len);
+ start = max(u->addr, hva);
+ end = min(u->addr - 1 + u->size, hva - 1 + len);
l = end - start + 1;
r = log_write(vq->log_base,
- u->start + start - u->userspace_addr,
+ u->start + start - u->addr,
l);
if (r < 0)
return r;
@@ -2046,9 +1972,9 @@ EXPORT_SYMBOL_GPL(vhost_vq_init_access);
static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
struct iovec iov[], int iov_size, int access)
{
- const struct vhost_umem_node *node;
+ const struct vhost_iotlb_map *map;
struct vhost_dev *dev = vq->dev;
- struct vhost_umem *umem = dev->iotlb ? dev->iotlb : dev->umem;
+ struct vhost_iotlb *umem = dev->iotlb ? dev->iotlb : dev->umem;
struct iovec *_iov;
u64 s = 0;
int ret = 0;
@@ -2060,25 +1986,24 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
break;
}
- node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
- addr, addr + len - 1);
- if (node == NULL || node->start > addr) {
+ map = vhost_iotlb_itree_first(umem, addr, addr + len - 1);
+ if (map == NULL || map->start > addr) {
if (umem != dev->iotlb) {
ret = -EFAULT;
break;
}
ret = -EAGAIN;
break;
- } else if (!(node->perm & access)) {
+ } else if (!(map->perm & access)) {
ret = -EPERM;
break;
}
_iov = iov + ret;
- size = node->size - addr + node->start;
+ size = map->size - addr + map->start;
_iov->iov_len = min((u64)len - s, size);
_iov->iov_base = (void __user *)(unsigned long)
- (node->userspace_addr + addr - node->start);
+ (map->addr + addr - map->start);
s += size;
addr += size;
++ret;
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index a123fd70847e..181382185bbc 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -12,6 +12,7 @@
#include <linux/virtio_config.h>
#include <linux/virtio_ring.h>
#include <linux/atomic.h>
+#include <linux/vhost_iotlb.h>
struct vhost_work;
typedef void (*vhost_work_fn_t)(struct vhost_work *work);
@@ -52,27 +53,6 @@ struct vhost_log {
u64 len;
};
-#define START(node) ((node)->start)
-#define LAST(node) ((node)->last)
-
-struct vhost_umem_node {
- struct rb_node rb;
- struct list_head link;
- __u64 start;
- __u64 last;
- __u64 size;
- __u64 userspace_addr;
- __u32 perm;
- __u32 flags_padding;
- __u64 __subtree_last;
-};
-
-struct vhost_umem {
- struct rb_root_cached umem_tree;
- struct list_head umem_list;
- int numem;
-};
-
enum vhost_uaddr_type {
VHOST_ADDR_DESC = 0,
VHOST_ADDR_AVAIL = 1,
@@ -90,7 +70,7 @@ struct vhost_virtqueue {
struct vring_desc __user *desc;
struct vring_avail __user *avail;
struct vring_used __user *used;
- const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
+ const struct vhost_iotlb_map *meta_iotlb[VHOST_NUM_ADDRS];
struct file *kick;
struct eventfd_ctx *call_ctx;
struct eventfd_ctx *error_ctx;
@@ -128,8 +108,8 @@ struct vhost_virtqueue {
struct iovec *indirect;
struct vring_used_elem *heads;
/* Protected by virtqueue mutex. */
- struct vhost_umem *umem;
- struct vhost_umem *iotlb;
+ struct vhost_iotlb *umem;
+ struct vhost_iotlb *iotlb;
void *private_data;
u64 acked_features;
u64 acked_backend_features;
@@ -164,8 +144,8 @@ struct vhost_dev {
struct eventfd_ctx *log_ctx;
struct llist_head work_list;
struct task_struct *worker;
- struct vhost_umem *umem;
- struct vhost_umem *iotlb;
+ struct vhost_iotlb *umem;
+ struct vhost_iotlb *iotlb;
spinlock_t iotlb_lock;
struct list_head read_list;
struct list_head pending_list;
@@ -174,16 +154,20 @@ struct vhost_dev {
int weight;
int byte_weight;
u64 kcov_handle;
+ int (*msg_handler)(struct vhost_dev *dev,
+ struct vhost_iotlb_msg *msg);
};
bool vhost_exceeds_weight(struct vhost_virtqueue *vq, int pkts, int total_len);
void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs,
- int nvqs, int iov_limit, int weight, int byte_weight);
+ int nvqs, int iov_limit, int weight, int byte_weight,
+ int (*msg_handler)(struct vhost_dev *dev,
+ struct vhost_iotlb_msg *msg));
long vhost_dev_set_owner(struct vhost_dev *dev);
bool vhost_dev_has_owner(struct vhost_dev *dev);
long vhost_dev_check_owner(struct vhost_dev *);
-struct vhost_umem *vhost_dev_reset_owner_prepare(void);
-void vhost_dev_reset_owner(struct vhost_dev *, struct vhost_umem *);
+struct vhost_iotlb *vhost_dev_reset_owner_prepare(void);
+void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_iotlb *iotlb);
void vhost_dev_cleanup(struct vhost_dev *);
void vhost_dev_stop(struct vhost_dev *);
long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, void __user *argp);
@@ -229,6 +213,9 @@ ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
struct iov_iter *from);
int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled);
+void vhost_iotlb_map_free(struct vhost_iotlb *iotlb,
+ struct vhost_iotlb_map *map);
+
#define vq_err(vq, fmt, ...) do { \
pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \
if ((vq)->error_ctx) \
diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c
index a0a2d74967ef..ee0491f579ac 100644
--- a/drivers/vhost/vringh.c
+++ b/drivers/vhost/vringh.c
@@ -13,6 +13,9 @@
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/export.h>
+#include <linux/bvec.h>
+#include <linux/highmem.h>
+#include <linux/vhost_iotlb.h>
#include <uapi/linux/virtio_config.h>
static __printf(1,2) __cold void vringh_bad(const char *fmt, ...)
@@ -71,9 +74,11 @@ static inline int __vringh_get_head(const struct vringh *vrh,
}
/* Copy some bytes to/from the iovec. Returns num copied. */
-static inline ssize_t vringh_iov_xfer(struct vringh_kiov *iov,
+static inline ssize_t vringh_iov_xfer(struct vringh *vrh,
+ struct vringh_kiov *iov,
void *ptr, size_t len,
- int (*xfer)(void *addr, void *ptr,
+ int (*xfer)(const struct vringh *vrh,
+ void *addr, void *ptr,
size_t len))
{
int err, done = 0;
@@ -82,7 +87,7 @@ static inline ssize_t vringh_iov_xfer(struct vringh_kiov *iov,
size_t partlen;
partlen = min(iov->iov[iov->i].iov_len, len);
- err = xfer(iov->iov[iov->i].iov_base, ptr, partlen);
+ err = xfer(vrh, iov->iov[iov->i].iov_base, ptr, partlen);
if (err)
return err;
done += partlen;
@@ -96,6 +101,7 @@ static inline ssize_t vringh_iov_xfer(struct vringh_kiov *iov,
/* Fix up old iov element then increment. */
iov->iov[iov->i].iov_len = iov->consumed;
iov->iov[iov->i].iov_base -= iov->consumed;
+
iov->consumed = 0;
iov->i++;
@@ -227,7 +233,8 @@ static int slow_copy(struct vringh *vrh, void *dst, const void *src,
u64 addr,
struct vringh_range *r),
struct vringh_range *range,
- int (*copy)(void *dst, const void *src, size_t len))
+ int (*copy)(const struct vringh *vrh,
+ void *dst, const void *src, size_t len))
{
size_t part, len = sizeof(struct vring_desc);
@@ -241,7 +248,7 @@ static int slow_copy(struct vringh *vrh, void *dst, const void *src,
if (!rcheck(vrh, addr, &part, range, getrange))
return -EINVAL;
- err = copy(dst, src, part);
+ err = copy(vrh, dst, src, part);
if (err)
return err;
@@ -262,7 +269,8 @@ __vringh_iov(struct vringh *vrh, u16 i,
struct vringh_range *)),
bool (*getrange)(struct vringh *, u64, struct vringh_range *),
gfp_t gfp,
- int (*copy)(void *dst, const void *src, size_t len))
+ int (*copy)(const struct vringh *vrh,
+ void *dst, const void *src, size_t len))
{
int err, count = 0, up_next, desc_max;
struct vring_desc desc, *descs;
@@ -291,7 +299,7 @@ __vringh_iov(struct vringh *vrh, u16 i,
err = slow_copy(vrh, &desc, &descs[i], rcheck, getrange,
&slowrange, copy);
else
- err = copy(&desc, &descs[i], sizeof(desc));
+ err = copy(vrh, &desc, &descs[i], sizeof(desc));
if (unlikely(err))
goto fail;
@@ -404,7 +412,8 @@ static inline int __vringh_complete(struct vringh *vrh,
unsigned int num_used,
int (*putu16)(const struct vringh *vrh,
__virtio16 *p, u16 val),
- int (*putused)(struct vring_used_elem *dst,
+ int (*putused)(const struct vringh *vrh,
+ struct vring_used_elem *dst,
const struct vring_used_elem
*src, unsigned num))
{
@@ -420,12 +429,12 @@ static inline int __vringh_complete(struct vringh *vrh,
/* Compiler knows num_used == 1 sometimes, hence extra check */
if (num_used > 1 && unlikely(off + num_used >= vrh->vring.num)) {
u16 part = vrh->vring.num - off;
- err = putused(&used_ring->ring[off], used, part);
+ err = putused(vrh, &used_ring->ring[off], used, part);
if (!err)
- err = putused(&used_ring->ring[0], used + part,
+ err = putused(vrh, &used_ring->ring[0], used + part,
num_used - part);
} else
- err = putused(&used_ring->ring[off], used, num_used);
+ err = putused(vrh, &used_ring->ring[off], used, num_used);
if (err) {
vringh_bad("Failed to write %u used entries %u at %p",
@@ -564,13 +573,15 @@ static inline int putu16_user(const struct vringh *vrh, __virtio16 *p, u16 val)
return put_user(v, (__force __virtio16 __user *)p);
}
-static inline int copydesc_user(void *dst, const void *src, size_t len)
+static inline int copydesc_user(const struct vringh *vrh,
+ void *dst, const void *src, size_t len)
{
return copy_from_user(dst, (__force void __user *)src, len) ?
-EFAULT : 0;
}
-static inline int putused_user(struct vring_used_elem *dst,
+static inline int putused_user(const struct vringh *vrh,
+ struct vring_used_elem *dst,
const struct vring_used_elem *src,
unsigned int num)
{
@@ -578,13 +589,15 @@ static inline int putused_user(struct vring_used_elem *dst,
sizeof(*dst) * num) ? -EFAULT : 0;
}
-static inline int xfer_from_user(void *src, void *dst, size_t len)
+static inline int xfer_from_user(const struct vringh *vrh, void *src,
+ void *dst, size_t len)
{
return copy_from_user(dst, (__force void __user *)src, len) ?
-EFAULT : 0;
}
-static inline int xfer_to_user(void *dst, void *src, size_t len)
+static inline int xfer_to_user(const struct vringh *vrh,
+ void *dst, void *src, size_t len)
{
return copy_to_user((__force void __user *)dst, src, len) ?
-EFAULT : 0;
@@ -706,7 +719,7 @@ EXPORT_SYMBOL(vringh_getdesc_user);
*/
ssize_t vringh_iov_pull_user(struct vringh_iov *riov, void *dst, size_t len)
{
- return vringh_iov_xfer((struct vringh_kiov *)riov,
+ return vringh_iov_xfer(NULL, (struct vringh_kiov *)riov,
dst, len, xfer_from_user);
}
EXPORT_SYMBOL(vringh_iov_pull_user);
@@ -722,7 +735,7 @@ EXPORT_SYMBOL(vringh_iov_pull_user);
ssize_t vringh_iov_push_user(struct vringh_iov *wiov,
const void *src, size_t len)
{
- return vringh_iov_xfer((struct vringh_kiov *)wiov,
+ return vringh_iov_xfer(NULL, (struct vringh_kiov *)wiov,
(void *)src, len, xfer_to_user);
}
EXPORT_SYMBOL(vringh_iov_push_user);
@@ -832,13 +845,15 @@ static inline int putu16_kern(const struct vringh *vrh, __virtio16 *p, u16 val)
return 0;
}
-static inline int copydesc_kern(void *dst, const void *src, size_t len)
+static inline int copydesc_kern(const struct vringh *vrh,
+ void *dst, const void *src, size_t len)
{
memcpy(dst, src, len);
return 0;
}
-static inline int putused_kern(struct vring_used_elem *dst,
+static inline int putused_kern(const struct vringh *vrh,
+ struct vring_used_elem *dst,
const struct vring_used_elem *src,
unsigned int num)
{
@@ -846,13 +861,15 @@ static inline int putused_kern(struct vring_used_elem *dst,
return 0;
}
-static inline int xfer_kern(void *src, void *dst, size_t len)
+static inline int xfer_kern(const struct vringh *vrh, void *src,
+ void *dst, size_t len)
{
memcpy(dst, src, len);
return 0;
}
-static inline int kern_xfer(void *dst, void *src, size_t len)
+static inline int kern_xfer(const struct vringh *vrh, void *dst,
+ void *src, size_t len)
{
memcpy(dst, src, len);
return 0;
@@ -949,7 +966,7 @@ EXPORT_SYMBOL(vringh_getdesc_kern);
*/
ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len)
{
- return vringh_iov_xfer(riov, dst, len, xfer_kern);
+ return vringh_iov_xfer(NULL, riov, dst, len, xfer_kern);
}
EXPORT_SYMBOL(vringh_iov_pull_kern);
@@ -964,7 +981,7 @@ EXPORT_SYMBOL(vringh_iov_pull_kern);
ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov,
const void *src, size_t len)
{
- return vringh_iov_xfer(wiov, (void *)src, len, kern_xfer);
+ return vringh_iov_xfer(NULL, wiov, (void *)src, len, kern_xfer);
}
EXPORT_SYMBOL(vringh_iov_push_kern);
@@ -1042,4 +1059,362 @@ int vringh_need_notify_kern(struct vringh *vrh)
}
EXPORT_SYMBOL(vringh_need_notify_kern);
+static int iotlb_translate(const struct vringh *vrh,
+ u64 addr, u64 len, struct bio_vec iov[],
+ int iov_size, u32 perm)
+{
+ struct vhost_iotlb_map *map;
+ struct vhost_iotlb *iotlb = vrh->iotlb;
+ int ret = 0;
+ u64 s = 0;
+
+ while (len > s) {
+ u64 size, pa, pfn;
+
+ if (unlikely(ret >= iov_size)) {
+ ret = -ENOBUFS;
+ break;
+ }
+
+ map = vhost_iotlb_itree_first(iotlb, addr,
+ addr + len - 1);
+ if (!map || map->start > addr) {
+ ret = -EINVAL;
+ break;
+ } else if (!(map->perm & perm)) {
+ ret = -EPERM;
+ break;
+ }
+
+ size = map->size - addr + map->start;
+ pa = map->addr + addr - map->start;
+ pfn = pa >> PAGE_SHIFT;
+ iov[ret].bv_page = pfn_to_page(pfn);
+ iov[ret].bv_len = min(len - s, size);
+ iov[ret].bv_offset = pa & (PAGE_SIZE - 1);
+ s += size;
+ addr += size;
+ ++ret;
+ }
+
+ return ret;
+}
+
+static inline int copy_from_iotlb(const struct vringh *vrh, void *dst,
+ void *src, size_t len)
+{
+ struct iov_iter iter;
+ struct bio_vec iov[16];
+ int ret;
+
+ ret = iotlb_translate(vrh, (u64)(uintptr_t)src,
+ len, iov, 16, VHOST_MAP_RO);
+ if (ret < 0)
+ return ret;
+
+ iov_iter_bvec(&iter, READ, iov, ret, len);
+
+ ret = copy_from_iter(dst, len, &iter);
+
+ return ret;
+}
+
+static inline int copy_to_iotlb(const struct vringh *vrh, void *dst,
+ void *src, size_t len)
+{
+ struct iov_iter iter;
+ struct bio_vec iov[16];
+ int ret;
+
+ ret = iotlb_translate(vrh, (u64)(uintptr_t)dst,
+ len, iov, 16, VHOST_MAP_WO);
+ if (ret < 0)
+ return ret;
+
+ iov_iter_bvec(&iter, WRITE, iov, ret, len);
+
+ return copy_to_iter(src, len, &iter);
+}
+
+static inline int getu16_iotlb(const struct vringh *vrh,
+ u16 *val, const __virtio16 *p)
+{
+ struct bio_vec iov;
+ void *kaddr, *from;
+ int ret;
+
+ /* Atomic read is needed for getu16 */
+ ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p),
+ &iov, 1, VHOST_MAP_RO);
+ if (ret < 0)
+ return ret;
+
+ kaddr = kmap_atomic(iov.bv_page);
+ from = kaddr + iov.bv_offset;
+ *val = vringh16_to_cpu(vrh, READ_ONCE(*(__virtio16 *)from));
+ kunmap_atomic(kaddr);
+
+ return 0;
+}
+
+static inline int putu16_iotlb(const struct vringh *vrh,
+ __virtio16 *p, u16 val)
+{
+ struct bio_vec iov;
+ void *kaddr, *to;
+ int ret;
+
+ /* Atomic write is needed for putu16 */
+ ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p),
+ &iov, 1, VHOST_MAP_WO);
+ if (ret < 0)
+ return ret;
+
+ kaddr = kmap_atomic(iov.bv_page);
+ to = kaddr + iov.bv_offset;
+ WRITE_ONCE(*(__virtio16 *)to, cpu_to_vringh16(vrh, val));
+ kunmap_atomic(kaddr);
+
+ return 0;
+}
+
+static inline int copydesc_iotlb(const struct vringh *vrh,
+ void *dst, const void *src, size_t len)
+{
+ int ret;
+
+ ret = copy_from_iotlb(vrh, dst, (void *)src, len);
+ if (ret != len)
+ return -EFAULT;
+
+ return 0;
+}
+
+static inline int xfer_from_iotlb(const struct vringh *vrh, void *src,
+ void *dst, size_t len)
+{
+ int ret;
+
+ ret = copy_from_iotlb(vrh, dst, src, len);
+ if (ret != len)
+ return -EFAULT;
+
+ return 0;
+}
+
+static inline int xfer_to_iotlb(const struct vringh *vrh,
+ void *dst, void *src, size_t len)
+{
+ int ret;
+
+ ret = copy_to_iotlb(vrh, dst, src, len);
+ if (ret != len)
+ return -EFAULT;
+
+ return 0;
+}
+
+static inline int putused_iotlb(const struct vringh *vrh,
+ struct vring_used_elem *dst,
+ const struct vring_used_elem *src,
+ unsigned int num)
+{
+ int size = num * sizeof(*dst);
+ int ret;
+
+ ret = copy_to_iotlb(vrh, dst, (void *)src, num * sizeof(*dst));
+ if (ret != size)
+ return -EFAULT;
+
+ return 0;
+}
+
+/**
+ * vringh_init_iotlb - initialize a vringh for a ring with IOTLB.
+ * @vrh: the vringh to initialize.
+ * @features: the feature bits for this ring.
+ * @num: the number of elements.
+ * @weak_barriers: true if we only need memory barriers, not I/O.
+ * @desc: the userpace descriptor pointer.
+ * @avail: the userpace avail pointer.
+ * @used: the userpace used pointer.
+ *
+ * Returns an error if num is invalid.
+ */
+int vringh_init_iotlb(struct vringh *vrh, u64 features,
+ unsigned int num, bool weak_barriers,
+ struct vring_desc *desc,
+ struct vring_avail *avail,
+ struct vring_used *used)
+{
+ return vringh_init_kern(vrh, features, num, weak_barriers,
+ desc, avail, used);
+}
+EXPORT_SYMBOL(vringh_init_iotlb);
+
+/**
+ * vringh_set_iotlb - initialize a vringh for a ring with IOTLB.
+ * @vrh: the vring
+ * @iotlb: iotlb associated with this vring
+ */
+void vringh_set_iotlb(struct vringh *vrh, struct vhost_iotlb *iotlb)
+{
+ vrh->iotlb = iotlb;
+}
+EXPORT_SYMBOL(vringh_set_iotlb);
+
+/**
+ * vringh_getdesc_iotlb - get next available descriptor from ring with
+ * IOTLB.
+ * @vrh: the kernelspace vring.
+ * @riov: where to put the readable descriptors (or NULL)
+ * @wiov: where to put the writable descriptors (or NULL)
+ * @head: head index we received, for passing to vringh_complete_iotlb().
+ * @gfp: flags for allocating larger riov/wiov.
+ *
+ * Returns 0 if there was no descriptor, 1 if there was, or -errno.
+ *
+ * Note that on error return, you can tell the difference between an
+ * invalid ring and a single invalid descriptor: in the former case,
+ * *head will be vrh->vring.num. You may be able to ignore an invalid
+ * descriptor, but there's not much you can do with an invalid ring.
+ *
+ * Note that you may need to clean up riov and wiov, even on error!
+ */
+int vringh_getdesc_iotlb(struct vringh *vrh,
+ struct vringh_kiov *riov,
+ struct vringh_kiov *wiov,
+ u16 *head,
+ gfp_t gfp)
+{
+ int err;
+
+ err = __vringh_get_head(vrh, getu16_iotlb, &vrh->last_avail_idx);
+ if (err < 0)
+ return err;
+
+ /* Empty... */
+ if (err == vrh->vring.num)
+ return 0;
+
+ *head = err;
+ err = __vringh_iov(vrh, *head, riov, wiov, no_range_check, NULL,
+ gfp, copydesc_iotlb);
+ if (err)
+ return err;
+
+ return 1;
+}
+EXPORT_SYMBOL(vringh_getdesc_iotlb);
+
+/**
+ * vringh_iov_pull_iotlb - copy bytes from vring_iov.
+ * @vrh: the vring.
+ * @riov: the riov as passed to vringh_getdesc_iotlb() (updated as we consume)
+ * @dst: the place to copy.
+ * @len: the maximum length to copy.
+ *
+ * Returns the bytes copied <= len or a negative errno.
+ */
+ssize_t vringh_iov_pull_iotlb(struct vringh *vrh,
+ struct vringh_kiov *riov,
+ void *dst, size_t len)
+{
+ return vringh_iov_xfer(vrh, riov, dst, len, xfer_from_iotlb);
+}
+EXPORT_SYMBOL(vringh_iov_pull_iotlb);
+
+/**
+ * vringh_iov_push_iotlb - copy bytes into vring_iov.
+ * @vrh: the vring.
+ * @wiov: the wiov as passed to vringh_getdesc_iotlb() (updated as we consume)
+ * @dst: the place to copy.
+ * @len: the maximum length to copy.
+ *
+ * Returns the bytes copied <= len or a negative errno.
+ */
+ssize_t vringh_iov_push_iotlb(struct vringh *vrh,
+ struct vringh_kiov *wiov,
+ const void *src, size_t len)
+{
+ return vringh_iov_xfer(vrh, wiov, (void *)src, len, xfer_to_iotlb);
+}
+EXPORT_SYMBOL(vringh_iov_push_iotlb);
+
+/**
+ * vringh_abandon_iotlb - we've decided not to handle the descriptor(s).
+ * @vrh: the vring.
+ * @num: the number of descriptors to put back (ie. num
+ * vringh_get_iotlb() to undo).
+ *
+ * The next vringh_get_iotlb() will return the old descriptor(s) again.
+ */
+void vringh_abandon_iotlb(struct vringh *vrh, unsigned int num)
+{
+ /* We only update vring_avail_event(vr) when we want to be notified,
+ * so we haven't changed that yet.
+ */
+ vrh->last_avail_idx -= num;
+}
+EXPORT_SYMBOL(vringh_abandon_iotlb);
+
+/**
+ * vringh_complete_iotlb - we've finished with descriptor, publish it.
+ * @vrh: the vring.
+ * @head: the head as filled in by vringh_getdesc_iotlb.
+ * @len: the length of data we have written.
+ *
+ * You should check vringh_need_notify_iotlb() after one or more calls
+ * to this function.
+ */
+int vringh_complete_iotlb(struct vringh *vrh, u16 head, u32 len)
+{
+ struct vring_used_elem used;
+
+ used.id = cpu_to_vringh32(vrh, head);
+ used.len = cpu_to_vringh32(vrh, len);
+
+ return __vringh_complete(vrh, &used, 1, putu16_iotlb, putused_iotlb);
+}
+EXPORT_SYMBOL(vringh_complete_iotlb);
+
+/**
+ * vringh_notify_enable_iotlb - we want to know if something changes.
+ * @vrh: the vring.
+ *
+ * This always enables notifications, but returns false if there are
+ * now more buffers available in the vring.
+ */
+bool vringh_notify_enable_iotlb(struct vringh *vrh)
+{
+ return __vringh_notify_enable(vrh, getu16_iotlb, putu16_iotlb);
+}
+EXPORT_SYMBOL(vringh_notify_enable_iotlb);
+
+/**
+ * vringh_notify_disable_iotlb - don't tell us if something changes.
+ * @vrh: the vring.
+ *
+ * This is our normal running state: we disable and then only enable when
+ * we're going to sleep.
+ */
+void vringh_notify_disable_iotlb(struct vringh *vrh)
+{
+ __vringh_notify_disable(vrh, putu16_iotlb);
+}
+EXPORT_SYMBOL(vringh_notify_disable_iotlb);
+
+/**
+ * vringh_need_notify_iotlb - must we tell the other side about used buffers?
+ * @vrh: the vring we've called vringh_complete_iotlb() on.
+ *
+ * Returns -errno or 0 if we don't need to tell the other side, 1 if we do.
+ */
+int vringh_need_notify_iotlb(struct vringh *vrh)
+{
+ return __vringh_need_notify(vrh, getu16_iotlb);
+}
+EXPORT_SYMBOL(vringh_need_notify_iotlb);
+
+
MODULE_LICENSE("GPL");
diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index c2d7d57e98cf..97669484a3f6 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -621,7 +621,7 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file)
vhost_dev_init(&vsock->dev, vqs, ARRAY_SIZE(vsock->vqs),
UIO_MAXIOV, VHOST_VSOCK_PKT_WEIGHT,
- VHOST_VSOCK_WEIGHT);
+ VHOST_VSOCK_WEIGHT, NULL);
file->private_data = vsock;
spin_lock_init(&vsock->send_pkt_list_lock);
diff --git a/drivers/video/backlight/corgi_lcd.c b/drivers/video/backlight/corgi_lcd.c
index 68f7592c5060..25ef0cbd7583 100644
--- a/drivers/video/backlight/corgi_lcd.c
+++ b/drivers/video/backlight/corgi_lcd.c
@@ -15,7 +15,7 @@
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/delay.h>
-#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
#include <linux/fb.h>
#include <linux/lcd.h>
#include <linux/spi/spi.h>
@@ -90,9 +90,8 @@ struct corgi_lcd {
int mode;
char buf[2];
- int gpio_backlight_on;
- int gpio_backlight_cont;
- int gpio_backlight_cont_inverted;
+ struct gpio_desc *backlight_on;
+ struct gpio_desc *backlight_cont;
void (*kick_battery)(void);
};
@@ -403,13 +402,13 @@ static int corgi_bl_set_intensity(struct corgi_lcd *lcd, int intensity)
corgi_ssp_lcdtg_send(lcd, DUTYCTRL_ADRS, intensity);
/* Bit 5 via GPIO_BACKLIGHT_CONT */
- cont = !!(intensity & 0x20) ^ lcd->gpio_backlight_cont_inverted;
+ cont = !!(intensity & 0x20);
- if (gpio_is_valid(lcd->gpio_backlight_cont))
- gpio_set_value_cansleep(lcd->gpio_backlight_cont, cont);
+ if (lcd->backlight_cont)
+ gpiod_set_value_cansleep(lcd->backlight_cont, cont);
- if (gpio_is_valid(lcd->gpio_backlight_on))
- gpio_set_value_cansleep(lcd->gpio_backlight_on, intensity);
+ if (lcd->backlight_on)
+ gpiod_set_value_cansleep(lcd->backlight_on, intensity);
if (lcd->kick_battery)
lcd->kick_battery();
@@ -482,48 +481,17 @@ static int setup_gpio_backlight(struct corgi_lcd *lcd,
struct corgi_lcd_platform_data *pdata)
{
struct spi_device *spi = lcd->spi_dev;
- int err;
-
- lcd->gpio_backlight_on = -1;
- lcd->gpio_backlight_cont = -1;
-
- if (gpio_is_valid(pdata->gpio_backlight_on)) {
- err = devm_gpio_request(&spi->dev, pdata->gpio_backlight_on,
- "BL_ON");
- if (err) {
- dev_err(&spi->dev,
- "failed to request GPIO%d for backlight_on\n",
- pdata->gpio_backlight_on);
- return err;
- }
-
- lcd->gpio_backlight_on = pdata->gpio_backlight_on;
- gpio_direction_output(lcd->gpio_backlight_on, 0);
- }
- if (gpio_is_valid(pdata->gpio_backlight_cont)) {
- err = devm_gpio_request(&spi->dev, pdata->gpio_backlight_cont,
- "BL_CONT");
- if (err) {
- dev_err(&spi->dev,
- "failed to request GPIO%d for backlight_cont\n",
- pdata->gpio_backlight_cont);
- return err;
- }
-
- lcd->gpio_backlight_cont = pdata->gpio_backlight_cont;
-
- /* spitz and akita use both GPIOs for backlight, and
- * have inverted polarity of GPIO_BACKLIGHT_CONT
- */
- if (gpio_is_valid(lcd->gpio_backlight_on)) {
- lcd->gpio_backlight_cont_inverted = 1;
- gpio_direction_output(lcd->gpio_backlight_cont, 1);
- } else {
- lcd->gpio_backlight_cont_inverted = 0;
- gpio_direction_output(lcd->gpio_backlight_cont, 0);
- }
- }
+ lcd->backlight_on = devm_gpiod_get_optional(&spi->dev,
+ "BL_ON", GPIOD_OUT_LOW);
+ if (IS_ERR(lcd->backlight_on))
+ return PTR_ERR(lcd->backlight_on);
+
+ lcd->backlight_cont = devm_gpiod_get_optional(&spi->dev, "BL_CONT",
+ GPIOD_OUT_LOW);
+ if (IS_ERR(lcd->backlight_cont))
+ return PTR_ERR(lcd->backlight_cont);
+
return 0;
}
diff --git a/drivers/video/backlight/pwm_bl.c b/drivers/video/backlight/pwm_bl.c
index efb4efc2a13d..82b8d7594701 100644
--- a/drivers/video/backlight/pwm_bl.c
+++ b/drivers/video/backlight/pwm_bl.c
@@ -7,7 +7,6 @@
#include <linux/delay.h>
#include <linux/gpio/consumer.h>
-#include <linux/gpio.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/init.h>
@@ -258,8 +257,6 @@ static int pwm_backlight_parse_dt(struct device *dev,
&data->post_pwm_on_delay);
of_property_read_u32(node, "pwm-off-delay-ms", &data->pwm_off_delay);
- data->enable_gpio = -EINVAL;
-
/*
* Determine the number of brightness levels, if this property is not
* set a default table of brightness levels will be used.
@@ -503,22 +500,6 @@ static int pwm_backlight_probe(struct platform_device *pdev)
}
/*
- * Compatibility fallback for drivers still using the integer GPIO
- * platform data. Must go away soon.
- */
- if (!pb->enable_gpio && gpio_is_valid(data->enable_gpio)) {
- ret = devm_gpio_request_one(&pdev->dev, data->enable_gpio,
- GPIOF_OUT_INIT_HIGH, "enable");
- if (ret < 0) {
- dev_err(&pdev->dev, "failed to request GPIO#%d: %d\n",
- data->enable_gpio, ret);
- goto err_alloc;
- }
-
- pb->enable_gpio = gpio_to_desc(data->enable_gpio);
- }
-
- /*
* If the GPIO is not known to be already configured as output, that
* is, if gpiod_get_direction returns either 1 or -EINVAL, change the
* direction to output and set the GPIO as active.
diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c
index 28335788e76e..9d28a8e3328f 100644
--- a/drivers/video/fbdev/core/fbcon.c
+++ b/drivers/video/fbdev/core/fbcon.c
@@ -1282,6 +1282,9 @@ finished:
if (!con_is_bound(&fb_con))
fbcon_exit();
+ if (vc->vc_num == logo_shown)
+ logo_shown = FBCON_LOGO_CANSHOW;
+
return;
}
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index 078615cf2afc..2bbf94b15bba 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -43,6 +43,19 @@ config VIRTIO_PCI_LEGACY
If unsure, say Y.
+config VIRTIO_VDPA
+ tristate "vDPA driver for virtio devices"
+ select VDPA
+ select VIRTIO
+ help
+ This driver provides support for virtio based paravirtual
+ device driver over vDPA bus. For this to be useful, you need
+ an appropriate vDPA device implementation that operates on a
+ physical device to allow the datapath of virtio to be
+ offloaded to hardware.
+
+ If unsure, say M.
+
config VIRTIO_PMEM
tristate "Support for virtio pmem driver"
depends on VIRTIO
@@ -58,6 +71,7 @@ config VIRTIO_BALLOON
tristate "Virtio balloon driver"
depends on VIRTIO
select MEMORY_BALLOON
+ select PAGE_REPORTING
---help---
This driver supports increasing and decreasing the amount
of memory within a KVM guest.
diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
index 3a2b5c5dcf46..29a1386ecc03 100644
--- a/drivers/virtio/Makefile
+++ b/drivers/virtio/Makefile
@@ -6,3 +6,4 @@ virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o
virtio_pci-$(CONFIG_VIRTIO_PCI_LEGACY) += virtio_pci_legacy.o
obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o
obj-$(CONFIG_VIRTIO_INPUT) += virtio_input.o
+obj-$(CONFIG_VIRTIO_VDPA) += virtio_vdpa.o
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 341458fd95ca..0ef16566c3f3 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -14,11 +14,13 @@
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/balloon_compaction.h>
+#include <linux/oom.h>
#include <linux/wait.h>
#include <linux/mm.h>
#include <linux/mount.h>
#include <linux/magic.h>
#include <linux/pseudo_fs.h>
+#include <linux/page_reporting.h>
/*
* Balloon device works in 4K page units. So each page is pointed to by
@@ -27,7 +29,9 @@
*/
#define VIRTIO_BALLOON_PAGES_PER_PAGE (unsigned)(PAGE_SIZE >> VIRTIO_BALLOON_PFN_SHIFT)
#define VIRTIO_BALLOON_ARRAY_PFNS_MAX 256
-#define VIRTBALLOON_OOM_NOTIFY_PRIORITY 80
+/* Maximum number of (4k) pages to deflate on OOM notifications. */
+#define VIRTIO_BALLOON_OOM_NR_PAGES 256
+#define VIRTIO_BALLOON_OOM_NOTIFY_PRIORITY 80
#define VIRTIO_BALLOON_FREE_PAGE_ALLOC_FLAG (__GFP_NORETRY | __GFP_NOWARN | \
__GFP_NOMEMALLOC)
@@ -47,6 +51,7 @@ enum virtio_balloon_vq {
VIRTIO_BALLOON_VQ_DEFLATE,
VIRTIO_BALLOON_VQ_STATS,
VIRTIO_BALLOON_VQ_FREE_PAGE,
+ VIRTIO_BALLOON_VQ_REPORTING,
VIRTIO_BALLOON_VQ_MAX
};
@@ -112,8 +117,15 @@ struct virtio_balloon {
/* Memory statistics */
struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR];
- /* To register a shrinker to shrink memory upon memory pressure */
+ /* Shrinker to return free pages - VIRTIO_BALLOON_F_FREE_PAGE_HINT */
struct shrinker shrinker;
+
+ /* OOM notifier to deflate on OOM - VIRTIO_BALLOON_F_DEFLATE_ON_OOM */
+ struct notifier_block oom_nb;
+
+ /* Free page reporting device */
+ struct virtqueue *reporting_vq;
+ struct page_reporting_dev_info pr_dev_info;
};
static struct virtio_device_id id_table[] = {
@@ -153,6 +165,33 @@ static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
}
+int virtballoon_free_page_report(struct page_reporting_dev_info *pr_dev_info,
+ struct scatterlist *sg, unsigned int nents)
+{
+ struct virtio_balloon *vb =
+ container_of(pr_dev_info, struct virtio_balloon, pr_dev_info);
+ struct virtqueue *vq = vb->reporting_vq;
+ unsigned int unused, err;
+
+ /* We should always be able to add these buffers to an empty queue. */
+ err = virtqueue_add_inbuf(vq, sg, nents, vb, GFP_NOWAIT | __GFP_NOWARN);
+
+ /*
+ * In the extremely unlikely case that something has occurred and we
+ * are able to trigger an error we will simply display a warning
+ * and exit without actually processing the pages.
+ */
+ if (WARN_ON_ONCE(err))
+ return err;
+
+ virtqueue_kick(vq);
+
+ /* When host has read buffer, this completes via balloon_ack */
+ wait_event(vb->acked, virtqueue_get_buf(vq, &unused));
+
+ return 0;
+}
+
static void set_page_pfns(struct virtio_balloon *vb,
__virtio32 pfns[], struct page *page)
{
@@ -481,6 +520,7 @@ static int init_vqs(struct virtio_balloon *vb)
names[VIRTIO_BALLOON_VQ_STATS] = NULL;
callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
+ names[VIRTIO_BALLOON_VQ_REPORTING] = NULL;
if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
names[VIRTIO_BALLOON_VQ_STATS] = "stats";
@@ -492,6 +532,11 @@ static int init_vqs(struct virtio_balloon *vb)
callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
}
+ if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) {
+ names[VIRTIO_BALLOON_VQ_REPORTING] = "reporting_vq";
+ callbacks[VIRTIO_BALLOON_VQ_REPORTING] = balloon_ack;
+ }
+
err = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX,
vqs, callbacks, names, NULL, NULL);
if (err)
@@ -524,6 +569,9 @@ static int init_vqs(struct virtio_balloon *vb)
if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
vb->free_page_vq = vqs[VIRTIO_BALLOON_VQ_FREE_PAGE];
+ if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING))
+ vb->reporting_vq = vqs[VIRTIO_BALLOON_VQ_REPORTING];
+
return 0;
}
@@ -788,50 +836,13 @@ static unsigned long shrink_free_pages(struct virtio_balloon *vb,
return blocks_freed * VIRTIO_BALLOON_HINT_BLOCK_PAGES;
}
-static unsigned long leak_balloon_pages(struct virtio_balloon *vb,
- unsigned long pages_to_free)
-{
- return leak_balloon(vb, pages_to_free * VIRTIO_BALLOON_PAGES_PER_PAGE) /
- VIRTIO_BALLOON_PAGES_PER_PAGE;
-}
-
-static unsigned long shrink_balloon_pages(struct virtio_balloon *vb,
- unsigned long pages_to_free)
-{
- unsigned long pages_freed = 0;
-
- /*
- * One invocation of leak_balloon can deflate at most
- * VIRTIO_BALLOON_ARRAY_PFNS_MAX balloon pages, so we call it
- * multiple times to deflate pages till reaching pages_to_free.
- */
- while (vb->num_pages && pages_freed < pages_to_free)
- pages_freed += leak_balloon_pages(vb,
- pages_to_free - pages_freed);
-
- update_balloon_size(vb);
-
- return pages_freed;
-}
-
static unsigned long virtio_balloon_shrinker_scan(struct shrinker *shrinker,
struct shrink_control *sc)
{
- unsigned long pages_to_free, pages_freed = 0;
struct virtio_balloon *vb = container_of(shrinker,
struct virtio_balloon, shrinker);
- pages_to_free = sc->nr_to_scan;
-
- if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
- pages_freed = shrink_free_pages(vb, pages_to_free);
-
- if (pages_freed >= pages_to_free)
- return pages_freed;
-
- pages_freed += shrink_balloon_pages(vb, pages_to_free - pages_freed);
-
- return pages_freed;
+ return shrink_free_pages(vb, sc->nr_to_scan);
}
static unsigned long virtio_balloon_shrinker_count(struct shrinker *shrinker,
@@ -839,12 +850,22 @@ static unsigned long virtio_balloon_shrinker_count(struct shrinker *shrinker,
{
struct virtio_balloon *vb = container_of(shrinker,
struct virtio_balloon, shrinker);
- unsigned long count;
- count = vb->num_pages / VIRTIO_BALLOON_PAGES_PER_PAGE;
- count += vb->num_free_page_blocks * VIRTIO_BALLOON_HINT_BLOCK_PAGES;
+ return vb->num_free_page_blocks * VIRTIO_BALLOON_HINT_BLOCK_PAGES;
+}
+
+static int virtio_balloon_oom_notify(struct notifier_block *nb,
+ unsigned long dummy, void *parm)
+{
+ struct virtio_balloon *vb = container_of(nb,
+ struct virtio_balloon, oom_nb);
+ unsigned long *freed = parm;
+
+ *freed += leak_balloon(vb, VIRTIO_BALLOON_OOM_NR_PAGES) /
+ VIRTIO_BALLOON_PAGES_PER_PAGE;
+ update_balloon_size(vb);
- return count;
+ return NOTIFY_OK;
}
static void virtio_balloon_unregister_shrinker(struct virtio_balloon *vb)
@@ -864,7 +885,6 @@ static int virtio_balloon_register_shrinker(struct virtio_balloon *vb)
static int virtballoon_probe(struct virtio_device *vdev)
{
struct virtio_balloon *vb;
- __u32 poison_val;
int err;
if (!vdev->config->get) {
@@ -930,27 +950,65 @@ static int virtballoon_probe(struct virtio_device *vdev)
VIRTIO_BALLOON_CMD_ID_STOP);
spin_lock_init(&vb->free_page_list_lock);
INIT_LIST_HEAD(&vb->free_page_list);
- if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)) {
- memset(&poison_val, PAGE_POISON, sizeof(poison_val));
- virtio_cwrite(vb->vdev, struct virtio_balloon_config,
- poison_val, &poison_val);
- }
- }
- /*
- * We continue to use VIRTIO_BALLOON_F_DEFLATE_ON_OOM to decide if a
- * shrinker needs to be registered to relieve memory pressure.
- */
- if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) {
+ /*
+ * We're allowed to reuse any free pages, even if they are
+ * still to be processed by the host.
+ */
err = virtio_balloon_register_shrinker(vb);
if (err)
goto out_del_balloon_wq;
}
+
+ if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) {
+ vb->oom_nb.notifier_call = virtio_balloon_oom_notify;
+ vb->oom_nb.priority = VIRTIO_BALLOON_OOM_NOTIFY_PRIORITY;
+ err = register_oom_notifier(&vb->oom_nb);
+ if (err < 0)
+ goto out_unregister_shrinker;
+ }
+
+ if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)) {
+ /* Start with poison val of 0 representing general init */
+ __u32 poison_val = 0;
+
+ /*
+ * Let the hypervisor know that we are expecting a
+ * specific value to be written back in balloon pages.
+ */
+ if (!want_init_on_free())
+ memset(&poison_val, PAGE_POISON, sizeof(poison_val));
+
+ virtio_cwrite(vb->vdev, struct virtio_balloon_config,
+ poison_val, &poison_val);
+ }
+
+ vb->pr_dev_info.report = virtballoon_free_page_report;
+ if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) {
+ unsigned int capacity;
+
+ capacity = virtqueue_get_vring_size(vb->reporting_vq);
+ if (capacity < PAGE_REPORTING_CAPACITY) {
+ err = -ENOSPC;
+ goto out_unregister_oom;
+ }
+
+ err = page_reporting_register(&vb->pr_dev_info);
+ if (err)
+ goto out_unregister_oom;
+ }
+
virtio_device_ready(vdev);
if (towards_target(vb))
virtballoon_changed(vdev);
return 0;
+out_unregister_oom:
+ if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
+ unregister_oom_notifier(&vb->oom_nb);
+out_unregister_shrinker:
+ if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
+ virtio_balloon_unregister_shrinker(vb);
out_del_balloon_wq:
if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
destroy_workqueue(vb->balloon_wq);
@@ -989,7 +1047,11 @@ static void virtballoon_remove(struct virtio_device *vdev)
{
struct virtio_balloon *vb = vdev->priv;
+ if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING))
+ page_reporting_unregister(&vb->pr_dev_info);
if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
+ unregister_oom_notifier(&vb->oom_nb);
+ if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
virtio_balloon_unregister_shrinker(vb);
spin_lock_irq(&vb->stop_update_lock);
vb->stop_update = true;
@@ -1045,7 +1107,10 @@ static int virtballoon_restore(struct virtio_device *vdev)
static int virtballoon_validate(struct virtio_device *vdev)
{
- if (!page_poisoning_enabled())
+ /* Tell the host whether we care about poisoned pages. */
+ if (!want_init_on_free() &&
+ (IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY) ||
+ !page_poisoning_enabled()))
__virtio_clear_bit(vdev, VIRTIO_BALLOON_F_PAGE_POISON);
__virtio_clear_bit(vdev, VIRTIO_F_IOMMU_PLATFORM);
@@ -1058,6 +1123,7 @@ static unsigned int features[] = {
VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
VIRTIO_BALLOON_F_FREE_PAGE_HINT,
VIRTIO_BALLOON_F_PAGE_POISON,
+ VIRTIO_BALLOON_F_REPORTING,
};
static struct virtio_driver virtio_balloon_driver = {
diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
new file mode 100644
index 000000000000..c30eb55030be
--- /dev/null
+++ b/drivers/virtio/virtio_vdpa.c
@@ -0,0 +1,396 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VIRTIO based driver for vDPA device
+ *
+ * Copyright (c) 2020, Red Hat. All rights reserved.
+ * Author: Jason Wang <jasowang@redhat.com>
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/uuid.h>
+#include <linux/virtio.h>
+#include <linux/vdpa.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_ring.h>
+
+#define MOD_VERSION "0.1"
+#define MOD_AUTHOR "Jason Wang <jasowang@redhat.com>"
+#define MOD_DESC "vDPA bus driver for virtio devices"
+#define MOD_LICENSE "GPL v2"
+
+struct virtio_vdpa_device {
+ struct virtio_device vdev;
+ struct vdpa_device *vdpa;
+ u64 features;
+
+ /* The lock to protect virtqueue list */
+ spinlock_t lock;
+ /* List of virtio_vdpa_vq_info */
+ struct list_head virtqueues;
+};
+
+struct virtio_vdpa_vq_info {
+ /* the actual virtqueue */
+ struct virtqueue *vq;
+
+ /* the list node for the virtqueues list */
+ struct list_head node;
+};
+
+static inline struct virtio_vdpa_device *
+to_virtio_vdpa_device(struct virtio_device *dev)
+{
+ return container_of(dev, struct virtio_vdpa_device, vdev);
+}
+
+static struct vdpa_device *vd_get_vdpa(struct virtio_device *vdev)
+{
+ return to_virtio_vdpa_device(vdev)->vdpa;
+}
+
+static void virtio_vdpa_get(struct virtio_device *vdev, unsigned offset,
+ void *buf, unsigned len)
+{
+ struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+ const struct vdpa_config_ops *ops = vdpa->config;
+
+ ops->get_config(vdpa, offset, buf, len);
+}
+
+static void virtio_vdpa_set(struct virtio_device *vdev, unsigned offset,
+ const void *buf, unsigned len)
+{
+ struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+ const struct vdpa_config_ops *ops = vdpa->config;
+
+ ops->set_config(vdpa, offset, buf, len);
+}
+
+static u32 virtio_vdpa_generation(struct virtio_device *vdev)
+{
+ struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+ const struct vdpa_config_ops *ops = vdpa->config;
+
+ if (ops->get_generation)
+ return ops->get_generation(vdpa);
+
+ return 0;
+}
+
+static u8 virtio_vdpa_get_status(struct virtio_device *vdev)
+{
+ struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+ const struct vdpa_config_ops *ops = vdpa->config;
+
+ return ops->get_status(vdpa);
+}
+
+static void virtio_vdpa_set_status(struct virtio_device *vdev, u8 status)
+{
+ struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+ const struct vdpa_config_ops *ops = vdpa->config;
+
+ return ops->set_status(vdpa, status);
+}
+
+static void virtio_vdpa_reset(struct virtio_device *vdev)
+{
+ struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+ const struct vdpa_config_ops *ops = vdpa->config;
+
+ return ops->set_status(vdpa, 0);
+}
+
+static bool virtio_vdpa_notify(struct virtqueue *vq)
+{
+ struct vdpa_device *vdpa = vd_get_vdpa(vq->vdev);
+ const struct vdpa_config_ops *ops = vdpa->config;
+
+ ops->kick_vq(vdpa, vq->index);
+
+ return true;
+}
+
+static irqreturn_t virtio_vdpa_config_cb(void *private)
+{
+ struct virtio_vdpa_device *vd_dev = private;
+
+ virtio_config_changed(&vd_dev->vdev);
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t virtio_vdpa_virtqueue_cb(void *private)
+{
+ struct virtio_vdpa_vq_info *info = private;
+
+ return vring_interrupt(0, info->vq);
+}
+
+static struct virtqueue *
+virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index,
+ void (*callback)(struct virtqueue *vq),
+ const char *name, bool ctx)
+{
+ struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev);
+ struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+ const struct vdpa_config_ops *ops = vdpa->config;
+ struct virtio_vdpa_vq_info *info;
+ struct vdpa_callback cb;
+ struct virtqueue *vq;
+ u64 desc_addr, driver_addr, device_addr;
+ unsigned long flags;
+ u32 align, num;
+ int err;
+
+ if (!name)
+ return NULL;
+
+ /* Queue shouldn't already be set up. */
+ if (ops->get_vq_ready(vdpa, index))
+ return ERR_PTR(-ENOENT);
+
+ /* Allocate and fill out our active queue description */
+ info = kmalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return ERR_PTR(-ENOMEM);
+
+ num = ops->get_vq_num_max(vdpa);
+ if (num == 0) {
+ err = -ENOENT;
+ goto error_new_virtqueue;
+ }
+
+ /* Create the vring */
+ align = ops->get_vq_align(vdpa);
+ vq = vring_create_virtqueue(index, num, align, vdev,
+ true, true, ctx,
+ virtio_vdpa_notify, callback, name);
+ if (!vq) {
+ err = -ENOMEM;
+ goto error_new_virtqueue;
+ }
+
+ /* Setup virtqueue callback */
+ cb.callback = virtio_vdpa_virtqueue_cb;
+ cb.private = info;
+ ops->set_vq_cb(vdpa, index, &cb);
+ ops->set_vq_num(vdpa, index, virtqueue_get_vring_size(vq));
+
+ desc_addr = virtqueue_get_desc_addr(vq);
+ driver_addr = virtqueue_get_avail_addr(vq);
+ device_addr = virtqueue_get_used_addr(vq);
+
+ if (ops->set_vq_address(vdpa, index,
+ desc_addr, driver_addr,
+ device_addr)) {
+ err = -EINVAL;
+ goto err_vq;
+ }
+
+ ops->set_vq_ready(vdpa, index, 1);
+
+ vq->priv = info;
+ info->vq = vq;
+
+ spin_lock_irqsave(&vd_dev->lock, flags);
+ list_add(&info->node, &vd_dev->virtqueues);
+ spin_unlock_irqrestore(&vd_dev->lock, flags);
+
+ return vq;
+
+err_vq:
+ vring_del_virtqueue(vq);
+error_new_virtqueue:
+ ops->set_vq_ready(vdpa, index, 0);
+ /* VDPA driver should make sure vq is stopeed here */
+ WARN_ON(ops->get_vq_ready(vdpa, index));
+ kfree(info);
+ return ERR_PTR(err);
+}
+
+static void virtio_vdpa_del_vq(struct virtqueue *vq)
+{
+ struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vq->vdev);
+ struct vdpa_device *vdpa = vd_dev->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+ struct virtio_vdpa_vq_info *info = vq->priv;
+ unsigned int index = vq->index;
+ unsigned long flags;
+
+ spin_lock_irqsave(&vd_dev->lock, flags);
+ list_del(&info->node);
+ spin_unlock_irqrestore(&vd_dev->lock, flags);
+
+ /* Select and deactivate the queue */
+ ops->set_vq_ready(vdpa, index, 0);
+ WARN_ON(ops->get_vq_ready(vdpa, index));
+
+ vring_del_virtqueue(vq);
+
+ kfree(info);
+}
+
+static void virtio_vdpa_del_vqs(struct virtio_device *vdev)
+{
+ struct virtqueue *vq, *n;
+
+ list_for_each_entry_safe(vq, n, &vdev->vqs, list)
+ virtio_vdpa_del_vq(vq);
+}
+
+static int virtio_vdpa_find_vqs(struct virtio_device *vdev, unsigned nvqs,
+ struct virtqueue *vqs[],
+ vq_callback_t *callbacks[],
+ const char * const names[],
+ const bool *ctx,
+ struct irq_affinity *desc)
+{
+ struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev);
+ struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+ const struct vdpa_config_ops *ops = vdpa->config;
+ struct vdpa_callback cb;
+ int i, err, queue_idx = 0;
+
+ for (i = 0; i < nvqs; ++i) {
+ if (!names[i]) {
+ vqs[i] = NULL;
+ continue;
+ }
+
+ vqs[i] = virtio_vdpa_setup_vq(vdev, queue_idx++,
+ callbacks[i], names[i], ctx ?
+ ctx[i] : false);
+ if (IS_ERR(vqs[i])) {
+ err = PTR_ERR(vqs[i]);
+ goto err_setup_vq;
+ }
+ }
+
+ cb.callback = virtio_vdpa_config_cb;
+ cb.private = vd_dev;
+ ops->set_config_cb(vdpa, &cb);
+
+ return 0;
+
+err_setup_vq:
+ virtio_vdpa_del_vqs(vdev);
+ return err;
+}
+
+static u64 virtio_vdpa_get_features(struct virtio_device *vdev)
+{
+ struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+ const struct vdpa_config_ops *ops = vdpa->config;
+
+ return ops->get_features(vdpa);
+}
+
+static int virtio_vdpa_finalize_features(struct virtio_device *vdev)
+{
+ struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+ const struct vdpa_config_ops *ops = vdpa->config;
+
+ /* Give virtio_ring a chance to accept features. */
+ vring_transport_features(vdev);
+
+ return ops->set_features(vdpa, vdev->features);
+}
+
+static const char *virtio_vdpa_bus_name(struct virtio_device *vdev)
+{
+ struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev);
+ struct vdpa_device *vdpa = vd_dev->vdpa;
+
+ return dev_name(&vdpa->dev);
+}
+
+static const struct virtio_config_ops virtio_vdpa_config_ops = {
+ .get = virtio_vdpa_get,
+ .set = virtio_vdpa_set,
+ .generation = virtio_vdpa_generation,
+ .get_status = virtio_vdpa_get_status,
+ .set_status = virtio_vdpa_set_status,
+ .reset = virtio_vdpa_reset,
+ .find_vqs = virtio_vdpa_find_vqs,
+ .del_vqs = virtio_vdpa_del_vqs,
+ .get_features = virtio_vdpa_get_features,
+ .finalize_features = virtio_vdpa_finalize_features,
+ .bus_name = virtio_vdpa_bus_name,
+};
+
+static void virtio_vdpa_release_dev(struct device *_d)
+{
+ struct virtio_device *vdev =
+ container_of(_d, struct virtio_device, dev);
+ struct virtio_vdpa_device *vd_dev =
+ container_of(vdev, struct virtio_vdpa_device, vdev);
+
+ kfree(vd_dev);
+}
+
+static int virtio_vdpa_probe(struct vdpa_device *vdpa)
+{
+ const struct vdpa_config_ops *ops = vdpa->config;
+ struct virtio_vdpa_device *vd_dev, *reg_dev = NULL;
+ int ret = -EINVAL;
+
+ vd_dev = kzalloc(sizeof(*vd_dev), GFP_KERNEL);
+ if (!vd_dev)
+ return -ENOMEM;
+
+ vd_dev->vdev.dev.parent = vdpa_get_dma_dev(vdpa);
+ vd_dev->vdev.dev.release = virtio_vdpa_release_dev;
+ vd_dev->vdev.config = &virtio_vdpa_config_ops;
+ vd_dev->vdpa = vdpa;
+ INIT_LIST_HEAD(&vd_dev->virtqueues);
+ spin_lock_init(&vd_dev->lock);
+
+ vd_dev->vdev.id.device = ops->get_device_id(vdpa);
+ if (vd_dev->vdev.id.device == 0)
+ goto err;
+
+ vd_dev->vdev.id.vendor = ops->get_vendor_id(vdpa);
+ ret = register_virtio_device(&vd_dev->vdev);
+ reg_dev = vd_dev;
+ if (ret)
+ goto err;
+
+ vdpa_set_drvdata(vdpa, vd_dev);
+
+ return 0;
+
+err:
+ if (reg_dev)
+ put_device(&vd_dev->vdev.dev);
+ else
+ kfree(vd_dev);
+ return ret;
+}
+
+static void virtio_vdpa_remove(struct vdpa_device *vdpa)
+{
+ struct virtio_vdpa_device *vd_dev = vdpa_get_drvdata(vdpa);
+
+ unregister_virtio_device(&vd_dev->vdev);
+}
+
+static struct vdpa_driver virtio_vdpa_driver = {
+ .driver = {
+ .name = "virtio_vdpa",
+ },
+ .probe = virtio_vdpa_probe,
+ .remove = virtio_vdpa_remove,
+};
+
+module_vdpa_driver(virtio_vdpa_driver);
+
+MODULE_VERSION(MOD_VERSION);
+MODULE_LICENSE(MOD_LICENSE);
+MODULE_AUTHOR(MOD_AUTHOR);
+MODULE_DESCRIPTION(MOD_DESC);
diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
index 9ea2b43d4b01..0663c604bd64 100644
--- a/drivers/watchdog/Kconfig
+++ b/drivers/watchdog/Kconfig
@@ -584,6 +584,14 @@ config DAVINCI_WATCHDOG
NOTE: once enabled, this timer cannot be disabled.
Say N if you are unsure.
+config K3_RTI_WATCHDOG
+ tristate "Texas Instruments K3 RTI watchdog"
+ depends on ARCH_K3 || COMPILE_TEST
+ select WATCHDOG_CORE
+ help
+ Say Y here if you want to include support for the K3 watchdog
+ timer (RTI module) available in the K3 generation of processors.
+
config ORION_WATCHDOG
tristate "Orion watchdog"
depends on ARCH_ORION5X || ARCH_DOVE || MACH_DOVE || ARCH_MVEBU || (COMPILE_TEST && !ARCH_EBSA110)
diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile
index 2ee352bf3372..6de2e4ceef19 100644
--- a/drivers/watchdog/Makefile
+++ b/drivers/watchdog/Makefile
@@ -57,6 +57,7 @@ obj-$(CONFIG_EP93XX_WATCHDOG) += ep93xx_wdt.o
obj-$(CONFIG_PNX4008_WATCHDOG) += pnx4008_wdt.o
obj-$(CONFIG_IOP_WATCHDOG) += iop_wdt.o
obj-$(CONFIG_DAVINCI_WATCHDOG) += davinci_wdt.o
+obj-$(CONFIG_K3_RTI_WATCHDOG) += rti_wdt.o
obj-$(CONFIG_ORION_WATCHDOG) += orion_wdt.o
obj-$(CONFIG_SUNXI_WATCHDOG) += sunxi_wdt.o
obj-$(CONFIG_RN5T618_WATCHDOG) += rn5t618_wdt.o
diff --git a/drivers/watchdog/imx2_wdt.c b/drivers/watchdog/imx2_wdt.c
index f8d58bf0bf66..1fe472f56cb3 100644
--- a/drivers/watchdog/imx2_wdt.c
+++ b/drivers/watchdog/imx2_wdt.c
@@ -244,6 +244,11 @@ static const struct regmap_config imx2_wdt_regmap_config = {
.max_register = 0x8,
};
+static void imx2_wdt_action(void *data)
+{
+ clk_disable_unprepare(data);
+}
+
static int __init imx2_wdt_probe(struct platform_device *pdev)
{
struct device *dev = &pdev->dev;
@@ -292,6 +297,10 @@ static int __init imx2_wdt_probe(struct platform_device *pdev)
if (ret)
return ret;
+ ret = devm_add_action_or_reset(dev, imx2_wdt_action, wdev->clk);
+ if (ret)
+ return ret;
+
regmap_read(wdev->regmap, IMX2_WDT_WRSR, &val);
wdog->bootstatus = val & IMX2_WDT_WRSR_TOUT ? WDIOF_CARDRESET : 0;
@@ -315,32 +324,7 @@ static int __init imx2_wdt_probe(struct platform_device *pdev)
*/
regmap_write(wdev->regmap, IMX2_WDT_WMCR, 0);
- ret = watchdog_register_device(wdog);
- if (ret)
- goto disable_clk;
-
- dev_info(dev, "timeout %d sec (nowayout=%d)\n",
- wdog->timeout, nowayout);
-
- return 0;
-
-disable_clk:
- clk_disable_unprepare(wdev->clk);
- return ret;
-}
-
-static int __exit imx2_wdt_remove(struct platform_device *pdev)
-{
- struct watchdog_device *wdog = platform_get_drvdata(pdev);
- struct imx2_wdt_device *wdev = watchdog_get_drvdata(wdog);
-
- watchdog_unregister_device(wdog);
-
- if (imx2_wdt_is_running(wdev)) {
- imx2_wdt_ping(wdog);
- dev_crit(&pdev->dev, "Device removed: Expect reboot!\n");
- }
- return 0;
+ return devm_watchdog_register_device(dev, wdog);
}
static void imx2_wdt_shutdown(struct platform_device *pdev)
@@ -417,7 +401,6 @@ static const struct of_device_id imx2_wdt_dt_ids[] = {
MODULE_DEVICE_TABLE(of, imx2_wdt_dt_ids);
static struct platform_driver imx2_wdt_driver = {
- .remove = __exit_p(imx2_wdt_remove),
.shutdown = imx2_wdt_shutdown,
.driver = {
.name = DRIVER_NAME,
diff --git a/drivers/watchdog/imx7ulp_wdt.c b/drivers/watchdog/imx7ulp_wdt.c
index 11b9e7c6b7f5..7993c8c41b3a 100644
--- a/drivers/watchdog/imx7ulp_wdt.c
+++ b/drivers/watchdog/imx7ulp_wdt.c
@@ -4,7 +4,6 @@
*/
#include <linux/clk.h>
-#include <linux/init.h>
#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/module.h>
diff --git a/drivers/watchdog/imx_sc_wdt.c b/drivers/watchdog/imx_sc_wdt.c
index 8ed89f032ebf..60a32469f7de 100644
--- a/drivers/watchdog/imx_sc_wdt.c
+++ b/drivers/watchdog/imx_sc_wdt.c
@@ -6,13 +6,11 @@
#include <linux/arm-smccc.h>
#include <linux/firmware/imx/sci.h>
#include <linux/io.h>
-#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/of.h>
#include <linux/platform_device.h>
-#include <linux/reboot.h>
#include <linux/watchdog.h>
#define DEFAULT_TIMEOUT 60
diff --git a/drivers/watchdog/npcm_wdt.c b/drivers/watchdog/npcm_wdt.c
index 9c773c3d6d5d..765577f11c8d 100644
--- a/drivers/watchdog/npcm_wdt.c
+++ b/drivers/watchdog/npcm_wdt.c
@@ -103,30 +103,29 @@ static int npcm_wdt_stop(struct watchdog_device *wdd)
return 0;
}
-
static int npcm_wdt_set_timeout(struct watchdog_device *wdd,
unsigned int timeout)
{
if (timeout < 2)
wdd->timeout = 1;
else if (timeout < 3)
- wdd->timeout = 2;
+ wdd->timeout = 2;
else if (timeout < 6)
- wdd->timeout = 5;
+ wdd->timeout = 5;
else if (timeout < 11)
- wdd->timeout = 10;
+ wdd->timeout = 10;
else if (timeout < 22)
- wdd->timeout = 21;
+ wdd->timeout = 21;
else if (timeout < 44)
- wdd->timeout = 43;
+ wdd->timeout = 43;
else if (timeout < 87)
- wdd->timeout = 86;
+ wdd->timeout = 86;
else if (timeout < 173)
- wdd->timeout = 172;
+ wdd->timeout = 172;
else if (timeout < 688)
- wdd->timeout = 687;
+ wdd->timeout = 687;
else
- wdd->timeout = 2750;
+ wdd->timeout = 2750;
if (watchdog_active(wdd))
npcm_wdt_start(wdd);
diff --git a/drivers/watchdog/orion_wdt.c b/drivers/watchdog/orion_wdt.c
index 8e6dfe76f9c9..4ddb4ea2e4a3 100644
--- a/drivers/watchdog/orion_wdt.c
+++ b/drivers/watchdog/orion_wdt.c
@@ -52,7 +52,7 @@
#define WDT_A370_RATIO (1 << WDT_A370_RATIO_SHIFT)
static bool nowayout = WATCHDOG_NOWAYOUT;
-static int heartbeat = -1; /* module parameter (seconds) */
+static int heartbeat; /* module parameter (seconds) */
struct orion_watchdog;
diff --git a/drivers/watchdog/pm8916_wdt.c b/drivers/watchdog/pm8916_wdt.c
index 1213179f863c..0937b8d33104 100644
--- a/drivers/watchdog/pm8916_wdt.c
+++ b/drivers/watchdog/pm8916_wdt.c
@@ -192,6 +192,7 @@ static int pm8916_wdt_probe(struct platform_device *pdev)
wdt->wdev.timeout = PM8916_WDT_DEFAULT_TIMEOUT;
wdt->wdev.pretimeout = 0;
watchdog_set_drvdata(&wdt->wdev, wdt);
+ platform_set_drvdata(pdev, wdt);
watchdog_init_timeout(&wdt->wdev, 0, dev);
pm8916_wdt_configure_timers(&wdt->wdev);
@@ -199,6 +200,29 @@ static int pm8916_wdt_probe(struct platform_device *pdev)
return devm_watchdog_register_device(dev, &wdt->wdev);
}
+static int __maybe_unused pm8916_wdt_suspend(struct device *dev)
+{
+ struct pm8916_wdt *wdt = dev_get_drvdata(dev);
+
+ if (watchdog_active(&wdt->wdev))
+ return pm8916_wdt_stop(&wdt->wdev);
+
+ return 0;
+}
+
+static int __maybe_unused pm8916_wdt_resume(struct device *dev)
+{
+ struct pm8916_wdt *wdt = dev_get_drvdata(dev);
+
+ if (watchdog_active(&wdt->wdev))
+ return pm8916_wdt_start(&wdt->wdev);
+
+ return 0;
+}
+
+static SIMPLE_DEV_PM_OPS(pm8916_wdt_pm_ops, pm8916_wdt_suspend,
+ pm8916_wdt_resume);
+
static const struct of_device_id pm8916_wdt_id_table[] = {
{ .compatible = "qcom,pm8916-wdt" },
{ }
@@ -210,6 +234,7 @@ static struct platform_driver pm8916_wdt_driver = {
.driver = {
.name = "pm8916-wdt",
.of_match_table = of_match_ptr(pm8916_wdt_id_table),
+ .pm = &pm8916_wdt_pm_ops,
},
};
module_platform_driver(pm8916_wdt_driver);
diff --git a/drivers/watchdog/qcom-wdt.c b/drivers/watchdog/qcom-wdt.c
index eb47fe5ed280..ab7465d186fd 100644
--- a/drivers/watchdog/qcom-wdt.c
+++ b/drivers/watchdog/qcom-wdt.c
@@ -40,6 +40,11 @@ static const u32 reg_offset_data_kpss[] = {
[WDT_BITE_TIME] = 0x14,
};
+struct qcom_wdt_match_data {
+ const u32 *offset;
+ bool pretimeout;
+};
+
struct qcom_wdt {
struct watchdog_device wdd;
unsigned long rate;
@@ -179,19 +184,29 @@ static void qcom_clk_disable_unprepare(void *data)
clk_disable_unprepare(data);
}
+static const struct qcom_wdt_match_data match_data_apcs_tmr = {
+ .offset = reg_offset_data_apcs_tmr,
+ .pretimeout = false,
+};
+
+static const struct qcom_wdt_match_data match_data_kpss = {
+ .offset = reg_offset_data_kpss,
+ .pretimeout = true,
+};
+
static int qcom_wdt_probe(struct platform_device *pdev)
{
struct device *dev = &pdev->dev;
struct qcom_wdt *wdt;
struct resource *res;
struct device_node *np = dev->of_node;
- const u32 *regs;
+ const struct qcom_wdt_match_data *data;
u32 percpu_offset;
int irq, ret;
struct clk *clk;
- regs = of_device_get_match_data(dev);
- if (!regs) {
+ data = of_device_get_match_data(dev);
+ if (!data) {
dev_err(dev, "Unsupported QCOM WDT module\n");
return -ENODEV;
}
@@ -247,9 +262,8 @@ static int qcom_wdt_probe(struct platform_device *pdev)
/* check if there is pretimeout support */
irq = platform_get_irq_optional(pdev, 0);
- if (irq > 0) {
- ret = devm_request_irq(dev, irq, qcom_wdt_isr,
- IRQF_TRIGGER_RISING,
+ if (data->pretimeout && irq > 0) {
+ ret = devm_request_irq(dev, irq, qcom_wdt_isr, 0,
"wdt_bark", &wdt->wdd);
if (ret)
return ret;
@@ -267,7 +281,7 @@ static int qcom_wdt_probe(struct platform_device *pdev)
wdt->wdd.min_timeout = 1;
wdt->wdd.max_timeout = 0x10000000U / wdt->rate;
wdt->wdd.parent = dev;
- wdt->layout = regs;
+ wdt->layout = data->offset;
if (readl(wdt_addr(wdt, WDT_STS)) & 1)
wdt->wdd.bootstatus = WDIOF_CARDRESET;
@@ -311,9 +325,9 @@ static int __maybe_unused qcom_wdt_resume(struct device *dev)
static SIMPLE_DEV_PM_OPS(qcom_wdt_pm_ops, qcom_wdt_suspend, qcom_wdt_resume);
static const struct of_device_id qcom_wdt_of_table[] = {
- { .compatible = "qcom,kpss-timer", .data = reg_offset_data_apcs_tmr },
- { .compatible = "qcom,scss-timer", .data = reg_offset_data_apcs_tmr },
- { .compatible = "qcom,kpss-wdt", .data = reg_offset_data_kpss },
+ { .compatible = "qcom,kpss-timer", .data = &match_data_apcs_tmr },
+ { .compatible = "qcom,scss-timer", .data = &match_data_apcs_tmr },
+ { .compatible = "qcom,kpss-wdt", .data = &match_data_kpss },
{ },
};
MODULE_DEVICE_TABLE(of, qcom_wdt_of_table);
diff --git a/drivers/watchdog/rti_wdt.c b/drivers/watchdog/rti_wdt.c
new file mode 100644
index 000000000000..d456dd72d99a
--- /dev/null
+++ b/drivers/watchdog/rti_wdt.c
@@ -0,0 +1,255 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Watchdog driver for the K3 RTI module
+ *
+ * (c) Copyright 2019-2020 Texas Instruments Inc.
+ * All rights reserved.
+ */
+
+#include <linux/clk.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/mod_devicetable.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
+#include <linux/types.h>
+#include <linux/watchdog.h>
+
+#define DEFAULT_HEARTBEAT 60
+
+/* Max heartbeat is calculated at 32kHz source clock */
+#define MAX_HEARTBEAT 1000
+
+/* Timer register set definition */
+#define RTIDWDCTRL 0x90
+#define RTIDWDPRLD 0x94
+#define RTIWDSTATUS 0x98
+#define RTIWDKEY 0x9c
+#define RTIDWDCNTR 0xa0
+#define RTIWWDRXCTRL 0xa4
+#define RTIWWDSIZECTRL 0xa8
+
+#define RTIWWDRX_NMI 0xa
+
+#define RTIWWDSIZE_50P 0x50
+
+#define WDENABLE_KEY 0xa98559da
+
+#define WDKEY_SEQ0 0xe51a
+#define WDKEY_SEQ1 0xa35c
+
+#define WDT_PRELOAD_SHIFT 13
+
+#define WDT_PRELOAD_MAX 0xfff
+
+#define DWDST BIT(1)
+
+static int heartbeat;
+
+/*
+ * struct to hold data for each WDT device
+ * @base - base io address of WD device
+ * @freq - source clock frequency of WDT
+ * @wdd - hold watchdog device as is in WDT core
+ */
+struct rti_wdt_device {
+ void __iomem *base;
+ unsigned long freq;
+ struct watchdog_device wdd;
+};
+
+static int rti_wdt_start(struct watchdog_device *wdd)
+{
+ u32 timer_margin;
+ struct rti_wdt_device *wdt = watchdog_get_drvdata(wdd);
+
+ /* set timeout period */
+ timer_margin = (u64)wdd->timeout * wdt->freq;
+ timer_margin >>= WDT_PRELOAD_SHIFT;
+ if (timer_margin > WDT_PRELOAD_MAX)
+ timer_margin = WDT_PRELOAD_MAX;
+ writel_relaxed(timer_margin, wdt->base + RTIDWDPRLD);
+
+ /*
+ * RTI only supports a windowed mode, where the watchdog can only
+ * be petted during the open window; not too early or not too late.
+ * The HW configuration options only allow for the open window size
+ * to be 50% or less than that; we obviouly want to configure the open
+ * window as large as possible so we select the 50% option. To avoid
+ * any glitches, we accommodate 5% safety margin also, so we setup
+ * the min_hw_hearbeat at 55% of the timeout period.
+ */
+ wdd->min_hw_heartbeat_ms = 11 * wdd->timeout * 1000 / 20;
+
+ /* Generate NMI when wdt expires */
+ writel_relaxed(RTIWWDRX_NMI, wdt->base + RTIWWDRXCTRL);
+
+ /* Open window size 50%; this is the largest window size available */
+ writel_relaxed(RTIWWDSIZE_50P, wdt->base + RTIWWDSIZECTRL);
+
+ readl_relaxed(wdt->base + RTIWWDSIZECTRL);
+
+ /* enable watchdog */
+ writel_relaxed(WDENABLE_KEY, wdt->base + RTIDWDCTRL);
+ return 0;
+}
+
+static int rti_wdt_ping(struct watchdog_device *wdd)
+{
+ struct rti_wdt_device *wdt = watchdog_get_drvdata(wdd);
+
+ /* put watchdog in service state */
+ writel_relaxed(WDKEY_SEQ0, wdt->base + RTIWDKEY);
+ /* put watchdog in active state */
+ writel_relaxed(WDKEY_SEQ1, wdt->base + RTIWDKEY);
+
+ return 0;
+}
+
+static unsigned int rti_wdt_get_timeleft(struct watchdog_device *wdd)
+{
+ u64 timer_counter;
+ u32 val;
+ struct rti_wdt_device *wdt = watchdog_get_drvdata(wdd);
+
+ /* if timeout has occurred then return 0 */
+ val = readl_relaxed(wdt->base + RTIWDSTATUS);
+ if (val & DWDST)
+ return 0;
+
+ timer_counter = readl_relaxed(wdt->base + RTIDWDCNTR);
+
+ do_div(timer_counter, wdt->freq);
+
+ return timer_counter;
+}
+
+static const struct watchdog_info rti_wdt_info = {
+ .options = WDIOF_KEEPALIVEPING,
+ .identity = "K3 RTI Watchdog",
+};
+
+static const struct watchdog_ops rti_wdt_ops = {
+ .owner = THIS_MODULE,
+ .start = rti_wdt_start,
+ .ping = rti_wdt_ping,
+ .get_timeleft = rti_wdt_get_timeleft,
+};
+
+static int rti_wdt_probe(struct platform_device *pdev)
+{
+ int ret = 0;
+ struct device *dev = &pdev->dev;
+ struct resource *wdt_mem;
+ struct watchdog_device *wdd;
+ struct rti_wdt_device *wdt;
+ struct clk *clk;
+
+ wdt = devm_kzalloc(dev, sizeof(*wdt), GFP_KERNEL);
+ if (!wdt)
+ return -ENOMEM;
+
+ clk = clk_get(dev, NULL);
+ if (IS_ERR(clk)) {
+ if (PTR_ERR(clk) != -EPROBE_DEFER)
+ dev_err(dev, "failed to get clock\n");
+ return PTR_ERR(clk);
+ }
+
+ wdt->freq = clk_get_rate(clk);
+
+ clk_put(clk);
+
+ if (!wdt->freq) {
+ dev_err(dev, "Failed to get fck rate.\n");
+ return -EINVAL;
+ }
+
+ pm_runtime_enable(dev);
+ ret = pm_runtime_get_sync(dev);
+ if (ret) {
+ if (ret != -EPROBE_DEFER)
+ dev_err(&pdev->dev, "runtime pm failed\n");
+ return ret;
+ }
+
+ platform_set_drvdata(pdev, wdt);
+
+ wdd = &wdt->wdd;
+ wdd->info = &rti_wdt_info;
+ wdd->ops = &rti_wdt_ops;
+ wdd->min_timeout = 1;
+ wdd->max_hw_heartbeat_ms = (WDT_PRELOAD_MAX << WDT_PRELOAD_SHIFT) /
+ wdt->freq * 1000;
+ wdd->timeout = DEFAULT_HEARTBEAT;
+ wdd->parent = dev;
+
+ watchdog_init_timeout(wdd, heartbeat, dev);
+
+ watchdog_set_drvdata(wdd, wdt);
+ watchdog_set_nowayout(wdd, 1);
+ watchdog_set_restart_priority(wdd, 128);
+
+ wdt_mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ wdt->base = devm_ioremap_resource(dev, wdt_mem);
+ if (IS_ERR(wdt->base)) {
+ ret = PTR_ERR(wdt->base);
+ goto err_iomap;
+ }
+
+ ret = watchdog_register_device(wdd);
+ if (ret) {
+ dev_err(dev, "cannot register watchdog device\n");
+ goto err_iomap;
+ }
+
+ return 0;
+
+err_iomap:
+ pm_runtime_put_sync(&pdev->dev);
+
+ return ret;
+}
+
+static int rti_wdt_remove(struct platform_device *pdev)
+{
+ struct rti_wdt_device *wdt = platform_get_drvdata(pdev);
+
+ watchdog_unregister_device(&wdt->wdd);
+ pm_runtime_put(&pdev->dev);
+
+ return 0;
+}
+
+static const struct of_device_id rti_wdt_of_match[] = {
+ { .compatible = "ti,j7-rti-wdt", },
+ {},
+};
+MODULE_DEVICE_TABLE(of, rti_wdt_of_match);
+
+static struct platform_driver rti_wdt_driver = {
+ .driver = {
+ .name = "rti-wdt",
+ .of_match_table = rti_wdt_of_match,
+ },
+ .probe = rti_wdt_probe,
+ .remove = rti_wdt_remove,
+};
+
+module_platform_driver(rti_wdt_driver);
+
+MODULE_AUTHOR("Tero Kristo <t-kristo@ti.com>");
+MODULE_DESCRIPTION("K3 RTI Watchdog Driver");
+
+module_param(heartbeat, int, 0);
+MODULE_PARM_DESC(heartbeat,
+ "Watchdog heartbeat period in seconds from 1 to "
+ __MODULE_STRING(MAX_HEARTBEAT) ", default "
+ __MODULE_STRING(DEFAULT_HEARTBEAT));
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:rti-wdt");
diff --git a/drivers/watchdog/watchdog_core.c b/drivers/watchdog/watchdog_core.c
index 861daf4f37b2..423844757812 100644
--- a/drivers/watchdog/watchdog_core.c
+++ b/drivers/watchdog/watchdog_core.c
@@ -39,6 +39,10 @@
static DEFINE_IDA(watchdog_ida);
+static int stop_on_reboot = -1;
+module_param(stop_on_reboot, int, 0444);
+MODULE_PARM_DESC(stop_on_reboot, "Stop watchdogs on reboot (0=keep watching, 1=stop)");
+
/*
* Deferred Registration infrastructure.
*
@@ -254,6 +258,14 @@ static int __watchdog_register_device(struct watchdog_device *wdd)
}
}
+ /* Module parameter to force watchdog policy on reboot. */
+ if (stop_on_reboot != -1) {
+ if (stop_on_reboot)
+ set_bit(WDOG_STOP_ON_REBOOT, &wdd->status);
+ else
+ clear_bit(WDOG_STOP_ON_REBOOT, &wdd->status);
+ }
+
if (test_bit(WDOG_STOP_ON_REBOOT, &wdd->status)) {
wdd->reboot_nb.notifier_call = watchdog_reboot_notifier;
diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c
index 8b5c742f24e8..7e4cd34a8c20 100644
--- a/drivers/watchdog/watchdog_dev.c
+++ b/drivers/watchdog/watchdog_dev.c
@@ -282,6 +282,7 @@ static int watchdog_start(struct watchdog_device *wdd)
if (err == 0) {
set_bit(WDOG_ACTIVE, &wdd->status);
wd_data->last_keepalive = started_at;
+ wd_data->last_hw_keepalive = started_at;
watchdog_update_worker(wdd);
}
diff --git a/drivers/watchdog/wm831x_wdt.c b/drivers/watchdog/wm831x_wdt.c
index 030ce240620d..d96ad8f38bd2 100644
--- a/drivers/watchdog/wm831x_wdt.c
+++ b/drivers/watchdog/wm831x_wdt.c
@@ -13,7 +13,6 @@
#include <linux/platform_device.h>
#include <linux/watchdog.h>
#include <linux/uaccess.h>
-#include <linux/gpio.h>
#include <linux/mfd/wm831x/core.h>
#include <linux/mfd/wm831x/pdata.h>
@@ -29,7 +28,6 @@ struct wm831x_wdt_drvdata {
struct watchdog_device wdt;
struct wm831x *wm831x;
struct mutex lock;
- int update_gpio;
int update_state;
};
@@ -103,14 +101,6 @@ static int wm831x_wdt_ping(struct watchdog_device *wdt_dev)
mutex_lock(&driver_data->lock);
- if (driver_data->update_gpio) {
- gpio_set_value_cansleep(driver_data->update_gpio,
- driver_data->update_state);
- driver_data->update_state = !driver_data->update_state;
- ret = 0;
- goto out;
- }
-
reg = wm831x_reg_read(wm831x, WM831X_WATCHDOG);
if (!(reg & WM831X_WDOG_RST_SRC)) {
@@ -239,23 +229,6 @@ static int wm831x_wdt_probe(struct platform_device *pdev)
reg |= pdata->secondary << WM831X_WDOG_SECACT_SHIFT;
reg |= pdata->software << WM831X_WDOG_RST_SRC_SHIFT;
- if (pdata->update_gpio) {
- ret = devm_gpio_request_one(dev, pdata->update_gpio,
- GPIOF_OUT_INIT_LOW,
- "Watchdog update");
- if (ret < 0) {
- dev_err(wm831x->dev,
- "Failed to request update GPIO: %d\n",
- ret);
- return ret;
- }
-
- driver_data->update_gpio = pdata->update_gpio;
-
- /* Make sure the watchdog takes hardware updates */
- reg |= WM831X_WDOG_RST_SRC;
- }
-
ret = wm831x_reg_unlock(wm831x);
if (ret == 0) {
ret = wm831x_reg_write(wm831x, WM831X_WATCHDOG, reg);
diff --git a/drivers/watchdog/ziirave_wdt.c b/drivers/watchdog/ziirave_wdt.c
index 4a363a8b2d20..cab86a08456b 100644
--- a/drivers/watchdog/ziirave_wdt.c
+++ b/drivers/watchdog/ziirave_wdt.c
@@ -422,7 +422,7 @@ static int ziirave_firm_upload(struct watchdog_device *wdd,
static const struct watchdog_info ziirave_wdt_info = {
.options = WDIOF_SETTIMEOUT | WDIOF_MAGICCLOSE | WDIOF_KEEPALIVEPING,
- .identity = "Zodiac RAVE Watchdog",
+ .identity = "RAVE Switch Watchdog",
};
static const struct watchdog_ops ziirave_wdt_ops = {
diff --git a/drivers/xen/events/events_2l.c b/drivers/xen/events/events_2l.c
index 8edef51c92e5..64df919a2111 100644
--- a/drivers/xen/events/events_2l.c
+++ b/drivers/xen/events/events_2l.c
@@ -53,37 +53,37 @@ static void evtchn_2l_bind_to_cpu(struct irq_info *info, unsigned cpu)
set_bit(info->evtchn, BM(per_cpu(cpu_evtchn_mask, cpu)));
}
-static void evtchn_2l_clear_pending(unsigned port)
+static void evtchn_2l_clear_pending(evtchn_port_t port)
{
struct shared_info *s = HYPERVISOR_shared_info;
sync_clear_bit(port, BM(&s->evtchn_pending[0]));
}
-static void evtchn_2l_set_pending(unsigned port)
+static void evtchn_2l_set_pending(evtchn_port_t port)
{
struct shared_info *s = HYPERVISOR_shared_info;
sync_set_bit(port, BM(&s->evtchn_pending[0]));
}
-static bool evtchn_2l_is_pending(unsigned port)
+static bool evtchn_2l_is_pending(evtchn_port_t port)
{
struct shared_info *s = HYPERVISOR_shared_info;
return sync_test_bit(port, BM(&s->evtchn_pending[0]));
}
-static bool evtchn_2l_test_and_set_mask(unsigned port)
+static bool evtchn_2l_test_and_set_mask(evtchn_port_t port)
{
struct shared_info *s = HYPERVISOR_shared_info;
return sync_test_and_set_bit(port, BM(&s->evtchn_mask[0]));
}
-static void evtchn_2l_mask(unsigned port)
+static void evtchn_2l_mask(evtchn_port_t port)
{
struct shared_info *s = HYPERVISOR_shared_info;
sync_set_bit(port, BM(&s->evtchn_mask[0]));
}
-static void evtchn_2l_unmask(unsigned port)
+static void evtchn_2l_unmask(evtchn_port_t port)
{
struct shared_info *s = HYPERVISOR_shared_info;
unsigned int cpu = get_cpu();
@@ -173,7 +173,7 @@ static void evtchn_2l_handle_events(unsigned cpu)
/* Timer interrupt has highest priority. */
irq = irq_from_virq(cpu, VIRQ_TIMER);
if (irq != -1) {
- unsigned int evtchn = evtchn_from_irq(irq);
+ evtchn_port_t evtchn = evtchn_from_irq(irq);
word_idx = evtchn / BITS_PER_LONG;
bit_idx = evtchn % BITS_PER_LONG;
if (active_evtchns(cpu, s, word_idx) & (1ULL << bit_idx))
@@ -228,7 +228,7 @@ static void evtchn_2l_handle_events(unsigned cpu)
do {
xen_ulong_t bits;
- int port;
+ evtchn_port_t port;
bits = MASK_LSBS(pending_bits, bit_idx);
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index 499eff7d3f65..3a791c8485d0 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -116,7 +116,7 @@ static void clear_evtchn_to_irq_all(void)
}
}
-static int set_evtchn_to_irq(unsigned evtchn, unsigned irq)
+static int set_evtchn_to_irq(evtchn_port_t evtchn, unsigned int irq)
{
unsigned row;
unsigned col;
@@ -143,7 +143,7 @@ static int set_evtchn_to_irq(unsigned evtchn, unsigned irq)
return 0;
}
-int get_evtchn_to_irq(unsigned evtchn)
+int get_evtchn_to_irq(evtchn_port_t evtchn)
{
if (evtchn >= xen_evtchn_max_channels())
return -1;
@@ -162,7 +162,7 @@ struct irq_info *info_for_irq(unsigned irq)
static int xen_irq_info_common_setup(struct irq_info *info,
unsigned irq,
enum xen_irq_type type,
- unsigned evtchn,
+ evtchn_port_t evtchn,
unsigned short cpu)
{
int ret;
@@ -184,7 +184,7 @@ static int xen_irq_info_common_setup(struct irq_info *info,
}
static int xen_irq_info_evtchn_setup(unsigned irq,
- unsigned evtchn)
+ evtchn_port_t evtchn)
{
struct irq_info *info = info_for_irq(irq);
@@ -193,7 +193,7 @@ static int xen_irq_info_evtchn_setup(unsigned irq,
static int xen_irq_info_ipi_setup(unsigned cpu,
unsigned irq,
- unsigned evtchn,
+ evtchn_port_t evtchn,
enum ipi_vector ipi)
{
struct irq_info *info = info_for_irq(irq);
@@ -207,7 +207,7 @@ static int xen_irq_info_ipi_setup(unsigned cpu,
static int xen_irq_info_virq_setup(unsigned cpu,
unsigned irq,
- unsigned evtchn,
+ evtchn_port_t evtchn,
unsigned virq)
{
struct irq_info *info = info_for_irq(irq);
@@ -220,7 +220,7 @@ static int xen_irq_info_virq_setup(unsigned cpu,
}
static int xen_irq_info_pirq_setup(unsigned irq,
- unsigned evtchn,
+ evtchn_port_t evtchn,
unsigned pirq,
unsigned gsi,
uint16_t domid,
@@ -245,7 +245,7 @@ static void xen_irq_info_cleanup(struct irq_info *info)
/*
* Accessors for packed IRQ information.
*/
-unsigned int evtchn_from_irq(unsigned irq)
+evtchn_port_t evtchn_from_irq(unsigned irq)
{
if (WARN(irq >= nr_irqs, "Invalid irq %d!\n", irq))
return 0;
@@ -253,7 +253,7 @@ unsigned int evtchn_from_irq(unsigned irq)
return info_for_irq(irq)->evtchn;
}
-unsigned irq_from_evtchn(unsigned int evtchn)
+unsigned int irq_from_evtchn(evtchn_port_t evtchn)
{
return get_evtchn_to_irq(evtchn);
}
@@ -304,7 +304,7 @@ unsigned cpu_from_irq(unsigned irq)
return info_for_irq(irq)->cpu;
}
-unsigned int cpu_from_evtchn(unsigned int evtchn)
+unsigned int cpu_from_evtchn(evtchn_port_t evtchn)
{
int irq = get_evtchn_to_irq(evtchn);
unsigned ret = 0;
@@ -330,9 +330,9 @@ static bool pirq_needs_eoi_flag(unsigned irq)
return info->u.pirq.flags & PIRQ_NEEDS_EOI;
}
-static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
+static void bind_evtchn_to_cpu(evtchn_port_t evtchn, unsigned int cpu)
{
- int irq = get_evtchn_to_irq(chn);
+ int irq = get_evtchn_to_irq(evtchn);
struct irq_info *info = info_for_irq(irq);
BUG_ON(irq == -1);
@@ -354,7 +354,7 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
*/
void notify_remote_via_irq(int irq)
{
- int evtchn = evtchn_from_irq(irq);
+ evtchn_port_t evtchn = evtchn_from_irq(irq);
if (VALID_EVTCHN(evtchn))
notify_remote_via_evtchn(evtchn);
@@ -445,7 +445,7 @@ static void xen_free_irq(unsigned irq)
irq_free_desc(irq);
}
-static void xen_evtchn_close(unsigned int port)
+static void xen_evtchn_close(evtchn_port_t port)
{
struct evtchn_close close;
@@ -472,7 +472,7 @@ static void pirq_query_unmask(int irq)
static void eoi_pirq(struct irq_data *data)
{
- int evtchn = evtchn_from_irq(data->irq);
+ evtchn_port_t evtchn = evtchn_from_irq(data->irq);
struct physdev_eoi eoi = { .irq = pirq_from_irq(data->irq) };
int rc = 0;
@@ -508,7 +508,7 @@ static unsigned int __startup_pirq(unsigned int irq)
{
struct evtchn_bind_pirq bind_pirq;
struct irq_info *info = info_for_irq(irq);
- int evtchn = evtchn_from_irq(irq);
+ evtchn_port_t evtchn = evtchn_from_irq(irq);
int rc;
BUG_ON(info->type != IRQT_PIRQ);
@@ -561,7 +561,7 @@ static void shutdown_pirq(struct irq_data *data)
{
unsigned int irq = data->irq;
struct irq_info *info = info_for_irq(irq);
- unsigned evtchn = evtchn_from_irq(irq);
+ evtchn_port_t evtchn = evtchn_from_irq(irq);
BUG_ON(info->type != IRQT_PIRQ);
@@ -601,7 +601,7 @@ EXPORT_SYMBOL_GPL(xen_irq_from_gsi);
static void __unbind_from_irq(unsigned int irq)
{
- int evtchn = evtchn_from_irq(irq);
+ evtchn_port_t evtchn = evtchn_from_irq(irq);
struct irq_info *info = irq_get_handler_data(irq);
if (info->refcnt > 0) {
@@ -827,7 +827,7 @@ int xen_pirq_from_irq(unsigned irq)
}
EXPORT_SYMBOL_GPL(xen_pirq_from_irq);
-int bind_evtchn_to_irq(unsigned int evtchn)
+int bind_evtchn_to_irq(evtchn_port_t evtchn)
{
int irq;
int ret;
@@ -870,8 +870,8 @@ EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);
static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
{
struct evtchn_bind_ipi bind_ipi;
- int evtchn, irq;
- int ret;
+ evtchn_port_t evtchn;
+ int ret, irq;
mutex_lock(&irq_mapping_update_lock);
@@ -909,7 +909,7 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
}
int bind_interdomain_evtchn_to_irq(unsigned int remote_domain,
- unsigned int remote_port)
+ evtchn_port_t remote_port)
{
struct evtchn_bind_interdomain bind_interdomain;
int err;
@@ -924,10 +924,11 @@ int bind_interdomain_evtchn_to_irq(unsigned int remote_domain,
}
EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irq);
-static int find_virq(unsigned int virq, unsigned int cpu)
+static int find_virq(unsigned int virq, unsigned int cpu, evtchn_port_t *evtchn)
{
struct evtchn_status status;
- int port, rc = -ENOENT;
+ evtchn_port_t port;
+ int rc = -ENOENT;
memset(&status, 0, sizeof(status));
for (port = 0; port < xen_evtchn_max_channels(); port++) {
@@ -939,7 +940,7 @@ static int find_virq(unsigned int virq, unsigned int cpu)
if (status.status != EVTCHNSTAT_virq)
continue;
if (status.u.virq == virq && status.vcpu == xen_vcpu_nr(cpu)) {
- rc = port;
+ *evtchn = port;
break;
}
}
@@ -962,7 +963,8 @@ EXPORT_SYMBOL_GPL(xen_evtchn_nr_channels);
int bind_virq_to_irq(unsigned int virq, unsigned int cpu, bool percpu)
{
struct evtchn_bind_virq bind_virq;
- int evtchn, irq, ret;
+ evtchn_port_t evtchn = 0;
+ int irq, ret;
mutex_lock(&irq_mapping_update_lock);
@@ -988,9 +990,8 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu, bool percpu)
evtchn = bind_virq.port;
else {
if (ret == -EEXIST)
- ret = find_virq(virq, cpu);
+ ret = find_virq(virq, cpu, &evtchn);
BUG_ON(ret < 0);
- evtchn = ret;
}
ret = xen_irq_info_virq_setup(cpu, irq, evtchn, virq);
@@ -1019,7 +1020,7 @@ static void unbind_from_irq(unsigned int irq)
mutex_unlock(&irq_mapping_update_lock);
}
-int bind_evtchn_to_irqhandler(unsigned int evtchn,
+int bind_evtchn_to_irqhandler(evtchn_port_t evtchn,
irq_handler_t handler,
unsigned long irqflags,
const char *devname, void *dev_id)
@@ -1040,7 +1041,7 @@ int bind_evtchn_to_irqhandler(unsigned int evtchn,
EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
- unsigned int remote_port,
+ evtchn_port_t remote_port,
irq_handler_t handler,
unsigned long irqflags,
const char *devname,
@@ -1132,7 +1133,7 @@ int xen_set_irq_priority(unsigned irq, unsigned priority)
}
EXPORT_SYMBOL_GPL(xen_set_irq_priority);
-int evtchn_make_refcounted(unsigned int evtchn)
+int evtchn_make_refcounted(evtchn_port_t evtchn)
{
int irq = get_evtchn_to_irq(evtchn);
struct irq_info *info;
@@ -1153,7 +1154,7 @@ int evtchn_make_refcounted(unsigned int evtchn)
}
EXPORT_SYMBOL_GPL(evtchn_make_refcounted);
-int evtchn_get(unsigned int evtchn)
+int evtchn_get(evtchn_port_t evtchn)
{
int irq;
struct irq_info *info;
@@ -1186,7 +1187,7 @@ int evtchn_get(unsigned int evtchn)
}
EXPORT_SYMBOL_GPL(evtchn_get);
-void evtchn_put(unsigned int evtchn)
+void evtchn_put(evtchn_port_t evtchn)
{
int irq = get_evtchn_to_irq(evtchn);
if (WARN_ON(irq == -1))
@@ -1252,7 +1253,7 @@ void xen_hvm_evtchn_do_upcall(void)
EXPORT_SYMBOL_GPL(xen_hvm_evtchn_do_upcall);
/* Rebind a new event channel to an existing irq. */
-void rebind_evtchn_irq(int evtchn, int irq)
+void rebind_evtchn_irq(evtchn_port_t evtchn, int irq)
{
struct irq_info *info = info_for_irq(irq);
@@ -1284,7 +1285,7 @@ void rebind_evtchn_irq(int evtchn, int irq)
}
/* Rebind an evtchn so that it gets delivered to a specific cpu */
-static int xen_rebind_evtchn_to_cpu(int evtchn, unsigned int tcpu)
+static int xen_rebind_evtchn_to_cpu(evtchn_port_t evtchn, unsigned int tcpu)
{
struct evtchn_bind_vcpu bind_vcpu;
int masked;
@@ -1342,7 +1343,7 @@ EXPORT_SYMBOL_GPL(xen_set_affinity_evtchn);
static void enable_dynirq(struct irq_data *data)
{
- int evtchn = evtchn_from_irq(data->irq);
+ evtchn_port_t evtchn = evtchn_from_irq(data->irq);
if (VALID_EVTCHN(evtchn))
unmask_evtchn(evtchn);
@@ -1350,7 +1351,7 @@ static void enable_dynirq(struct irq_data *data)
static void disable_dynirq(struct irq_data *data)
{
- int evtchn = evtchn_from_irq(data->irq);
+ evtchn_port_t evtchn = evtchn_from_irq(data->irq);
if (VALID_EVTCHN(evtchn))
mask_evtchn(evtchn);
@@ -1358,7 +1359,7 @@ static void disable_dynirq(struct irq_data *data)
static void ack_dynirq(struct irq_data *data)
{
- int evtchn = evtchn_from_irq(data->irq);
+ evtchn_port_t evtchn = evtchn_from_irq(data->irq);
if (!VALID_EVTCHN(evtchn))
return;
@@ -1385,7 +1386,7 @@ static void mask_ack_dynirq(struct irq_data *data)
static int retrigger_dynirq(struct irq_data *data)
{
- unsigned int evtchn = evtchn_from_irq(data->irq);
+ evtchn_port_t evtchn = evtchn_from_irq(data->irq);
int masked;
if (!VALID_EVTCHN(evtchn))
@@ -1440,7 +1441,8 @@ static void restore_pirqs(void)
static void restore_cpu_virqs(unsigned int cpu)
{
struct evtchn_bind_virq bind_virq;
- int virq, irq, evtchn;
+ evtchn_port_t evtchn;
+ int virq, irq;
for (virq = 0; virq < NR_VIRQS; virq++) {
if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1)
@@ -1465,7 +1467,8 @@ static void restore_cpu_virqs(unsigned int cpu)
static void restore_cpu_ipis(unsigned int cpu)
{
struct evtchn_bind_ipi bind_ipi;
- int ipi, irq, evtchn;
+ evtchn_port_t evtchn;
+ int ipi, irq;
for (ipi = 0; ipi < XEN_NR_IPIS; ipi++) {
if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1)
@@ -1489,7 +1492,7 @@ static void restore_cpu_ipis(unsigned int cpu)
/* Clear an irq's pending state, in preparation for polling on it */
void xen_clear_irq_pending(int irq)
{
- int evtchn = evtchn_from_irq(irq);
+ evtchn_port_t evtchn = evtchn_from_irq(irq);
if (VALID_EVTCHN(evtchn))
clear_evtchn(evtchn);
@@ -1497,7 +1500,7 @@ void xen_clear_irq_pending(int irq)
EXPORT_SYMBOL(xen_clear_irq_pending);
void xen_set_irq_pending(int irq)
{
- int evtchn = evtchn_from_irq(irq);
+ evtchn_port_t evtchn = evtchn_from_irq(irq);
if (VALID_EVTCHN(evtchn))
set_evtchn(evtchn);
@@ -1505,7 +1508,7 @@ void xen_set_irq_pending(int irq)
bool xen_test_irq_pending(int irq)
{
- int evtchn = evtchn_from_irq(irq);
+ evtchn_port_t evtchn = evtchn_from_irq(irq);
bool ret = false;
if (VALID_EVTCHN(evtchn))
@@ -1667,7 +1670,7 @@ module_param(fifo_events, bool, 0);
void __init xen_init_IRQ(void)
{
int ret = -EINVAL;
- unsigned int evtchn;
+ evtchn_port_t evtchn;
if (fifo_events)
ret = xen_evtchn_fifo_init();
diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c
index 76b318e88382..c60ee0450173 100644
--- a/drivers/xen/events/events_fifo.c
+++ b/drivers/xen/events/events_fifo.c
@@ -82,7 +82,7 @@ static unsigned event_array_pages __read_mostly;
#endif
-static inline event_word_t *event_word_from_port(unsigned port)
+static inline event_word_t *event_word_from_port(evtchn_port_t port)
{
unsigned i = port / EVENT_WORDS_PER_PAGE;
@@ -140,7 +140,7 @@ static void init_array_page(event_word_t *array_page)
static int evtchn_fifo_setup(struct irq_info *info)
{
- unsigned port = info->evtchn;
+ evtchn_port_t port = info->evtchn;
unsigned new_array_pages;
int ret;
@@ -191,37 +191,37 @@ static void evtchn_fifo_bind_to_cpu(struct irq_info *info, unsigned cpu)
/* no-op */
}
-static void evtchn_fifo_clear_pending(unsigned port)
+static void evtchn_fifo_clear_pending(evtchn_port_t port)
{
event_word_t *word = event_word_from_port(port);
sync_clear_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word));
}
-static void evtchn_fifo_set_pending(unsigned port)
+static void evtchn_fifo_set_pending(evtchn_port_t port)
{
event_word_t *word = event_word_from_port(port);
sync_set_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word));
}
-static bool evtchn_fifo_is_pending(unsigned port)
+static bool evtchn_fifo_is_pending(evtchn_port_t port)
{
event_word_t *word = event_word_from_port(port);
return sync_test_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word));
}
-static bool evtchn_fifo_test_and_set_mask(unsigned port)
+static bool evtchn_fifo_test_and_set_mask(evtchn_port_t port)
{
event_word_t *word = event_word_from_port(port);
return sync_test_and_set_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word));
}
-static void evtchn_fifo_mask(unsigned port)
+static void evtchn_fifo_mask(evtchn_port_t port)
{
event_word_t *word = event_word_from_port(port);
sync_set_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word));
}
-static bool evtchn_fifo_is_masked(unsigned port)
+static bool evtchn_fifo_is_masked(evtchn_port_t port)
{
event_word_t *word = event_word_from_port(port);
return sync_test_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word));
@@ -242,7 +242,7 @@ static void clear_masked(volatile event_word_t *word)
} while (w != old);
}
-static void evtchn_fifo_unmask(unsigned port)
+static void evtchn_fifo_unmask(evtchn_port_t port)
{
event_word_t *word = event_word_from_port(port);
@@ -270,7 +270,7 @@ static uint32_t clear_linked(volatile event_word_t *word)
return w & EVTCHN_FIFO_LINK_MASK;
}
-static void handle_irq_for_port(unsigned port)
+static void handle_irq_for_port(evtchn_port_t port)
{
int irq;
@@ -286,7 +286,7 @@ static void consume_one_event(unsigned cpu,
{
struct evtchn_fifo_queue *q = &per_cpu(cpu_queue, cpu);
uint32_t head;
- unsigned port;
+ evtchn_port_t port;
event_word_t *word;
head = q->head[priority];
diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h
index 82938cff6c7a..10684feb094e 100644
--- a/drivers/xen/events/events_internal.h
+++ b/drivers/xen/events/events_internal.h
@@ -33,7 +33,7 @@ struct irq_info {
int refcnt;
enum xen_irq_type type; /* type */
unsigned irq;
- unsigned int evtchn; /* event channel */
+ evtchn_port_t evtchn; /* event channel */
unsigned short cpu; /* cpu bound */
union {
@@ -60,12 +60,12 @@ struct evtchn_ops {
int (*setup)(struct irq_info *info);
void (*bind_to_cpu)(struct irq_info *info, unsigned cpu);
- void (*clear_pending)(unsigned port);
- void (*set_pending)(unsigned port);
- bool (*is_pending)(unsigned port);
- bool (*test_and_set_mask)(unsigned port);
- void (*mask)(unsigned port);
- void (*unmask)(unsigned port);
+ void (*clear_pending)(evtchn_port_t port);
+ void (*set_pending)(evtchn_port_t port);
+ bool (*is_pending)(evtchn_port_t port);
+ bool (*test_and_set_mask)(evtchn_port_t port);
+ void (*mask)(evtchn_port_t port);
+ void (*unmask)(evtchn_port_t port);
void (*handle_events)(unsigned cpu);
void (*resume)(void);
@@ -74,11 +74,11 @@ struct evtchn_ops {
extern const struct evtchn_ops *evtchn_ops;
extern int **evtchn_to_irq;
-int get_evtchn_to_irq(unsigned int evtchn);
+int get_evtchn_to_irq(evtchn_port_t evtchn);
struct irq_info *info_for_irq(unsigned irq);
unsigned cpu_from_irq(unsigned irq);
-unsigned cpu_from_evtchn(unsigned int evtchn);
+unsigned int cpu_from_evtchn(evtchn_port_t evtchn);
static inline unsigned xen_evtchn_max_channels(void)
{
@@ -102,32 +102,32 @@ static inline void xen_evtchn_port_bind_to_cpu(struct irq_info *info,
evtchn_ops->bind_to_cpu(info, cpu);
}
-static inline void clear_evtchn(unsigned port)
+static inline void clear_evtchn(evtchn_port_t port)
{
evtchn_ops->clear_pending(port);
}
-static inline void set_evtchn(unsigned port)
+static inline void set_evtchn(evtchn_port_t port)
{
evtchn_ops->set_pending(port);
}
-static inline bool test_evtchn(unsigned port)
+static inline bool test_evtchn(evtchn_port_t port)
{
return evtchn_ops->is_pending(port);
}
-static inline bool test_and_set_mask(unsigned port)
+static inline bool test_and_set_mask(evtchn_port_t port)
{
return evtchn_ops->test_and_set_mask(port);
}
-static inline void mask_evtchn(unsigned port)
+static inline void mask_evtchn(evtchn_port_t port)
{
return evtchn_ops->mask(port);
}
-static inline void unmask_evtchn(unsigned port)
+static inline void unmask_evtchn(evtchn_port_t port)
{
return evtchn_ops->unmask(port);
}
diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
index 052b55a14ebc..6e0b1dd5573c 100644
--- a/drivers/xen/evtchn.c
+++ b/drivers/xen/evtchn.c
@@ -83,7 +83,7 @@ struct per_user_data {
struct user_evtchn {
struct rb_node node;
struct per_user_data *user;
- unsigned port;
+ evtchn_port_t port;
bool enabled;
};
@@ -138,7 +138,8 @@ static void del_evtchn(struct per_user_data *u, struct user_evtchn *evtchn)
kfree(evtchn);
}
-static struct user_evtchn *find_evtchn(struct per_user_data *u, unsigned port)
+static struct user_evtchn *find_evtchn(struct per_user_data *u,
+ evtchn_port_t port)
{
struct rb_node *node = u->evtchns.rb_node;
@@ -163,7 +164,7 @@ static irqreturn_t evtchn_interrupt(int irq, void *data)
struct per_user_data *u = evtchn->user;
WARN(!evtchn->enabled,
- "Interrupt for port %d, but apparently not enabled; per-user %p\n",
+ "Interrupt for port %u, but apparently not enabled; per-user %p\n",
evtchn->port, u);
disable_irq_nosync(irq);
@@ -286,7 +287,7 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf,
mutex_lock(&u->bind_mutex);
for (i = 0; i < (count/sizeof(evtchn_port_t)); i++) {
- unsigned port = kbuf[i];
+ evtchn_port_t port = kbuf[i];
struct user_evtchn *evtchn;
evtchn = find_evtchn(u, port);
@@ -361,7 +362,7 @@ static int evtchn_resize_ring(struct per_user_data *u)
return 0;
}
-static int evtchn_bind_to_user(struct per_user_data *u, int port)
+static int evtchn_bind_to_user(struct per_user_data *u, evtchn_port_t port)
{
struct user_evtchn *evtchn;
struct evtchn_close close;
@@ -423,7 +424,7 @@ static void evtchn_unbind_from_user(struct per_user_data *u,
static DEFINE_PER_CPU(int, bind_last_selected_cpu);
-static void evtchn_bind_interdom_next_vcpu(int evtchn)
+static void evtchn_bind_interdom_next_vcpu(evtchn_port_t evtchn)
{
unsigned int selected_cpu, irq;
struct irq_desc *desc;
diff --git a/drivers/xen/gntdev-common.h b/drivers/xen/gntdev-common.h
index 9a3960ecff6c..20d7d059dadb 100644
--- a/drivers/xen/gntdev-common.h
+++ b/drivers/xen/gntdev-common.h
@@ -15,6 +15,7 @@
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
#include <linux/types.h>
+#include <xen/interface/event_channel.h>
struct gntdev_dmabuf_priv;
@@ -38,7 +39,7 @@ struct gntdev_unmap_notify {
int flags;
/* Address relative to the start of the gntdev_grant_map. */
int addr;
- int event;
+ evtchn_port_t event;
};
struct gntdev_grant_map {
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index 0258415ca0b2..50651e566564 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -652,7 +652,7 @@ static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u)
struct gntdev_grant_map *map;
int rc;
int out_flags;
- unsigned int out_event;
+ evtchn_port_t out_event;
if (copy_from_user(&op, u, sizeof(op)))
return -EFAULT;
diff --git a/drivers/xen/pvcalls-back.c b/drivers/xen/pvcalls-back.c
index c57c71b7d53d..cf4ce3e9358d 100644
--- a/drivers/xen/pvcalls-back.c
+++ b/drivers/xen/pvcalls-back.c
@@ -300,7 +300,7 @@ static struct sock_mapping *pvcalls_new_active_socket(
struct pvcalls_fedata *fedata,
uint64_t id,
grant_ref_t ref,
- uint32_t evtchn,
+ evtchn_port_t evtchn,
struct socket *sock)
{
int ret;
@@ -905,7 +905,8 @@ static irqreturn_t pvcalls_back_conn_event(int irq, void *sock_map)
static int backend_connect(struct xenbus_device *dev)
{
- int err, evtchn;
+ int err;
+ evtchn_port_t evtchn;
grant_ref_t ring_ref;
struct pvcalls_fedata *fedata = NULL;
diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c
index 57592a6b5c9e..b43b5595e988 100644
--- a/drivers/xen/pvcalls-front.c
+++ b/drivers/xen/pvcalls-front.c
@@ -368,12 +368,12 @@ out:
return -ENOMEM;
}
-static int create_active(struct sock_mapping *map, int *evtchn)
+static int create_active(struct sock_mapping *map, evtchn_port_t *evtchn)
{
void *bytes;
int ret = -ENOMEM, irq = -1, i;
- *evtchn = -1;
+ *evtchn = 0;
init_waitqueue_head(&map->active.inflight_conn_req);
bytes = map->active.data.in;
@@ -404,7 +404,7 @@ static int create_active(struct sock_mapping *map, int *evtchn)
return 0;
out_error:
- if (*evtchn >= 0)
+ if (*evtchn > 0)
xenbus_free_evtchn(pvcalls_front_dev, *evtchn);
return ret;
}
@@ -415,7 +415,8 @@ int pvcalls_front_connect(struct socket *sock, struct sockaddr *addr,
struct pvcalls_bedata *bedata;
struct sock_mapping *map = NULL;
struct xen_pvcalls_request *req;
- int notify, req_id, ret, evtchn;
+ int notify, req_id, ret;
+ evtchn_port_t evtchn;
if (addr->sa_family != AF_INET || sock->type != SOCK_STREAM)
return -EOPNOTSUPP;
@@ -765,7 +766,8 @@ int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags)
struct sock_mapping *map;
struct sock_mapping *map2 = NULL;
struct xen_pvcalls_request *req;
- int notify, req_id, ret, evtchn, nonblock;
+ int notify, req_id, ret, nonblock;
+ evtchn_port_t evtchn;
map = pvcalls_enter_sock(sock);
if (IS_ERR(map))
@@ -1125,7 +1127,8 @@ static int pvcalls_front_remove(struct xenbus_device *dev)
static int pvcalls_front_probe(struct xenbus_device *dev,
const struct xenbus_device_id *id)
{
- int ret = -ENOMEM, evtchn, i;
+ int ret = -ENOMEM, i;
+ evtchn_port_t evtchn;
unsigned int max_page_order, function_calls, len;
char *versions;
grant_ref_t gref_head = 0;
diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c
index 833b2d2c4318..f2115587855f 100644
--- a/drivers/xen/xen-pciback/xenbus.c
+++ b/drivers/xen/xen-pciback/xenbus.c
@@ -105,13 +105,13 @@ static void free_pdev(struct xen_pcibk_device *pdev)
}
static int xen_pcibk_do_attach(struct xen_pcibk_device *pdev, int gnt_ref,
- int remote_evtchn)
+ evtchn_port_t remote_evtchn)
{
int err = 0;
void *vaddr;
dev_dbg(&pdev->xdev->dev,
- "Attaching to frontend resources - gnt_ref=%d evtchn=%d\n",
+ "Attaching to frontend resources - gnt_ref=%d evtchn=%u\n",
gnt_ref, remote_evtchn);
err = xenbus_map_ring_valloc(pdev->xdev, &gnt_ref, 1, &vaddr);
@@ -142,7 +142,8 @@ out:
static int xen_pcibk_attach(struct xen_pcibk_device *pdev)
{
int err = 0;
- int gnt_ref, remote_evtchn;
+ int gnt_ref;
+ evtchn_port_t remote_evtchn;
char *magic = NULL;
diff --git a/drivers/xen/xen-scsiback.c b/drivers/xen/xen-scsiback.c
index ba0942e481bc..75c0a2e9a6db 100644
--- a/drivers/xen/xen-scsiback.c
+++ b/drivers/xen/xen-scsiback.c
@@ -854,7 +854,8 @@ unmap_page:
static int scsiback_map(struct vscsibk_info *info)
{
struct xenbus_device *dev = info->dev;
- unsigned int ring_ref, evtchn;
+ unsigned int ring_ref;
+ evtchn_port_t evtchn;
int err;
err = xenbus_gather(XBT_NIL, dev->otherend,
diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c
index 31eb822ac313..385843256865 100644
--- a/drivers/xen/xenbus/xenbus_client.c
+++ b/drivers/xen/xenbus/xenbus_client.c
@@ -391,7 +391,7 @@ EXPORT_SYMBOL_GPL(xenbus_grant_ring);
* error, the device will switch to XenbusStateClosing, and the error will be
* saved in the store.
*/
-int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port)
+int xenbus_alloc_evtchn(struct xenbus_device *dev, evtchn_port_t *port)
{
struct evtchn_alloc_unbound alloc_unbound;
int err;
@@ -414,7 +414,7 @@ EXPORT_SYMBOL_GPL(xenbus_alloc_evtchn);
/**
* Free an existing event channel. Returns 0 on success or -errno on error.
*/
-int xenbus_free_evtchn(struct xenbus_device *dev, int port)
+int xenbus_free_evtchn(struct xenbus_device *dev, evtchn_port_t port)
{
struct evtchn_close close;
int err;
@@ -423,7 +423,7 @@ int xenbus_free_evtchn(struct xenbus_device *dev, int port)
err = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
if (err)
- xenbus_dev_error(dev, err, "freeing event channel %d", port);
+ xenbus_dev_error(dev, err, "freeing event channel %u", port);
return err;
}
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index f4713ea76e82..13f25e241ac4 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -27,6 +27,7 @@
#include <linux/highuid.h>
#include <linux/compiler.h>
#include <linux/highmem.h>
+#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/vmalloc.h>
#include <linux/security.h>
@@ -698,19 +699,11 @@ static int load_elf_binary(struct linux_binprm *bprm)
unsigned long reloc_func_desc __maybe_unused = 0;
int executable_stack = EXSTACK_DEFAULT;
struct elfhdr *elf_ex = (struct elfhdr *)bprm->buf;
- struct {
- struct elfhdr interp_elf_ex;
- } *loc;
+ struct elfhdr *interp_elf_ex = NULL;
struct arch_elf_state arch_state = INIT_ARCH_ELF_STATE;
struct mm_struct *mm;
struct pt_regs *regs;
- loc = kmalloc(sizeof(*loc), GFP_KERNEL);
- if (!loc) {
- retval = -ENOMEM;
- goto out_ret;
- }
-
retval = -ENOEXEC;
/* First of all, some simple consistency checks */
if (memcmp(elf_ex->e_ident, ELFMAG, SELFMAG) != 0)
@@ -770,9 +763,15 @@ static int load_elf_binary(struct linux_binprm *bprm)
*/
would_dump(bprm, interpreter);
+ interp_elf_ex = kmalloc(sizeof(*interp_elf_ex), GFP_KERNEL);
+ if (!interp_elf_ex) {
+ retval = -ENOMEM;
+ goto out_free_ph;
+ }
+
/* Get the exec headers */
- retval = elf_read(interpreter, &loc->interp_elf_ex,
- sizeof(loc->interp_elf_ex), 0);
+ retval = elf_read(interpreter, interp_elf_ex,
+ sizeof(*interp_elf_ex), 0);
if (retval < 0)
goto out_free_dentry;
@@ -806,25 +805,25 @@ out_free_interp:
if (interpreter) {
retval = -ELIBBAD;
/* Not an ELF interpreter */
- if (memcmp(loc->interp_elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
+ if (memcmp(interp_elf_ex->e_ident, ELFMAG, SELFMAG) != 0)
goto out_free_dentry;
/* Verify the interpreter has a valid arch */
- if (!elf_check_arch(&loc->interp_elf_ex) ||
- elf_check_fdpic(&loc->interp_elf_ex))
+ if (!elf_check_arch(interp_elf_ex) ||
+ elf_check_fdpic(interp_elf_ex))
goto out_free_dentry;
/* Load the interpreter program headers */
- interp_elf_phdata = load_elf_phdrs(&loc->interp_elf_ex,
+ interp_elf_phdata = load_elf_phdrs(interp_elf_ex,
interpreter);
if (!interp_elf_phdata)
goto out_free_dentry;
/* Pass PT_LOPROC..PT_HIPROC headers to arch code */
elf_ppnt = interp_elf_phdata;
- for (i = 0; i < loc->interp_elf_ex.e_phnum; i++, elf_ppnt++)
+ for (i = 0; i < interp_elf_ex->e_phnum; i++, elf_ppnt++)
switch (elf_ppnt->p_type) {
case PT_LOPROC ... PT_HIPROC:
- retval = arch_elf_pt_proc(&loc->interp_elf_ex,
+ retval = arch_elf_pt_proc(interp_elf_ex,
elf_ppnt, interpreter,
true, &arch_state);
if (retval)
@@ -839,7 +838,7 @@ out_free_interp:
* the exec syscall.
*/
retval = arch_check_elf(elf_ex,
- !!interpreter, &loc->interp_elf_ex,
+ !!interpreter, interp_elf_ex,
&arch_state);
if (retval)
goto out_free_dentry;
@@ -1055,7 +1054,7 @@ out_free_interp:
}
if (interpreter) {
- elf_entry = load_elf_interp(&loc->interp_elf_ex,
+ elf_entry = load_elf_interp(interp_elf_ex,
interpreter,
load_bias, interp_elf_phdata);
if (!IS_ERR((void *)elf_entry)) {
@@ -1064,7 +1063,7 @@ out_free_interp:
* adjustment
*/
interp_load_addr = elf_entry;
- elf_entry += loc->interp_elf_ex.e_entry;
+ elf_entry += interp_elf_ex->e_entry;
}
if (BAD_ADDR(elf_entry)) {
retval = IS_ERR((void *)elf_entry) ?
@@ -1075,6 +1074,9 @@ out_free_interp:
allow_write_access(interpreter);
fput(interpreter);
+
+ kfree(interp_elf_ex);
+ kfree(interp_elf_phdata);
} else {
elf_entry = e_entry;
if (BAD_ADDR(elf_entry)) {
@@ -1083,7 +1085,6 @@ out_free_interp:
}
}
- kfree(interp_elf_phdata);
kfree(elf_phdata);
set_binfmt(&elf_format);
@@ -1153,12 +1154,11 @@ out_free_interp:
start_thread(regs, elf_entry, bprm->p);
retval = 0;
out:
- kfree(loc);
-out_ret:
return retval;
/* error cleanup */
out_free_dentry:
+ kfree(interp_elf_ex);
kfree(interp_elf_phdata);
allow_write_access(interpreter);
if (interpreter)
@@ -1317,7 +1317,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
}
/* Hugetlb memory check */
- if (vma->vm_flags & VM_HUGETLB) {
+ if (is_vm_hugetlb_page(vma)) {
if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED))
goto whole;
if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE))
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 7ab616601141..6f4678d98df7 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -159,8 +159,6 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset,
if (!PagePrivate(page))
return;
- ClearPageChecked(page);
-
dout("%p invalidatepage %p idx %lu full dirty page\n",
inode, page, page->index);
@@ -183,6 +181,47 @@ static int ceph_releasepage(struct page *page, gfp_t g)
}
/*
+ * Read some contiguous pages. If we cross a stripe boundary, shorten
+ * *plen. Return number of bytes read, or error.
+ */
+static int ceph_sync_readpages(struct ceph_fs_client *fsc,
+ struct ceph_vino vino,
+ struct ceph_file_layout *layout,
+ u64 off, u64 *plen,
+ u32 truncate_seq, u64 truncate_size,
+ struct page **pages, int num_pages,
+ int page_align)
+{
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
+ struct ceph_osd_request *req;
+ int rc = 0;
+
+ dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
+ vino.snap, off, *plen);
+ req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1,
+ CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
+ NULL, truncate_seq, truncate_size,
+ false);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ /* it may be a short read due to an object boundary */
+ osd_req_op_extent_osd_data_pages(req, 0,
+ pages, *plen, page_align, false, false);
+
+ dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n",
+ off, *plen, *plen, page_align);
+
+ rc = ceph_osdc_start_request(osdc, req, false);
+ if (!rc)
+ rc = ceph_osdc_wait_request(osdc, req);
+
+ ceph_osdc_put_request(req);
+ dout("readpages result %d\n", rc);
+ return rc;
+}
+
+/*
* read a single page, without unlocking it.
*/
static int ceph_do_readpage(struct file *filp, struct page *page)
@@ -218,7 +257,7 @@ static int ceph_do_readpage(struct file *filp, struct page *page)
dout("readpage inode %p file %p page %p index %lu\n",
inode, filp, page, page->index);
- err = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
+ err = ceph_sync_readpages(fsc, ceph_vino(inode),
&ci->i_layout, off, &len,
ci->i_truncate_seq, ci->i_truncate_size,
&page, 1, 0);
@@ -571,6 +610,47 @@ static u64 get_writepages_data_length(struct inode *inode,
}
/*
+ * do a synchronous write on N pages
+ */
+static int ceph_sync_writepages(struct ceph_fs_client *fsc,
+ struct ceph_vino vino,
+ struct ceph_file_layout *layout,
+ struct ceph_snap_context *snapc,
+ u64 off, u64 len,
+ u32 truncate_seq, u64 truncate_size,
+ struct timespec64 *mtime,
+ struct page **pages, int num_pages)
+{
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
+ struct ceph_osd_request *req;
+ int rc = 0;
+ int page_align = off & ~PAGE_MASK;
+
+ req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
+ CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
+ snapc, truncate_seq, truncate_size,
+ true);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ /* it may be a short write due to an object boundary */
+ osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
+ false, false);
+ dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
+
+ req->r_mtime = *mtime;
+ rc = ceph_osdc_start_request(osdc, req, true);
+ if (!rc)
+ rc = ceph_osdc_wait_request(osdc, req);
+
+ ceph_osdc_put_request(req);
+ if (rc == 0)
+ rc = len;
+ dout("writepages result %d\n", rc);
+ return rc;
+}
+
+/*
* Write a single page, but leave the page locked.
*
* If we get a write error, mark the mapping for error, but still adjust the
@@ -628,7 +708,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
set_page_writeback(page);
- err = ceph_osdc_writepages(&fsc->client->osdc, ceph_vino(inode),
+ err = ceph_sync_writepages(fsc, ceph_vino(inode),
&ci->i_layout, snapc, page_off, len,
ceph_wbc.truncate_seq,
ceph_wbc.truncate_size,
@@ -1575,7 +1655,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
do {
lock_page(page);
- if ((off > size) || (page->mapping != inode->i_mapping)) {
+ if (page_mkwrite_check_truncate(page, inode) < 0) {
unlock_page(page);
ret = VM_FAULT_NOPAGE;
break;
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 270b769607a2..2f5cb6bc78e1 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -32,7 +32,7 @@ struct ceph_fscache_entry {
size_t uniq_len;
/* The following members must be last */
struct ceph_fsid fsid;
- char uniquifier[0];
+ char uniquifier[];
};
static const struct fscache_cookie_def ceph_fscache_fsid_object_def = {
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 28ae0c134700..185db76300b3 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -490,13 +490,10 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
struct ceph_mount_options *opt = mdsc->fsc->mount_options;
-
- ci->i_hold_caps_min = round_jiffies(jiffies +
- opt->caps_wanted_delay_min * HZ);
ci->i_hold_caps_max = round_jiffies(jiffies +
opt->caps_wanted_delay_max * HZ);
- dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
- ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
+ dout("__cap_set_timeouts %p %lu\n", &ci->vfs_inode,
+ ci->i_hold_caps_max - jiffies);
}
/*
@@ -508,10 +505,9 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
* -> we take mdsc->cap_delay_lock
*/
static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
- struct ceph_inode_info *ci,
- bool set_timeout)
+ struct ceph_inode_info *ci)
{
- dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
+ dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode,
ci->i_ceph_flags, ci->i_hold_caps_max);
if (!mdsc->stopping) {
spin_lock(&mdsc->cap_delay_lock);
@@ -520,8 +516,7 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
goto no_change;
list_del_init(&ci->i_cap_delay_list);
}
- if (set_timeout)
- __cap_set_timeouts(mdsc, ci);
+ __cap_set_timeouts(mdsc, ci);
list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
no_change:
spin_unlock(&mdsc->cap_delay_lock);
@@ -561,19 +556,20 @@ static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
spin_unlock(&mdsc->cap_delay_lock);
}
-/*
- * Common issue checks for add_cap, handle_cap_grant.
- */
+/* Common issue checks for add_cap, handle_cap_grant. */
static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
unsigned issued)
{
unsigned had = __ceph_caps_issued(ci, NULL);
+ lockdep_assert_held(&ci->i_ceph_lock);
+
/*
* Each time we receive FILE_CACHE anew, we increment
* i_rdcache_gen.
*/
- if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
+ if (S_ISREG(ci->vfs_inode.i_mode) &&
+ (issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
(had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) {
ci->i_rdcache_gen++;
}
@@ -592,6 +588,13 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
__ceph_dir_clear_complete(ci);
}
}
+
+ /* Wipe saved layout if we're losing DIR_CREATE caps */
+ if (S_ISDIR(ci->vfs_inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) &&
+ !(issued & CEPH_CAP_DIR_CREATE)) {
+ ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
+ memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
+ }
}
/*
@@ -605,7 +608,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
*/
void ceph_add_cap(struct inode *inode,
struct ceph_mds_session *session, u64 cap_id,
- int fmode, unsigned issued, unsigned wanted,
+ unsigned issued, unsigned wanted,
unsigned seq, unsigned mseq, u64 realmino, int flags,
struct ceph_cap **new_cap)
{
@@ -621,13 +624,6 @@ void ceph_add_cap(struct inode *inode,
dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
session->s_mds, cap_id, ceph_cap_string(issued), seq);
- /*
- * If we are opening the file, include file mode wanted bits
- * in wanted.
- */
- if (fmode >= 0)
- wanted |= ceph_caps_for_mode(fmode);
-
spin_lock(&session->s_gen_ttl_lock);
gen = session->s_cap_gen;
spin_unlock(&session->s_gen_ttl_lock);
@@ -725,7 +721,7 @@ void ceph_add_cap(struct inode *inode,
dout(" issued %s, mds wanted %s, actual %s, queueing\n",
ceph_cap_string(issued), ceph_cap_string(wanted),
ceph_cap_string(actual_wanted));
- __cap_delay_requeue(mdsc, ci, true);
+ __cap_delay_requeue(mdsc, ci);
}
if (flags & CEPH_CAP_FLAG_AUTH) {
@@ -752,9 +748,6 @@ void ceph_add_cap(struct inode *inode,
cap->issue_seq = seq;
cap->mseq = mseq;
cap->cap_gen = gen;
-
- if (fmode >= 0)
- __ceph_get_fmode(ci, fmode);
}
/*
@@ -958,29 +951,97 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
if (ci->i_rd_ref)
used |= CEPH_CAP_FILE_RD;
if (ci->i_rdcache_ref ||
- (!S_ISDIR(ci->vfs_inode.i_mode) && /* ignore readdir cache */
+ (S_ISREG(ci->vfs_inode.i_mode) &&
ci->vfs_inode.i_data.nrpages))
used |= CEPH_CAP_FILE_CACHE;
if (ci->i_wr_ref)
used |= CEPH_CAP_FILE_WR;
if (ci->i_wb_ref || ci->i_wrbuffer_ref)
used |= CEPH_CAP_FILE_BUFFER;
+ if (ci->i_fx_ref)
+ used |= CEPH_CAP_FILE_EXCL;
return used;
}
+#define FMODE_WAIT_BIAS 1000
+
/*
* wanted, by virtue of open file modes
*/
int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
{
- int i, bits = 0;
- for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
- if (ci->i_nr_by_mode[i])
- bits |= 1 << i;
+ const int PIN_SHIFT = ffs(CEPH_FILE_MODE_PIN);
+ const int RD_SHIFT = ffs(CEPH_FILE_MODE_RD);
+ const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR);
+ const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY);
+ struct ceph_mount_options *opt =
+ ceph_inode_to_client(&ci->vfs_inode)->mount_options;
+ unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ;
+ unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ;
+
+ if (S_ISDIR(ci->vfs_inode.i_mode)) {
+ int want = 0;
+
+ /* use used_cutoff here, to keep dir's wanted caps longer */
+ if (ci->i_nr_by_mode[RD_SHIFT] > 0 ||
+ time_after(ci->i_last_rd, used_cutoff))
+ want |= CEPH_CAP_ANY_SHARED;
+
+ if (ci->i_nr_by_mode[WR_SHIFT] > 0 ||
+ time_after(ci->i_last_wr, used_cutoff)) {
+ want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
+ if (opt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)
+ want |= CEPH_CAP_ANY_DIR_OPS;
+ }
+
+ if (want || ci->i_nr_by_mode[PIN_SHIFT] > 0)
+ want |= CEPH_CAP_PIN;
+
+ return want;
+ } else {
+ int bits = 0;
+
+ if (ci->i_nr_by_mode[RD_SHIFT] > 0) {
+ if (ci->i_nr_by_mode[RD_SHIFT] >= FMODE_WAIT_BIAS ||
+ time_after(ci->i_last_rd, used_cutoff))
+ bits |= 1 << RD_SHIFT;
+ } else if (time_after(ci->i_last_rd, idle_cutoff)) {
+ bits |= 1 << RD_SHIFT;
+ }
+
+ if (ci->i_nr_by_mode[WR_SHIFT] > 0) {
+ if (ci->i_nr_by_mode[WR_SHIFT] >= FMODE_WAIT_BIAS ||
+ time_after(ci->i_last_wr, used_cutoff))
+ bits |= 1 << WR_SHIFT;
+ } else if (time_after(ci->i_last_wr, idle_cutoff)) {
+ bits |= 1 << WR_SHIFT;
+ }
+
+ /* check lazyio only when read/write is wanted */
+ if ((bits & (CEPH_FILE_MODE_RDWR << 1)) &&
+ ci->i_nr_by_mode[LAZY_SHIFT] > 0)
+ bits |= 1 << LAZY_SHIFT;
+
+ return bits ? ceph_caps_for_mode(bits >> 1) : 0;
}
- if (bits == 0)
- return 0;
- return ceph_caps_for_mode(bits >> 1);
+}
+
+/*
+ * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
+ */
+int __ceph_caps_wanted(struct ceph_inode_info *ci)
+{
+ int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
+ if (S_ISDIR(ci->vfs_inode.i_mode)) {
+ /* we want EXCL if holding caps of dir ops */
+ if (w & CEPH_CAP_ANY_DIR_OPS)
+ w |= CEPH_CAP_FILE_EXCL;
+ } else {
+ /* we want EXCL if dirty data */
+ if (w & CEPH_CAP_FILE_BUFFER)
+ w |= CEPH_CAP_FILE_EXCL;
+ }
+ return w;
}
/*
@@ -1004,14 +1065,6 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check)
return mds_wanted;
}
-/*
- * called under i_ceph_lock
- */
-static int __ceph_is_single_caps(struct ceph_inode_info *ci)
-{
- return rb_first(&ci->i_caps) == rb_last(&ci->i_caps);
-}
-
int ceph_is_any_caps(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
@@ -1274,9 +1327,15 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
struct cap_msg_args arg;
int held, revoking;
int wake = 0;
- int delayed = 0;
int ret;
+ /* Don't send anything if it's still being created. Return delayed */
+ if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
+ spin_unlock(&ci->i_ceph_lock);
+ dout("%s async create in flight for %p\n", __func__, inode);
+ return 1;
+ }
+
held = cap->issued | cap->implemented;
revoking = cap->implemented & ~cap->issued;
retain &= ~revoking;
@@ -1287,28 +1346,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
ceph_cap_string(revoking));
BUG_ON((retain & CEPH_CAP_PIN) == 0);
- arg.session = cap->session;
-
- /* don't release wanted unless we've waited a bit. */
- if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
- time_before(jiffies, ci->i_hold_caps_min)) {
- dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
- ceph_cap_string(cap->issued),
- ceph_cap_string(cap->issued & retain),
- ceph_cap_string(cap->mds_wanted),
- ceph_cap_string(want));
- want |= cap->mds_wanted;
- retain |= cap->issued;
- delayed = 1;
- }
- ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
- if (want & ~cap->mds_wanted) {
- /* user space may open/close single file frequently.
- * This avoids droping mds_wanted immediately after
- * requesting new mds_wanted.
- */
- __cap_set_timeouts(mdsc, ci);
- }
+ ci->i_ceph_flags &= ~CEPH_I_FLUSH;
cap->issued &= retain; /* drop bits we don't want */
if (cap->implemented & ~cap->issued) {
@@ -1323,6 +1361,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
cap->implemented &= cap->issued | used;
cap->mds_wanted = want;
+ arg.session = cap->session;
arg.ino = ceph_vino(inode).ino;
arg.cid = cap->cap_id;
arg.follows = flushing ? ci->i_head_snapc->seq : 0;
@@ -1332,7 +1371,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
arg.size = inode->i_size;
ci->i_reported_size = arg.size;
arg.max_size = ci->i_wanted_max_size;
- ci->i_requested_max_size = arg.max_size;
+ if (cap == ci->i_auth_cap)
+ ci->i_requested_max_size = arg.max_size;
if (flushing & CEPH_CAP_XATTR_EXCL) {
old_blob = __ceph_build_xattrs_blob(ci);
@@ -1383,14 +1423,19 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
ret = send_cap_msg(&arg);
if (ret < 0) {
- dout("error sending cap msg, must requeue %p\n", inode);
- delayed = 1;
+ pr_err("error sending cap msg, ino (%llx.%llx) "
+ "flushing %s tid %llu, requeue\n",
+ ceph_vinop(inode), ceph_cap_string(flushing),
+ flush_tid);
+ spin_lock(&ci->i_ceph_lock);
+ __cap_delay_requeue(mdsc, ci);
+ spin_unlock(&ci->i_ceph_lock);
}
if (wake)
wake_up_all(&ci->i_cap_wq);
- return delayed;
+ return ret;
}
static inline int __send_flush_snap(struct inode *inode,
@@ -1617,6 +1662,8 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
int was = ci->i_dirty_caps;
int dirty = 0;
+ lockdep_assert_held(&ci->i_ceph_lock);
+
if (!ci->i_auth_cap) {
pr_warn("__mark_dirty_caps %p %llx mask %s, "
"but no auth cap (session was closed?)\n",
@@ -1654,7 +1701,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
(mask & CEPH_CAP_FILE_BUFFER))
dirty |= I_DIRTY_DATASYNC;
- __cap_delay_requeue(mdsc, ci, true);
+ __cap_delay_requeue(mdsc, ci);
return dirty;
}
@@ -1726,6 +1773,7 @@ static u64 __mark_caps_flushing(struct inode *inode,
struct ceph_cap_flush *cf = NULL;
int flushing;
+ lockdep_assert_held(&ci->i_ceph_lock);
BUG_ON(ci->i_dirty_caps == 0);
BUG_ON(list_empty(&ci->i_dirty_item));
BUG_ON(!ci->i_prealloc_cap_flush);
@@ -1805,8 +1853,6 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci)
* versus held caps. Release, flush, ack revoked caps to mds as
* appropriate.
*
- * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
- * cap release further.
* CHECK_CAPS_AUTHONLY - we should only check the auth cap
* CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
* further delay.
@@ -1825,24 +1871,13 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
int mds = -1; /* keep track of how far we've gone through i_caps list
to avoid an infinite loop on retry */
struct rb_node *p;
- int delayed = 0, sent = 0;
- bool no_delay = flags & CHECK_CAPS_NODELAY;
bool queue_invalidate = false;
bool tried_invalidate = false;
- /* if we are unmounting, flush any unused caps immediately. */
- if (mdsc->stopping)
- no_delay = true;
-
spin_lock(&ci->i_ceph_lock);
-
if (ci->i_ceph_flags & CEPH_I_FLUSH)
flags |= CHECK_CAPS_FLUSH;
- if (!(flags & CHECK_CAPS_AUTHONLY) ||
- (ci->i_auth_cap && __ceph_is_single_caps(ci)))
- __cap_delay_cancel(mdsc, ci);
-
goto retry_locked;
retry:
spin_lock(&ci->i_ceph_lock);
@@ -1866,10 +1901,11 @@ retry_locked:
* revoking the shared cap on every create/unlink
* operation.
*/
- if (IS_RDONLY(inode))
+ if (IS_RDONLY(inode)) {
want = CEPH_CAP_ANY_SHARED;
- else
- want = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
+ } else {
+ want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
+ }
retain |= want;
} else {
@@ -1885,14 +1921,13 @@ retry_locked:
}
dout("check_caps %p file_want %s used %s dirty %s flushing %s"
- " issued %s revoking %s retain %s %s%s%s\n", inode,
+ " issued %s revoking %s retain %s %s%s\n", inode,
ceph_cap_string(file_wanted),
ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
ceph_cap_string(ci->i_flushing_caps),
ceph_cap_string(issued), ceph_cap_string(revoking),
ceph_cap_string(retain),
(flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
- (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
(flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
/*
@@ -1900,8 +1935,8 @@ retry_locked:
* have cached pages, but don't want them, then try to invalidate.
* If we fail, it's because pages are locked.... try again later.
*/
- if ((!no_delay || mdsc->stopping) &&
- !S_ISDIR(inode->i_mode) && /* ignore readdir cache */
+ if ((!(flags & CHECK_CAPS_NOINVAL) || mdsc->stopping) &&
+ S_ISREG(inode->i_mode) &&
!(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages... */
inode->i_data.nrpages && /* have cached pages */
(revoking & (CEPH_CAP_FILE_CACHE|
@@ -1973,28 +2008,17 @@ retry_locked:
}
/* want more caps from mds? */
- if (want & ~(cap->mds_wanted | cap->issued))
- goto ack;
+ if (want & ~cap->mds_wanted) {
+ if (want & ~(cap->mds_wanted | cap->issued))
+ goto ack;
+ if (!__cap_is_valid(cap))
+ goto ack;
+ }
/* things we might delay */
if ((cap->issued & ~retain) == 0)
continue; /* nope, all good */
- if (no_delay)
- goto ack;
-
- /* delay? */
- if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
- time_before(jiffies, ci->i_hold_caps_max)) {
- dout(" delaying issued %s -> %s, wanted %s -> %s\n",
- ceph_cap_string(cap->issued),
- ceph_cap_string(cap->issued & retain),
- ceph_cap_string(cap->mds_wanted),
- ceph_cap_string(want));
- delayed++;
- continue;
- }
-
ack:
if (session && session != cap->session) {
dout("oops, wrong session %p mutex\n", session);
@@ -2055,18 +2079,20 @@ ack:
}
mds = cap->mds; /* remember mds, so we don't repeat */
- sent++;
/* __send_cap drops i_ceph_lock */
- delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, 0,
- cap_used, want, retain, flushing,
- flush_tid, oldest_flush_tid);
+ __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, 0, cap_used, want,
+ retain, flushing, flush_tid, oldest_flush_tid);
goto retry; /* retake i_ceph_lock and restart our cap scan. */
}
- /* Reschedule delayed caps release if we delayed anything */
- if (delayed)
- __cap_delay_requeue(mdsc, ci, false);
+ /* periodically re-calculate caps wanted by open files */
+ if (__ceph_is_any_real_caps(ci) &&
+ list_empty(&ci->i_cap_delay_list) &&
+ (file_wanted & ~CEPH_CAP_PIN) &&
+ !(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
+ __cap_delay_requeue(mdsc, ci);
+ }
spin_unlock(&ci->i_ceph_lock);
@@ -2095,7 +2121,6 @@ retry:
retry_locked:
if (ci->i_dirty_caps && ci->i_auth_cap) {
struct ceph_cap *cap = ci->i_auth_cap;
- int delayed;
if (session != cap->session) {
spin_unlock(&ci->i_ceph_lock);
@@ -2124,18 +2149,10 @@ retry_locked:
&oldest_flush_tid);
/* __send_cap drops i_ceph_lock */
- delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
- CEPH_CLIENT_CAPS_SYNC,
- __ceph_caps_used(ci),
- __ceph_caps_wanted(ci),
- (cap->issued | cap->implemented),
- flushing, flush_tid, oldest_flush_tid);
-
- if (delayed) {
- spin_lock(&ci->i_ceph_lock);
- __cap_delay_requeue(mdsc, ci, true);
- spin_unlock(&ci->i_ceph_lock);
- }
+ __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC,
+ __ceph_caps_used(ci), __ceph_caps_wanted(ci),
+ (cap->issued | cap->implemented),
+ flushing, flush_tid, oldest_flush_tid);
} else {
if (!list_empty(&ci->i_cap_flush_list)) {
struct ceph_cap_flush *cf =
@@ -2233,6 +2250,10 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
if (datasync)
goto out;
+ ret = ceph_wait_on_async_create(inode);
+ if (ret)
+ goto out;
+
dirty = try_flush_caps(inode, &flush_tid);
dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
@@ -2335,22 +2356,13 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
if (cf->caps) {
dout("kick_flushing_caps %p cap %p tid %llu %s\n",
inode, cap, cf->tid, ceph_cap_string(cf->caps));
- ci->i_ceph_flags |= CEPH_I_NODELAY;
-
- ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+ __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
(cf->tid < last_snap_flush ?
CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0),
__ceph_caps_used(ci),
__ceph_caps_wanted(ci),
(cap->issued | cap->implemented),
cf->caps, cf->tid, oldest_flush_tid);
- if (ret) {
- pr_err("kick_flushing_caps: error sending "
- "cap flush, ino (%llx.%llx) "
- "tid %llu flushing %s\n",
- ceph_vinop(inode), cf->tid,
- ceph_cap_string(cf->caps));
- }
} else {
struct ceph_cap_snap *capsnap =
container_of(cf, struct ceph_cap_snap,
@@ -2457,16 +2469,15 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
}
}
-static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session,
- struct inode *inode)
- __releases(ci->i_ceph_lock)
+void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
+ struct ceph_inode_info *ci)
{
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_cap *cap;
+ struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct ceph_cap *cap = ci->i_auth_cap;
+
+ lockdep_assert_held(&ci->i_ceph_lock);
- cap = ci->i_auth_cap;
- dout("kick_flushing_inode_caps %p flushing %s\n", inode,
+ dout("%s %p flushing %s\n", __func__, &ci->vfs_inode,
ceph_cap_string(ci->i_flushing_caps));
if (!list_empty(&ci->i_cap_flush_list)) {
@@ -2478,9 +2489,6 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
spin_unlock(&mdsc->cap_dirty_lock);
__kick_flushing_caps(mdsc, session, ci, oldest_flush_tid);
- spin_unlock(&ci->i_ceph_lock);
- } else {
- spin_unlock(&ci->i_ceph_lock);
}
}
@@ -2488,18 +2496,20 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
/*
* Take references to capabilities we hold, so that we don't release
* them to the MDS prematurely.
- *
- * Protected by i_ceph_lock.
*/
-static void __take_cap_refs(struct ceph_inode_info *ci, int got,
+void ceph_take_cap_refs(struct ceph_inode_info *ci, int got,
bool snap_rwsem_locked)
{
+ lockdep_assert_held(&ci->i_ceph_lock);
+
if (got & CEPH_CAP_PIN)
ci->i_pin_ref++;
if (got & CEPH_CAP_FILE_RD)
ci->i_rd_ref++;
if (got & CEPH_CAP_FILE_CACHE)
ci->i_rdcache_ref++;
+ if (got & CEPH_CAP_FILE_EXCL)
+ ci->i_fx_ref++;
if (got & CEPH_CAP_FILE_WR) {
if (ci->i_wr_ref == 0 && !ci->i_head_snapc) {
BUG_ON(!snap_rwsem_locked);
@@ -2512,7 +2522,7 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got,
if (ci->i_wb_ref == 0)
ihold(&ci->vfs_inode);
ci->i_wb_ref++;
- dout("__take_cap_refs %p wb %d -> %d (?)\n",
+ dout("%s %p wb %d -> %d (?)\n", __func__,
&ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref);
}
}
@@ -2524,14 +2534,16 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got,
* Note that caller is responsible for ensuring max_size increases are
* requested from the MDS.
*
- * Returns 0 if caps were not able to be acquired (yet), a 1 if they were,
- * or a negative error code.
- *
- * FIXME: how does a 0 return differ from -EAGAIN?
+ * Returns 0 if caps were not able to be acquired (yet), 1 if succeed,
+ * or a negative error code. There are 3 speical error codes:
+ * -EAGAIN: need to sleep but non-blocking is specified
+ * -EFBIG: ask caller to call check_max_size() and try again.
+ * -ESTALE: ask caller to call ceph_renew_caps() and try again.
*/
enum {
- NON_BLOCKING = 1,
- CHECK_FILELOCK = 2,
+ /* first 8 bits are reserved for CEPH_FILE_MODE_FOO */
+ NON_BLOCKING = (1 << 8),
+ CHECK_FILELOCK = (1 << 9),
};
static int try_get_cap_refs(struct inode *inode, int need, int want,
@@ -2541,7 +2553,6 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
int ret = 0;
int have, implemented;
- int file_wanted;
bool snap_rwsem_locked = false;
dout("get_cap_refs %p need %s want %s\n", inode,
@@ -2557,15 +2568,6 @@ again:
goto out_unlock;
}
- /* make sure file is actually open */
- file_wanted = __ceph_caps_file_wanted(ci);
- if ((file_wanted & need) != need) {
- dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
- ceph_cap_string(need), ceph_cap_string(file_wanted));
- ret = -EBADF;
- goto out_unlock;
- }
-
/* finish pending truncate */
while (ci->i_truncate_pending) {
spin_unlock(&ci->i_ceph_lock);
@@ -2584,7 +2586,7 @@ again:
dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
inode, endoff, ci->i_max_size);
if (endoff > ci->i_requested_max_size)
- ret = -EAGAIN;
+ ret = ci->i_auth_cap ? -EFBIG : -ESTALE;
goto out_unlock;
}
/*
@@ -2630,51 +2632,55 @@ again:
}
snap_rwsem_locked = true;
}
- *got = need | (have & want);
- if ((need & CEPH_CAP_FILE_RD) &&
+ if ((have & want) == want)
+ *got = need | want;
+ else
+ *got = need;
+ if (S_ISREG(inode->i_mode) &&
+ (need & CEPH_CAP_FILE_RD) &&
!(*got & CEPH_CAP_FILE_CACHE))
ceph_disable_fscache_readpage(ci);
- __take_cap_refs(ci, *got, true);
+ ceph_take_cap_refs(ci, *got, true);
ret = 1;
}
} else {
int session_readonly = false;
- if ((need & CEPH_CAP_FILE_WR) && ci->i_auth_cap) {
+ int mds_wanted;
+ if (ci->i_auth_cap &&
+ (need & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_EXCL))) {
struct ceph_mds_session *s = ci->i_auth_cap->session;
spin_lock(&s->s_cap_lock);
session_readonly = s->s_readonly;
spin_unlock(&s->s_cap_lock);
}
if (session_readonly) {
- dout("get_cap_refs %p needed %s but mds%d readonly\n",
+ dout("get_cap_refs %p need %s but mds%d readonly\n",
inode, ceph_cap_string(need), ci->i_auth_cap->mds);
ret = -EROFS;
goto out_unlock;
}
- if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) {
- int mds_wanted;
- if (READ_ONCE(mdsc->fsc->mount_state) ==
- CEPH_MOUNT_SHUTDOWN) {
- dout("get_cap_refs %p forced umount\n", inode);
- ret = -EIO;
- goto out_unlock;
- }
- mds_wanted = __ceph_caps_mds_wanted(ci, false);
- if (need & ~(mds_wanted & need)) {
- dout("get_cap_refs %p caps were dropped"
- " (session killed?)\n", inode);
- ret = -ESTALE;
- goto out_unlock;
- }
- if (!(file_wanted & ~mds_wanted))
- ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED;
+ if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+ dout("get_cap_refs %p forced umount\n", inode);
+ ret = -EIO;
+ goto out_unlock;
+ }
+ mds_wanted = __ceph_caps_mds_wanted(ci, false);
+ if (need & ~mds_wanted) {
+ dout("get_cap_refs %p need %s > mds_wanted %s\n",
+ inode, ceph_cap_string(need),
+ ceph_cap_string(mds_wanted));
+ ret = -ESTALE;
+ goto out_unlock;
}
- dout("get_cap_refs %p have %s needed %s\n", inode,
+ dout("get_cap_refs %p have %s need %s\n", inode,
ceph_cap_string(have), ceph_cap_string(need));
}
out_unlock:
+
+ __ceph_touch_fmode(ci, mdsc, flags);
+
spin_unlock(&ci->i_ceph_lock);
if (snap_rwsem_locked)
up_read(&mdsc->snap_rwsem);
@@ -2712,20 +2718,40 @@ static void check_max_size(struct inode *inode, loff_t endoff)
ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
}
+static inline int get_used_fmode(int caps)
+{
+ int fmode = 0;
+ if (caps & CEPH_CAP_FILE_RD)
+ fmode |= CEPH_FILE_MODE_RD;
+ if (caps & CEPH_CAP_FILE_WR)
+ fmode |= CEPH_FILE_MODE_WR;
+ return fmode;
+}
+
int ceph_try_get_caps(struct inode *inode, int need, int want,
bool nonblock, int *got)
{
- int ret;
+ int ret, flags;
BUG_ON(need & ~CEPH_CAP_FILE_RD);
- BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED));
- ret = ceph_pool_perm_check(inode, need);
- if (ret < 0)
- return ret;
+ BUG_ON(want & ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO |
+ CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
+ CEPH_CAP_ANY_DIR_OPS));
+ if (need) {
+ ret = ceph_pool_perm_check(inode, need);
+ if (ret < 0)
+ return ret;
+ }
+
+ flags = get_used_fmode(need | want);
+ if (nonblock)
+ flags |= NON_BLOCKING;
- ret = try_get_cap_refs(inode, need, want, 0,
- (nonblock ? NON_BLOCKING : 0), got);
- return ret == -EAGAIN ? 0 : ret;
+ ret = try_get_cap_refs(inode, need, want, 0, flags, got);
+ /* three special error codes */
+ if (ret == -EAGAIN || ret == -EFBIG || ret == -EAGAIN)
+ ret = 0;
+ return ret;
}
/*
@@ -2750,16 +2776,16 @@ int ceph_get_caps(struct file *filp, int need, int want,
fi->filp_gen != READ_ONCE(fsc->filp_gen))
return -EBADF;
- while (true) {
- if (endoff > 0)
- check_max_size(inode, endoff);
+ flags = get_used_fmode(need | want);
- flags = atomic_read(&fi->num_locks) ? CHECK_FILELOCK : 0;
+ while (true) {
+ flags &= CEPH_FILE_MODE_MASK;
+ if (atomic_read(&fi->num_locks))
+ flags |= CHECK_FILELOCK;
_got = 0;
ret = try_get_cap_refs(inode, need, want, endoff,
flags, &_got);
- if (ret == -EAGAIN)
- continue;
+ WARN_ON_ONCE(ret == -EAGAIN);
if (!ret) {
struct ceph_mds_client *mdsc = fsc->mdsc;
struct cap_wait cw;
@@ -2774,6 +2800,8 @@ int ceph_get_caps(struct file *filp, int need, int want,
list_add(&cw.list, &mdsc->cap_wait_list);
spin_unlock(&mdsc->caps_list_lock);
+ /* make sure used fmode not timeout */
+ ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS);
add_wait_queue(&ci->i_cap_wq, &wait);
flags |= NON_BLOCKING;
@@ -2787,6 +2815,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
}
remove_wait_queue(&ci->i_cap_wq, &wait);
+ ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS);
spin_lock(&mdsc->caps_list_lock);
list_del(&cw.list);
@@ -2804,16 +2833,26 @@ int ceph_get_caps(struct file *filp, int need, int want,
}
if (ret < 0) {
+ if (ret == -EFBIG || ret == -ESTALE) {
+ int ret2 = ceph_wait_on_async_create(inode);
+ if (ret2 < 0)
+ return ret2;
+ }
+ if (ret == -EFBIG) {
+ check_max_size(inode, endoff);
+ continue;
+ }
if (ret == -ESTALE) {
/* session was killed, try renew caps */
- ret = ceph_renew_caps(inode);
+ ret = ceph_renew_caps(inode, flags);
if (ret == 0)
continue;
}
return ret;
}
- if (ci->i_inline_version != CEPH_INLINE_NONE &&
+ if (S_ISREG(ci->vfs_inode.i_mode) &&
+ ci->i_inline_version != CEPH_INLINE_NONE &&
(_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
i_size_read(inode) > 0) {
struct page *page =
@@ -2846,7 +2885,8 @@ int ceph_get_caps(struct file *filp, int need, int want,
break;
}
- if ((_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE))
+ if (S_ISREG(ci->vfs_inode.i_mode) &&
+ (_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE))
ceph_fscache_revalidate_cookie(ci);
*got = _got;
@@ -2860,7 +2900,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
{
spin_lock(&ci->i_ceph_lock);
- __take_cap_refs(ci, caps, false);
+ ceph_take_cap_refs(ci, caps, false);
spin_unlock(&ci->i_ceph_lock);
}
@@ -2911,6 +2951,9 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
if (had & CEPH_CAP_FILE_CACHE)
if (--ci->i_rdcache_ref == 0)
last++;
+ if (had & CEPH_CAP_FILE_EXCL)
+ if (--ci->i_fx_ref == 0)
+ last++;
if (had & CEPH_CAP_FILE_BUFFER) {
if (--ci->i_wb_ref == 0) {
last++;
@@ -2950,7 +2993,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
last ? " last" : "", put ? " put" : "");
- if (last && !flushsnaps)
+ if (last)
ceph_check_caps(ci, 0, NULL);
else if (flushsnaps)
ceph_flush_snaps(ci, NULL);
@@ -3032,7 +3075,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
spin_unlock(&ci->i_ceph_lock);
if (last) {
- ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+ ceph_check_caps(ci, 0, NULL);
} else if (flush_snaps) {
ceph_flush_snaps(ci, NULL);
}
@@ -3133,7 +3176,7 @@ static void handle_cap_grant(struct inode *inode,
* try to invalidate (once). (If there are dirty buffers, we
* will invalidate _after_ writeback.)
*/
- if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */
+ if (S_ISREG(inode->i_mode) && /* don't invalidate readdir cache */
((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
(newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
!(ci->i_wrbuffer_ref || ci->i_wb_ref)) {
@@ -3297,11 +3340,12 @@ static void handle_cap_grant(struct inode *inode,
ceph_cap_string(cap->issued),
ceph_cap_string(newcaps),
ceph_cap_string(revoking));
- if (revoking & used & CEPH_CAP_FILE_BUFFER)
+ if (S_ISREG(inode->i_mode) &&
+ (revoking & used & CEPH_CAP_FILE_BUFFER))
writeback = true; /* initiate writeback; will delay ack */
- else if (revoking == CEPH_CAP_FILE_CACHE &&
- (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
- queue_invalidate)
+ else if (queue_invalidate &&
+ revoking == CEPH_CAP_FILE_CACHE &&
+ (newcaps & CEPH_CAP_FILE_LAZYIO) == 0)
; /* do nothing yet, invalidation will be queued */
else if (cap == ci->i_auth_cap)
check_caps = 1; /* check auth cap only */
@@ -3339,7 +3383,8 @@ static void handle_cap_grant(struct inode *inode,
if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
if (newcaps & ~extra_info->issued)
wake = true;
- kick_flushing_inode_caps(session->s_mdsc, session, inode);
+ ceph_kick_flushing_inode_caps(session, ci);
+ spin_unlock(&ci->i_ceph_lock);
up_read(&session->s_mdsc->snap_rwsem);
} else {
spin_unlock(&ci->i_ceph_lock);
@@ -3367,10 +3412,10 @@ static void handle_cap_grant(struct inode *inode,
wake_up_all(&ci->i_cap_wq);
if (check_caps == 1)
- ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
+ ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL,
session);
else if (check_caps == 2)
- ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
+ ceph_check_caps(ci, CHECK_CAPS_NOINVAL, session);
else
mutex_unlock(&session->s_mutex);
}
@@ -3619,8 +3664,6 @@ retry:
goto out_unlock;
if (target < 0) {
- if (cap->mds_wanted | cap->issued)
- ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
__ceph_remove_cap(cap, false);
goto out_unlock;
}
@@ -3668,7 +3711,7 @@ retry:
/* add placeholder for the export tagert */
int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
tcap = new_cap;
- ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0,
+ ceph_add_cap(inode, tsession, t_cap_id, issued, 0,
t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
if (!list_empty(&ci->i_cap_flush_list) &&
@@ -3773,7 +3816,7 @@ retry:
__ceph_caps_issued(ci, &issued);
issued |= __ceph_caps_dirty(ci);
- ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq,
+ ceph_add_cap(inode, session, cap_id, caps, wanted, seq, mseq,
realmino, CEPH_CAP_FLAG_AUTH, &new_cap);
ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
@@ -4047,7 +4090,6 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
{
struct inode *inode;
struct ceph_inode_info *ci;
- int flags = CHECK_CAPS_NODELAY;
dout("check_delayed_caps\n");
while (1) {
@@ -4067,7 +4109,7 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
if (inode) {
dout("check_delayed_caps on %p\n", inode);
- ceph_check_caps(ci, flags, NULL);
+ ceph_check_caps(ci, 0, NULL);
/* avoid calling iput_final() in tick thread */
ceph_async_iput(inode);
}
@@ -4092,7 +4134,7 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
ihold(inode);
dout("flush_dirty_caps %p\n", inode);
spin_unlock(&mdsc->cap_dirty_lock);
- ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL);
+ ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL);
iput(inode);
spin_lock(&mdsc->cap_dirty_lock);
}
@@ -4100,14 +4142,31 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
dout("flush_dirty_caps done\n");
}
-void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
+void __ceph_touch_fmode(struct ceph_inode_info *ci,
+ struct ceph_mds_client *mdsc, int fmode)
+{
+ unsigned long now = jiffies;
+ if (fmode & CEPH_FILE_MODE_RD)
+ ci->i_last_rd = now;
+ if (fmode & CEPH_FILE_MODE_WR)
+ ci->i_last_wr = now;
+ /* queue periodic check */
+ if (fmode &&
+ __ceph_is_any_real_caps(ci) &&
+ list_empty(&ci->i_cap_delay_list))
+ __cap_delay_requeue(mdsc, ci);
+}
+
+void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
{
int i;
int bits = (fmode << 1) | 1;
+ spin_lock(&ci->i_ceph_lock);
for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
if (bits & (1 << i))
- ci->i_nr_by_mode[i]++;
+ ci->i_nr_by_mode[i] += count;
}
+ spin_unlock(&ci->i_ceph_lock);
}
/*
@@ -4115,26 +4174,18 @@ void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
* we may need to release capabilities to the MDS (or schedule
* their delayed release).
*/
-void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
+void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)
{
- int i, last = 0;
+ int i;
int bits = (fmode << 1) | 1;
spin_lock(&ci->i_ceph_lock);
for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
if (bits & (1 << i)) {
- BUG_ON(ci->i_nr_by_mode[i] == 0);
- if (--ci->i_nr_by_mode[i] == 0)
- last++;
+ BUG_ON(ci->i_nr_by_mode[i] < count);
+ ci->i_nr_by_mode[i] -= count;
}
}
- dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n",
- &ci->vfs_inode, fmode,
- ci->i_nr_by_mode[0], ci->i_nr_by_mode[1],
- ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]);
spin_unlock(&ci->i_ceph_lock);
-
- if (last && ci->i_vino.snap == CEPH_NOSNAP)
- ceph_check_caps(ci, 0, NULL);
}
/*
@@ -4152,7 +4203,6 @@ int ceph_drop_caps_for_unlink(struct inode *inode)
if (inode->i_nlink == 1) {
drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
- ci->i_ceph_flags |= CEPH_I_NODELAY;
if (__ceph_caps_dirty(ci)) {
struct ceph_mds_client *mdsc =
ceph_inode_to_client(inode)->mdsc;
@@ -4208,8 +4258,6 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
if (force || (cap->issued & drop)) {
if (cap->issued & drop) {
int wanted = __ceph_caps_wanted(ci);
- if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0)
- wanted |= cap->mds_wanted;
dout("encode_inode_release %p cap %p "
"%s -> %s, wanted %s -> %s\n", inode, cap,
ceph_cap_string(cap->issued),
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index fb7cabd98e7b..481ac97b4d25 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -218,10 +218,10 @@ static int mds_sessions_show(struct seq_file *s, void *ptr)
return 0;
}
-CEPH_DEFINE_SHOW_FUNC(mdsmap_show)
-CEPH_DEFINE_SHOW_FUNC(mdsc_show)
-CEPH_DEFINE_SHOW_FUNC(caps_show)
-CEPH_DEFINE_SHOW_FUNC(mds_sessions_show)
+DEFINE_SHOW_ATTRIBUTE(mdsmap);
+DEFINE_SHOW_ATTRIBUTE(mdsc);
+DEFINE_SHOW_ATTRIBUTE(caps);
+DEFINE_SHOW_ATTRIBUTE(mds_sessions);
/*
@@ -281,25 +281,25 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
0400,
fsc->client->debugfs_dir,
fsc,
- &mdsmap_show_fops);
+ &mdsmap_fops);
fsc->debugfs_mds_sessions = debugfs_create_file("mds_sessions",
0400,
fsc->client->debugfs_dir,
fsc,
- &mds_sessions_show_fops);
+ &mds_sessions_fops);
fsc->debugfs_mdsc = debugfs_create_file("mdsc",
0400,
fsc->client->debugfs_dir,
fsc,
- &mdsc_show_fops);
+ &mdsc_fops);
fsc->debugfs_caps = debugfs_create_file("caps",
0400,
fsc->client->debugfs_dir,
fsc,
- &caps_show_fops);
+ &caps_fops);
}
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index d0cd0aba5843..d594c2627430 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -335,8 +335,11 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
ctx->pos = 2;
}
- /* can we use the dcache? */
spin_lock(&ci->i_ceph_lock);
+ /* request Fx cap. if have Fx, we don't need to release Fs cap
+ * for later create/unlink. */
+ __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_WR);
+ /* can we use the dcache? */
if (ceph_test_mount_opt(fsc, DCACHE) &&
!ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
ceph_snap(inode) != CEPH_SNAPDIR &&
@@ -752,7 +755,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
struct ceph_dentry_info *di = ceph_dentry(dentry);
spin_lock(&ci->i_ceph_lock);
- dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
+ dout(" dir %p flags are 0x%lx\n", dir, ci->i_ceph_flags);
if (strncmp(dentry->d_name.name,
fsc->mount_options->snapdir_name,
dentry->d_name.len) &&
@@ -760,6 +763,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
ceph_test_mount_opt(fsc, DCACHE) &&
__ceph_dir_is_complete(ci) &&
(__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
+ __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_RD);
spin_unlock(&ci->i_ceph_lock);
dout(" dir %p complete, -ENOENT\n", dir);
d_add(dentry, NULL);
@@ -1036,6 +1040,78 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
return err;
}
+static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req)
+{
+ int result = req->r_err ? req->r_err :
+ le32_to_cpu(req->r_reply_info.head->result);
+
+ if (result == -EJUKEBOX)
+ goto out;
+
+ /* If op failed, mark everyone involved for errors */
+ if (result) {
+ int pathlen;
+ u64 base;
+ char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
+ &base, 0);
+
+ /* mark error on parent + clear complete */
+ mapping_set_error(req->r_parent->i_mapping, result);
+ ceph_dir_clear_complete(req->r_parent);
+
+ /* drop the dentry -- we don't know its status */
+ if (!d_unhashed(req->r_dentry))
+ d_drop(req->r_dentry);
+
+ /* mark inode itself for an error (since metadata is bogus) */
+ mapping_set_error(req->r_old_inode->i_mapping, result);
+
+ pr_warn("ceph: async unlink failure path=(%llx)%s result=%d!\n",
+ base, IS_ERR(path) ? "<<bad>>" : path, result);
+ ceph_mdsc_free_path(path, pathlen);
+ }
+out:
+ iput(req->r_old_inode);
+ ceph_mdsc_release_dir_caps(req);
+}
+
+static int get_caps_for_async_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_dentry_info *di;
+ int got = 0, want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_UNLINK;
+
+ spin_lock(&ci->i_ceph_lock);
+ if ((__ceph_caps_issued(ci, NULL) & want) == want) {
+ ceph_take_cap_refs(ci, want, false);
+ got = want;
+ }
+ spin_unlock(&ci->i_ceph_lock);
+
+ /* If we didn't get anything, return 0 */
+ if (!got)
+ return 0;
+
+ spin_lock(&dentry->d_lock);
+ di = ceph_dentry(dentry);
+ /*
+ * - We are holding Fx, which implies Fs caps.
+ * - Only support async unlink for primary linkage
+ */
+ if (atomic_read(&ci->i_shared_gen) != di->lease_shared_gen ||
+ !(di->flags & CEPH_DENTRY_PRIMARY_LINK))
+ want = 0;
+ spin_unlock(&dentry->d_lock);
+
+ /* Do we still want what we've got? */
+ if (want == got)
+ return got;
+
+ ceph_put_cap_refs(ci, got);
+ return 0;
+}
+
/*
* rmdir and unlink are differ only by the metadata op code
*/
@@ -1045,6 +1121,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
struct ceph_mds_client *mdsc = fsc->mdsc;
struct inode *inode = d_inode(dentry);
struct ceph_mds_request *req;
+ bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS);
int err = -EROFS;
int op;
@@ -1059,6 +1136,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
} else
goto out;
+retry:
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
if (IS_ERR(req)) {
err = PTR_ERR(req);
@@ -1067,13 +1145,39 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
req->r_parent = dir;
- set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
req->r_inode_drop = ceph_drop_caps_for_unlink(inode);
- err = ceph_mdsc_do_request(mdsc, dir, req);
- if (!err && !req->r_reply_info.head->is_dentry)
- d_delete(dentry);
+
+ if (try_async && op == CEPH_MDS_OP_UNLINK &&
+ (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
+ dout("async unlink on %lu/%.*s caps=%s", dir->i_ino,
+ dentry->d_name.len, dentry->d_name.name,
+ ceph_cap_string(req->r_dir_caps));
+ set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
+ req->r_callback = ceph_async_unlink_cb;
+ req->r_old_inode = d_inode(dentry);
+ ihold(req->r_old_inode);
+ err = ceph_mdsc_submit_request(mdsc, dir, req);
+ if (!err) {
+ /*
+ * We have enough caps, so we assume that the unlink
+ * will succeed. Fix up the target inode and dcache.
+ */
+ drop_nlink(inode);
+ d_delete(dentry);
+ } else if (err == -EJUKEBOX) {
+ try_async = false;
+ ceph_mdsc_put_request(req);
+ goto retry;
+ }
+ } else {
+ set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+ err = ceph_mdsc_do_request(mdsc, dir, req);
+ if (!err && !req->r_reply_info.head->is_dentry)
+ d_delete(dentry);
+ }
+
ceph_mdsc_put_request(req);
out:
return err;
@@ -1411,6 +1515,7 @@ void ceph_invalidate_dentry_lease(struct dentry *dentry)
spin_lock(&dentry->d_lock);
di->time = jiffies;
di->lease_shared_gen = 0;
+ di->flags &= ~CEPH_DENTRY_PRIMARY_LINK;
__dentry_lease_unlist(di);
spin_unlock(&dentry->d_lock);
}
@@ -1520,7 +1625,8 @@ static int __dir_lease_try_check(const struct dentry *dentry)
/*
* Check if directory-wide content lease/cap is valid.
*/
-static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
+static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry,
+ struct ceph_mds_client *mdsc)
{
struct ceph_inode_info *ci = ceph_inode(dir);
int valid;
@@ -1528,7 +1634,10 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
spin_lock(&ci->i_ceph_lock);
valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
- shared_gen = atomic_read(&ci->i_shared_gen);
+ if (valid) {
+ __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_RD);
+ shared_gen = atomic_read(&ci->i_shared_gen);
+ }
spin_unlock(&ci->i_ceph_lock);
if (valid) {
struct ceph_dentry_info *di;
@@ -1554,6 +1663,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
int valid = 0;
struct dentry *parent;
struct inode *dir, *inode;
+ struct ceph_mds_client *mdsc;
if (flags & LOOKUP_RCU) {
parent = READ_ONCE(dentry->d_parent);
@@ -1570,6 +1680,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
dout("d_revalidate %p '%pd' inode %p offset 0x%llx\n", dentry,
dentry, inode, ceph_dentry(dentry)->offset);
+ mdsc = ceph_sb_to_client(dir->i_sb)->mdsc;
+
/* always trust cached snapped dentries, snapdir dentry */
if (ceph_snap(dir) != CEPH_NOSNAP) {
dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry,
@@ -1581,7 +1693,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
valid = dentry_lease_is_valid(dentry, flags);
if (valid == -ECHILD)
return valid;
- if (valid || dir_lease_is_valid(dir, dentry)) {
+ if (valid || dir_lease_is_valid(dir, dentry, mdsc)) {
if (inode)
valid = ceph_is_any_caps(inode);
else
@@ -1590,8 +1702,6 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
}
if (!valid) {
- struct ceph_mds_client *mdsc =
- ceph_sb_to_client(dir->i_sb)->mdsc;
struct ceph_mds_request *req;
int op, err;
u32 mask;
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index b6bfa94332c3..79dc06881e78 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -315,6 +315,11 @@ static struct dentry *__get_parent(struct super_block *sb,
req->r_num_caps = 1;
err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (err) {
+ ceph_mdsc_put_request(req);
+ return ERR_PTR(err);
+ }
+
inode = req->r_target_inode;
if (inode)
ihold(inode);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 5a478cd06e11..4a5ccbb7e808 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -212,10 +212,8 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
if (isdir) {
struct ceph_dir_file_info *dfi =
kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
- if (!dfi) {
- ceph_put_fmode(ci, fmode); /* clean up */
+ if (!dfi)
return -ENOMEM;
- }
file->private_data = dfi;
fi = &dfi->file_info;
@@ -223,15 +221,15 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
dfi->readdir_cache_idx = -1;
} else {
fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
- if (!fi) {
- ceph_put_fmode(ci, fmode); /* clean up */
+ if (!fi)
return -ENOMEM;
- }
file->private_data = fi;
}
+ ceph_get_fmode(ci, fmode, 1);
fi->fmode = fmode;
+
spin_lock_init(&fi->rw_contexts_lock);
INIT_LIST_HEAD(&fi->rw_contexts);
fi->meta_err = errseq_sample(&ci->i_meta_err);
@@ -263,7 +261,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
case S_IFLNK:
dout("init_file %p %p 0%o (symlink)\n", inode, file,
inode->i_mode);
- ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
break;
default:
@@ -273,7 +270,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
* we need to drop the open ref now, since we don't
* have .release set to ceph_release.
*/
- ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
BUG_ON(inode->i_fop->release == ceph_release);
/* call the proper open fop */
@@ -285,14 +281,15 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
/*
* try renew caps after session gets killed.
*/
-int ceph_renew_caps(struct inode *inode)
+int ceph_renew_caps(struct inode *inode, int fmode)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_request *req;
int err, flags, wanted;
spin_lock(&ci->i_ceph_lock);
+ __ceph_touch_fmode(ci, mdsc, fmode);
wanted = __ceph_caps_file_wanted(ci);
if (__ceph_is_any_real_caps(ci) &&
(!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
@@ -326,7 +323,6 @@ int ceph_renew_caps(struct inode *inode)
req->r_inode = inode;
ihold(inode);
req->r_num_caps = 1;
- req->r_fmode = -1;
err = ceph_mdsc_do_request(mdsc, NULL, req);
ceph_mdsc_put_request(req);
@@ -372,9 +368,6 @@ int ceph_open(struct inode *inode, struct file *file)
/* trivially open snapdir */
if (ceph_snap(inode) == CEPH_SNAPDIR) {
- spin_lock(&ci->i_ceph_lock);
- __ceph_get_fmode(ci, fmode);
- spin_unlock(&ci->i_ceph_lock);
return ceph_init_file(inode, file, fmode);
}
@@ -392,7 +385,7 @@ int ceph_open(struct inode *inode, struct file *file)
dout("open %p fmode %d want %s issued %s using existing\n",
inode, fmode, ceph_cap_string(wanted),
ceph_cap_string(issued));
- __ceph_get_fmode(ci, fmode);
+ __ceph_touch_fmode(ci, mdsc, fmode);
spin_unlock(&ci->i_ceph_lock);
/* adjust wanted? */
@@ -404,7 +397,7 @@ int ceph_open(struct inode *inode, struct file *file)
return ceph_init_file(inode, file, fmode);
} else if (ceph_snap(inode) != CEPH_NOSNAP &&
(ci->i_snap_caps & wanted) == wanted) {
- __ceph_get_fmode(ci, fmode);
+ __ceph_touch_fmode(ci, mdsc, fmode);
spin_unlock(&ci->i_ceph_lock);
return ceph_init_file(inode, file, fmode);
}
@@ -430,6 +423,236 @@ out:
return err;
}
+/* Clone the layout from a synchronous create, if the dir now has Dc caps */
+static void
+cache_file_layout(struct inode *dst, struct inode *src)
+{
+ struct ceph_inode_info *cdst = ceph_inode(dst);
+ struct ceph_inode_info *csrc = ceph_inode(src);
+
+ spin_lock(&cdst->i_ceph_lock);
+ if ((__ceph_caps_issued(cdst, NULL) & CEPH_CAP_DIR_CREATE) &&
+ !ceph_file_layout_is_valid(&cdst->i_cached_layout)) {
+ memcpy(&cdst->i_cached_layout, &csrc->i_layout,
+ sizeof(cdst->i_cached_layout));
+ rcu_assign_pointer(cdst->i_cached_layout.pool_ns,
+ ceph_try_get_string(csrc->i_layout.pool_ns));
+ }
+ spin_unlock(&cdst->i_ceph_lock);
+}
+
+/*
+ * Try to set up an async create. We need caps, a file layout, and inode number,
+ * and either a lease on the dentry or complete dir info. If any of those
+ * criteria are not satisfied, then return false and the caller can go
+ * synchronous.
+ */
+static int try_prep_async_create(struct inode *dir, struct dentry *dentry,
+ struct ceph_file_layout *lo, u64 *pino)
+{
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+ int got = 0, want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE;
+ u64 ino;
+
+ spin_lock(&ci->i_ceph_lock);
+ /* No auth cap means no chance for Dc caps */
+ if (!ci->i_auth_cap)
+ goto no_async;
+
+ /* Any delegated inos? */
+ if (xa_empty(&ci->i_auth_cap->session->s_delegated_inos))
+ goto no_async;
+
+ if (!ceph_file_layout_is_valid(&ci->i_cached_layout))
+ goto no_async;
+
+ if ((__ceph_caps_issued(ci, NULL) & want) != want)
+ goto no_async;
+
+ if (d_in_lookup(dentry)) {
+ if (!__ceph_dir_is_complete(ci))
+ goto no_async;
+ spin_lock(&dentry->d_lock);
+ di->lease_shared_gen = atomic_read(&ci->i_shared_gen);
+ spin_unlock(&dentry->d_lock);
+ } else if (atomic_read(&ci->i_shared_gen) !=
+ READ_ONCE(di->lease_shared_gen)) {
+ goto no_async;
+ }
+
+ ino = ceph_get_deleg_ino(ci->i_auth_cap->session);
+ if (!ino)
+ goto no_async;
+
+ *pino = ino;
+ ceph_take_cap_refs(ci, want, false);
+ memcpy(lo, &ci->i_cached_layout, sizeof(*lo));
+ rcu_assign_pointer(lo->pool_ns,
+ ceph_try_get_string(ci->i_cached_layout.pool_ns));
+ got = want;
+no_async:
+ spin_unlock(&ci->i_ceph_lock);
+ return got;
+}
+
+static void restore_deleg_ino(struct inode *dir, u64 ino)
+{
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_mds_session *s = NULL;
+
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_auth_cap)
+ s = ceph_get_mds_session(ci->i_auth_cap->session);
+ spin_unlock(&ci->i_ceph_lock);
+ if (s) {
+ int err = ceph_restore_deleg_ino(s, ino);
+ if (err)
+ pr_warn("ceph: unable to restore delegated ino 0x%llx to session: %d\n",
+ ino, err);
+ ceph_put_mds_session(s);
+ }
+}
+
+static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req)
+{
+ int result = req->r_err ? req->r_err :
+ le32_to_cpu(req->r_reply_info.head->result);
+
+ if (result == -EJUKEBOX)
+ goto out;
+
+ mapping_set_error(req->r_parent->i_mapping, result);
+
+ if (result) {
+ struct dentry *dentry = req->r_dentry;
+ int pathlen;
+ u64 base;
+ char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
+ &base, 0);
+
+ ceph_dir_clear_complete(req->r_parent);
+ if (!d_unhashed(dentry))
+ d_drop(dentry);
+
+ /* FIXME: start returning I/O errors on all accesses? */
+ pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n",
+ base, IS_ERR(path) ? "<<bad>>" : path, result);
+ ceph_mdsc_free_path(path, pathlen);
+ }
+
+ if (req->r_target_inode) {
+ struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
+ u64 ino = ceph_vino(req->r_target_inode).ino;
+
+ if (req->r_deleg_ino != ino)
+ pr_warn("%s: inode number mismatch! err=%d deleg_ino=0x%llx target=0x%llx\n",
+ __func__, req->r_err, req->r_deleg_ino, ino);
+ mapping_set_error(req->r_target_inode->i_mapping, result);
+
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
+ ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE;
+ wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT);
+ }
+ ceph_kick_flushing_inode_caps(req->r_session, ci);
+ spin_unlock(&ci->i_ceph_lock);
+ } else {
+ pr_warn("%s: no req->r_target_inode for 0x%llx\n", __func__,
+ req->r_deleg_ino);
+ }
+out:
+ ceph_mdsc_release_dir_caps(req);
+}
+
+static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
+ struct file *file, umode_t mode,
+ struct ceph_mds_request *req,
+ struct ceph_acl_sec_ctx *as_ctx,
+ struct ceph_file_layout *lo)
+{
+ int ret;
+ char xattr_buf[4];
+ struct ceph_mds_reply_inode in = { };
+ struct ceph_mds_reply_info_in iinfo = { .in = &in };
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct inode *inode;
+ struct timespec64 now;
+ struct ceph_vino vino = { .ino = req->r_deleg_ino,
+ .snap = CEPH_NOSNAP };
+
+ ktime_get_real_ts64(&now);
+
+ inode = ceph_get_inode(dentry->d_sb, vino);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ iinfo.inline_version = CEPH_INLINE_NONE;
+ iinfo.change_attr = 1;
+ ceph_encode_timespec64(&iinfo.btime, &now);
+
+ iinfo.xattr_len = ARRAY_SIZE(xattr_buf);
+ iinfo.xattr_data = xattr_buf;
+ memset(iinfo.xattr_data, 0, iinfo.xattr_len);
+
+ in.ino = cpu_to_le64(vino.ino);
+ in.snapid = cpu_to_le64(CEPH_NOSNAP);
+ in.version = cpu_to_le64(1); // ???
+ in.cap.caps = in.cap.wanted = cpu_to_le32(CEPH_CAP_ALL_FILE);
+ in.cap.cap_id = cpu_to_le64(1);
+ in.cap.realm = cpu_to_le64(ci->i_snap_realm->ino);
+ in.cap.flags = CEPH_CAP_FLAG_AUTH;
+ in.ctime = in.mtime = in.atime = iinfo.btime;
+ in.mode = cpu_to_le32((u32)mode);
+ in.truncate_seq = cpu_to_le32(1);
+ in.truncate_size = cpu_to_le64(-1ULL);
+ in.xattr_version = cpu_to_le64(1);
+ in.uid = cpu_to_le32(from_kuid(&init_user_ns, current_fsuid()));
+ in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_mode & S_ISGID ?
+ dir->i_gid : current_fsgid()));
+ in.nlink = cpu_to_le32(1);
+ in.max_size = cpu_to_le64(lo->stripe_unit);
+
+ ceph_file_layout_to_legacy(lo, &in.layout);
+
+ ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session,
+ req->r_fmode, NULL);
+ if (ret) {
+ dout("%s failed to fill inode: %d\n", __func__, ret);
+ ceph_dir_clear_complete(dir);
+ if (!d_unhashed(dentry))
+ d_drop(dentry);
+ if (inode->i_state & I_NEW)
+ discard_new_inode(inode);
+ } else {
+ struct dentry *dn;
+
+ dout("%s d_adding new inode 0x%llx to 0x%lx/%s\n", __func__,
+ vino.ino, dir->i_ino, dentry->d_name.name);
+ ceph_dir_clear_ordered(dir);
+ ceph_init_inode_acls(inode, as_ctx);
+ if (inode->i_state & I_NEW) {
+ /*
+ * If it's not I_NEW, then someone created this before
+ * we got here. Assume the server is aware of it at
+ * that point and don't worry about setting
+ * CEPH_I_ASYNC_CREATE.
+ */
+ ceph_inode(inode)->i_ceph_flags = CEPH_I_ASYNC_CREATE;
+ unlock_new_inode(inode);
+ }
+ if (d_in_lookup(dentry) || d_really_is_negative(dentry)) {
+ if (!d_unhashed(dentry))
+ d_drop(dentry);
+ dn = d_splice_alias(inode, dentry);
+ WARN_ON_ONCE(dn && dn != dentry);
+ }
+ file->f_mode |= FMODE_CREATED;
+ ret = finish_open(file, dentry, ceph_open);
+ }
+ return ret;
+}
/*
* Do a lookup + open with a single request. If we get a non-existent
@@ -443,6 +666,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
struct ceph_mds_request *req;
struct dentry *dn;
struct ceph_acl_sec_ctx as_ctx = {};
+ bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS);
int mask;
int err;
@@ -466,7 +690,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
/* If it's not being looked up, it's negative */
return -ENOENT;
}
-
+retry:
/* do the open */
req = prepare_open_request(dir->i_sb, flags, mode);
if (IS_ERR(req)) {
@@ -475,21 +699,43 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
}
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
+ mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
+ if (ceph_security_xattr_wanted(dir))
+ mask |= CEPH_CAP_XATTR_SHARED;
+ req->r_args.open.mask = cpu_to_le32(mask);
+ req->r_parent = dir;
+
if (flags & O_CREAT) {
+ struct ceph_file_layout lo;
+
req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
if (as_ctx.pagelist) {
req->r_pagelist = as_ctx.pagelist;
as_ctx.pagelist = NULL;
}
+ if (try_async &&
+ (req->r_dir_caps =
+ try_prep_async_create(dir, dentry, &lo,
+ &req->r_deleg_ino))) {
+ set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
+ req->r_args.open.flags |= cpu_to_le32(CEPH_O_EXCL);
+ req->r_callback = ceph_async_create_cb;
+ err = ceph_mdsc_submit_request(mdsc, dir, req);
+ if (!err) {
+ err = ceph_finish_async_create(dir, dentry,
+ file, mode, req,
+ &as_ctx, &lo);
+ } else if (err == -EJUKEBOX) {
+ restore_deleg_ino(dir, req->r_deleg_ino);
+ ceph_mdsc_put_request(req);
+ try_async = false;
+ goto retry;
+ }
+ goto out_req;
+ }
}
- mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
- if (ceph_security_xattr_wanted(dir))
- mask |= CEPH_CAP_XATTR_SHARED;
- req->r_args.open.mask = cpu_to_le32(mask);
-
- req->r_parent = dir;
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
err = ceph_mdsc_do_request(mdsc,
(flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
@@ -518,14 +764,15 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
} else {
dout("atomic_open finish_open on dn %p\n", dn);
if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
- ceph_init_inode_acls(d_inode(dentry), &as_ctx);
+ struct inode *newino = d_inode(dentry);
+
+ cache_file_layout(dir, newino);
+ ceph_init_inode_acls(newino, &as_ctx);
file->f_mode |= FMODE_CREATED;
}
err = finish_open(file, dentry, ceph_open);
}
out_req:
- if (!req->r_err && req->r_target_inode)
- ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
ceph_mdsc_put_request(req);
out_ctx:
ceph_release_acl_sec_ctx(&as_ctx);
@@ -542,7 +789,7 @@ int ceph_release(struct inode *inode, struct file *file)
dout("release inode %p dir file %p\n", inode, file);
WARN_ON(!list_empty(&dfi->file_info.rw_contexts));
- ceph_put_fmode(ci, dfi->file_info.fmode);
+ ceph_put_fmode(ci, dfi->file_info.fmode, 1);
if (dfi->last_readdir)
ceph_mdsc_put_request(dfi->last_readdir);
@@ -554,7 +801,8 @@ int ceph_release(struct inode *inode, struct file *file)
dout("release inode %p regular file %p\n", inode, file);
WARN_ON(!list_empty(&fi->rw_contexts));
- ceph_put_fmode(ci, fi->fmode);
+ ceph_put_fmode(ci, fi->fmode, 1);
+
kmem_cache_free(ceph_file_cachep, fi);
}
@@ -1567,7 +1815,7 @@ retry_snap:
if (dirty)
__mark_inode_dirty(inode, dirty);
if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
- ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL);
+ ceph_check_caps(ci, 0, NULL);
}
dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
@@ -1944,6 +2192,71 @@ static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode,
return 0;
}
+static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off,
+ struct ceph_inode_info *dst_ci, u64 *dst_off,
+ struct ceph_fs_client *fsc,
+ size_t len, unsigned int flags)
+{
+ struct ceph_object_locator src_oloc, dst_oloc;
+ struct ceph_object_id src_oid, dst_oid;
+ size_t bytes = 0;
+ u64 src_objnum, src_objoff, dst_objnum, dst_objoff;
+ u32 src_objlen, dst_objlen;
+ u32 object_size = src_ci->i_layout.object_size;
+ int ret;
+
+ src_oloc.pool = src_ci->i_layout.pool_id;
+ src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns);
+ dst_oloc.pool = dst_ci->i_layout.pool_id;
+ dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns);
+
+ while (len >= object_size) {
+ ceph_calc_file_object_mapping(&src_ci->i_layout, *src_off,
+ object_size, &src_objnum,
+ &src_objoff, &src_objlen);
+ ceph_calc_file_object_mapping(&dst_ci->i_layout, *dst_off,
+ object_size, &dst_objnum,
+ &dst_objoff, &dst_objlen);
+ ceph_oid_init(&src_oid);
+ ceph_oid_printf(&src_oid, "%llx.%08llx",
+ src_ci->i_vino.ino, src_objnum);
+ ceph_oid_init(&dst_oid);
+ ceph_oid_printf(&dst_oid, "%llx.%08llx",
+ dst_ci->i_vino.ino, dst_objnum);
+ /* Do an object remote copy */
+ ret = ceph_osdc_copy_from(&fsc->client->osdc,
+ src_ci->i_vino.snap, 0,
+ &src_oid, &src_oloc,
+ CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
+ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE,
+ &dst_oid, &dst_oloc,
+ CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
+ CEPH_OSD_OP_FLAG_FADVISE_DONTNEED,
+ dst_ci->i_truncate_seq,
+ dst_ci->i_truncate_size,
+ CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ);
+ if (ret) {
+ if (ret == -EOPNOTSUPP) {
+ fsc->have_copy_from2 = false;
+ pr_notice("OSDs don't support copy-from2; disabling copy offload\n");
+ }
+ dout("ceph_osdc_copy_from returned %d\n", ret);
+ if (!bytes)
+ bytes = ret;
+ goto out;
+ }
+ len -= object_size;
+ bytes += object_size;
+ *src_off += object_size;
+ *dst_off += object_size;
+ }
+
+out:
+ ceph_oloc_destroy(&src_oloc);
+ ceph_oloc_destroy(&dst_oloc);
+ return bytes;
+}
+
static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
struct file *dst_file, loff_t dst_off,
size_t len, unsigned int flags)
@@ -1954,14 +2267,11 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
struct ceph_inode_info *dst_ci = ceph_inode(dst_inode);
struct ceph_cap_flush *prealloc_cf;
struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode);
- struct ceph_object_locator src_oloc, dst_oloc;
- struct ceph_object_id src_oid, dst_oid;
- loff_t endoff = 0, size;
- ssize_t ret = -EIO;
+ loff_t size;
+ ssize_t ret = -EIO, bytes;
u64 src_objnum, dst_objnum, src_objoff, dst_objoff;
- u32 src_objlen, dst_objlen, object_size;
+ u32 src_objlen, dst_objlen;
int src_got = 0, dst_got = 0, err, dirty;
- bool do_final_copy = false;
if (src_inode->i_sb != dst_inode->i_sb) {
struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode);
@@ -2039,22 +2349,14 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
if (ret < 0)
goto out_caps;
- size = i_size_read(dst_inode);
- endoff = dst_off + len;
-
/* Drop dst file cached pages */
ret = invalidate_inode_pages2_range(dst_inode->i_mapping,
dst_off >> PAGE_SHIFT,
- endoff >> PAGE_SHIFT);
+ (dst_off + len) >> PAGE_SHIFT);
if (ret < 0) {
dout("Failed to invalidate inode pages (%zd)\n", ret);
ret = 0; /* XXX */
}
- src_oloc.pool = src_ci->i_layout.pool_id;
- src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns);
- dst_oloc.pool = dst_ci->i_layout.pool_id;
- dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns);
-
ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
src_ci->i_layout.object_size,
&src_objnum, &src_objoff, &src_objlen);
@@ -2073,6 +2375,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
* starting at the src_off
*/
if (src_objoff) {
+ dout("Initial partial copy of %u bytes\n", src_objlen);
+
/*
* we need to temporarily drop all caps as we'll be calling
* {read,write}_iter, which will get caps again.
@@ -2080,8 +2384,9 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
ret = do_splice_direct(src_file, &src_off, dst_file,
&dst_off, src_objlen, flags);
- if (ret < 0) {
- dout("do_splice_direct returned %d\n", err);
+ /* Abort on short copies or on error */
+ if (ret < src_objlen) {
+ dout("Failed partial copy (%zd)\n", ret);
goto out;
}
len -= ret;
@@ -2094,65 +2399,27 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
if (err < 0)
goto out_caps;
}
- object_size = src_ci->i_layout.object_size;
- while (len >= object_size) {
- ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
- object_size, &src_objnum,
- &src_objoff, &src_objlen);
- ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off,
- object_size, &dst_objnum,
- &dst_objoff, &dst_objlen);
- ceph_oid_init(&src_oid);
- ceph_oid_printf(&src_oid, "%llx.%08llx",
- src_ci->i_vino.ino, src_objnum);
- ceph_oid_init(&dst_oid);
- ceph_oid_printf(&dst_oid, "%llx.%08llx",
- dst_ci->i_vino.ino, dst_objnum);
- /* Do an object remote copy */
- err = ceph_osdc_copy_from(
- &src_fsc->client->osdc,
- src_ci->i_vino.snap, 0,
- &src_oid, &src_oloc,
- CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
- CEPH_OSD_OP_FLAG_FADVISE_NOCACHE,
- &dst_oid, &dst_oloc,
- CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
- CEPH_OSD_OP_FLAG_FADVISE_DONTNEED,
- dst_ci->i_truncate_seq, dst_ci->i_truncate_size,
- CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ);
- if (err) {
- if (err == -EOPNOTSUPP) {
- src_fsc->have_copy_from2 = false;
- pr_notice("OSDs don't support copy-from2; disabling copy offload\n");
- }
- dout("ceph_osdc_copy_from returned %d\n", err);
- if (!ret)
- ret = err;
- goto out_caps;
- }
- len -= object_size;
- src_off += object_size;
- dst_off += object_size;
- ret += object_size;
- }
- if (len)
- /* We still need one final local copy */
- do_final_copy = true;
+ size = i_size_read(dst_inode);
+ bytes = ceph_do_objects_copy(src_ci, &src_off, dst_ci, &dst_off,
+ src_fsc, len, flags);
+ if (bytes <= 0) {
+ if (!ret)
+ ret = bytes;
+ goto out_caps;
+ }
+ dout("Copied %zu bytes out of %zu\n", bytes, len);
+ len -= bytes;
+ ret += bytes;
file_update_time(dst_file);
inode_inc_iversion_raw(dst_inode);
- if (endoff > size) {
- int caps_flags = 0;
-
+ if (dst_off > size) {
/* Let the MDS know about dst file size change */
- if (ceph_quota_is_max_bytes_approaching(dst_inode, endoff))
- caps_flags |= CHECK_CAPS_NODELAY;
- if (ceph_inode_set_size(dst_inode, endoff))
- caps_flags |= CHECK_CAPS_AUTHONLY;
- if (caps_flags)
- ceph_check_caps(dst_ci, caps_flags, NULL);
+ if (ceph_inode_set_size(dst_inode, dst_off) ||
+ ceph_quota_is_max_bytes_approaching(dst_inode, dst_off))
+ ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY, NULL);
}
/* Mark Fw dirty */
spin_lock(&dst_ci->i_ceph_lock);
@@ -2165,15 +2432,18 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
out_caps:
put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
- if (do_final_copy) {
- err = do_splice_direct(src_file, &src_off, dst_file,
- &dst_off, len, flags);
- if (err < 0) {
- dout("do_splice_direct returned %d\n", err);
- goto out;
- }
- len -= err;
- ret += err;
+ /*
+ * Do the final manual copy if we still have some bytes left, unless
+ * there were errors in remote object copies (len >= object_size).
+ */
+ if (len && (len < src_ci->i_layout.object_size)) {
+ dout("Final partial copy of %zu bytes\n", len);
+ bytes = do_splice_direct(src_file, &src_off, dst_file,
+ &dst_off, len, flags);
+ if (bytes > 0)
+ ret += bytes;
+ else
+ dout("Failed partial copy (%zd)\n", bytes);
}
out:
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index d01710a16a4a..7fef94fd1e55 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -82,10 +82,14 @@ struct inode *ceph_get_snapdir(struct inode *parent)
inode->i_mode = parent->i_mode;
inode->i_uid = parent->i_uid;
inode->i_gid = parent->i_gid;
+ inode->i_mtime = parent->i_mtime;
+ inode->i_ctime = parent->i_ctime;
+ inode->i_atime = parent->i_atime;
inode->i_op = &ceph_snapdir_iops;
inode->i_fop = &ceph_snapdir_fops;
ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
ci->i_rbytes = 0;
+ ci->i_btime = ceph_inode(parent)->i_btime;
if (inode->i_state & I_NEW)
unlock_new_inode(inode);
@@ -447,6 +451,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_max_files = 0;
memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
+ memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL);
ci->i_fragtree = RB_ROOT;
@@ -471,13 +476,13 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_prealloc_cap_flush = NULL;
INIT_LIST_HEAD(&ci->i_cap_flush_list);
init_waitqueue_head(&ci->i_cap_wq);
- ci->i_hold_caps_min = 0;
ci->i_hold_caps_max = 0;
INIT_LIST_HEAD(&ci->i_cap_delay_list);
INIT_LIST_HEAD(&ci->i_cap_snaps);
ci->i_head_snapc = NULL;
ci->i_snap_caps = 0;
+ ci->i_last_rd = ci->i_last_wr = jiffies - 3600 * HZ;
for (i = 0; i < CEPH_FILE_MODE_BITS; i++)
ci->i_nr_by_mode[i] = 0;
@@ -496,6 +501,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_rdcache_ref = 0;
ci->i_wr_ref = 0;
ci->i_wb_ref = 0;
+ ci->i_fx_ref = 0;
ci->i_wrbuffer_ref = 0;
ci->i_wrbuffer_ref_head = 0;
atomic_set(&ci->i_filelock_ref, 0);
@@ -586,6 +592,7 @@ void ceph_evict_inode(struct inode *inode)
ceph_buffer_put(ci->i_xattrs.prealloc_blob);
ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
+ ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
}
static inline blkcnt_t calc_inode_blocks(u64 size)
@@ -636,7 +643,7 @@ int ceph_fill_file_size(struct inode *inode, int issued,
if ((issued & (CEPH_CAP_FILE_CACHE|
CEPH_CAP_FILE_BUFFER)) ||
mapping_mapped(inode->i_mapping) ||
- __ceph_caps_file_wanted(ci)) {
+ __ceph_is_file_opened(ci)) {
ci->i_truncate_pending++;
queue_trunc = 1;
}
@@ -727,11 +734,11 @@ void ceph_fill_file_time(struct inode *inode, int issued,
* Populate an inode based on info from mds. May be called on new or
* existing inodes.
*/
-static int fill_inode(struct inode *inode, struct page *locked_page,
- struct ceph_mds_reply_info_in *iinfo,
- struct ceph_mds_reply_dirfrag *dirinfo,
- struct ceph_mds_session *session, int cap_fmode,
- struct ceph_cap_reservation *caps_reservation)
+int ceph_fill_inode(struct inode *inode, struct page *locked_page,
+ struct ceph_mds_reply_info_in *iinfo,
+ struct ceph_mds_reply_dirfrag *dirinfo,
+ struct ceph_mds_session *session, int cap_fmode,
+ struct ceph_cap_reservation *caps_reservation)
{
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_mds_reply_inode *info = iinfo->in;
@@ -748,7 +755,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
bool new_version = false;
bool fill_inline = false;
- dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
+ dout("%s %p ino %llx.%llx v %llu had %llu\n", __func__,
inode, ceph_vinop(inode), le64_to_cpu(info->version),
ci->i_version);
@@ -769,7 +776,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
if (iinfo->xattr_len > 4) {
xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
if (!xattr_blob)
- pr_err("fill_inode ENOMEM xattr blob %d bytes\n",
+ pr_err("%s ENOMEM xattr blob %d bytes\n", __func__,
iinfo->xattr_len);
}
@@ -932,8 +939,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
spin_unlock(&ci->i_ceph_lock);
if (symlen != i_size_read(inode)) {
- pr_err("fill_inode %llx.%llx BAD symlink "
- "size %lld\n", ceph_vinop(inode),
+ pr_err("%s %llx.%llx BAD symlink "
+ "size %lld\n", __func__,
+ ceph_vinop(inode),
i_size_read(inode));
i_size_write(inode, symlen);
inode->i_blocks = calc_inode_blocks(symlen);
@@ -957,7 +965,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
inode->i_fop = &ceph_dir_fops;
break;
default:
- pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
+ pr_err("%s %llx.%llx BAD mode 0%o\n", __func__,
ceph_vinop(inode), inode->i_mode);
}
@@ -966,7 +974,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
if (ceph_snap(inode) == CEPH_NOSNAP) {
ceph_add_cap(inode, session,
le64_to_cpu(info->cap.cap_id),
- cap_fmode, info_caps,
+ info_caps,
le32_to_cpu(info->cap.wanted),
le32_to_cpu(info->cap.seq),
le32_to_cpu(info->cap.mseq),
@@ -991,13 +999,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
dout(" %p got snap_caps %s\n", inode,
ceph_cap_string(info_caps));
ci->i_snap_caps |= info_caps;
- if (cap_fmode >= 0)
- __ceph_get_fmode(ci, cap_fmode);
}
- } else if (cap_fmode >= 0) {
- pr_warn("mds issued no caps on %llx.%llx\n",
- ceph_vinop(inode));
- __ceph_get_fmode(ci, cap_fmode);
}
if (iinfo->inline_version > 0 &&
@@ -1009,6 +1011,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
fill_inline = true;
}
+ if (cap_fmode >= 0) {
+ if (!info_caps)
+ pr_warn("mds issued no caps on %llx.%llx\n",
+ ceph_vinop(inode));
+ __ceph_touch_fmode(ci, mdsc, cap_fmode);
+ }
+
spin_unlock(&ci->i_ceph_lock);
if (fill_inline)
@@ -1050,6 +1059,7 @@ static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
struct ceph_mds_session **old_lease_session)
{
struct ceph_dentry_info *di = ceph_dentry(dentry);
+ unsigned mask = le16_to_cpu(lease->mask);
long unsigned duration = le32_to_cpu(lease->duration_ms);
long unsigned ttl = from_time + (duration * HZ) / 1000;
long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
@@ -1061,8 +1071,13 @@ static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
if (ceph_snap(dir) != CEPH_NOSNAP)
return;
+ if (mask & CEPH_LEASE_PRIMARY_LINK)
+ di->flags |= CEPH_DENTRY_PRIMARY_LINK;
+ else
+ di->flags &= ~CEPH_DENTRY_PRIMARY_LINK;
+
di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen);
- if (duration == 0) {
+ if (!(mask & CEPH_LEASE_VALID)) {
__ceph_dentry_dir_lease_touch(di);
return;
}
@@ -1239,10 +1254,9 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
struct inode *dir = req->r_parent;
if (dir) {
- err = fill_inode(dir, NULL,
- &rinfo->diri, rinfo->dirfrag,
- session, -1,
- &req->r_caps_reservation);
+ err = ceph_fill_inode(dir, NULL, &rinfo->diri,
+ rinfo->dirfrag, session, -1,
+ &req->r_caps_reservation);
if (err < 0)
goto done;
} else {
@@ -1307,13 +1321,14 @@ retry_lookup:
goto done;
}
- err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL,
- session,
+ err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti,
+ NULL, session,
(!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
+ !test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) &&
rinfo->head->result == 0) ? req->r_fmode : -1,
&req->r_caps_reservation);
if (err < 0) {
- pr_err("fill_inode badness %p %llx.%llx\n",
+ pr_err("ceph_fill_inode badness %p %llx.%llx\n",
in, ceph_vinop(in));
if (in->i_state & I_NEW)
discard_new_inode(in);
@@ -1500,10 +1515,11 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
dout("new_inode badness got %d\n", err);
continue;
}
- rc = fill_inode(in, NULL, &rde->inode, NULL, session,
- -1, &req->r_caps_reservation);
+ rc = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
+ -1, &req->r_caps_reservation);
if (rc < 0) {
- pr_err("fill_inode badness on %p got %d\n", in, rc);
+ pr_err("ceph_fill_inode badness on %p got %d\n",
+ in, rc);
err = rc;
if (in->i_state & I_NEW) {
ihold(in);
@@ -1707,10 +1723,10 @@ retry_lookup:
}
}
- ret = fill_inode(in, NULL, &rde->inode, NULL, session,
- -1, &req->r_caps_reservation);
+ ret = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
+ -1, &req->r_caps_reservation);
if (ret < 0) {
- pr_err("fill_inode badness on %p\n", in);
+ pr_err("ceph_fill_inode badness on %p\n", in);
if (d_really_is_negative(dn)) {
/* avoid calling iput_final() in mds
* dispatch threads */
@@ -1972,7 +1988,7 @@ retry:
mutex_unlock(&ci->i_truncate_mutex);
if (wrbuffer_refs == 0)
- ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+ ceph_check_caps(ci, 0, NULL);
wake_up_all(&ci->i_cap_wq);
}
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index c90f03beb15d..6e061bf62ad4 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -243,11 +243,13 @@ static long ceph_ioctl_lazyio(struct file *file)
struct ceph_file_info *fi = file->private_data;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
spin_lock(&ci->i_ceph_lock);
fi->fmode |= CEPH_FILE_MODE_LAZY;
ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++;
+ __ceph_touch_fmode(ci, mdsc, fi->fmode);
spin_unlock(&ci->i_ceph_lock);
dout("ioctl_layzio: file %p marked lazy\n", file);
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 544e9e85b120..d6b9166e71e4 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -210,6 +210,21 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
return 0;
}
+static int try_unlock_file(struct file *file, struct file_lock *fl)
+{
+ int err;
+ unsigned int orig_flags = fl->fl_flags;
+ fl->fl_flags |= FL_EXISTS;
+ err = locks_lock_file_wait(file, fl);
+ fl->fl_flags = orig_flags;
+ if (err == -ENOENT) {
+ if (!(orig_flags & FL_EXISTS))
+ err = 0;
+ return err;
+ }
+ return 1;
+}
+
/**
* Attempt to set an fcntl lock.
* For now, this just goes away to the server. Later it may be more awesome.
@@ -255,9 +270,15 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
else
lock_cmd = CEPH_LOCK_UNLOCK;
+ if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type) {
+ err = try_unlock_file(file, fl);
+ if (err <= 0)
+ return err;
+ }
+
err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl);
if (!err) {
- if (op == CEPH_MDS_OP_SETFILELOCK) {
+ if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK != fl->fl_type) {
dout("mds locked, locking locally\n");
err = posix_lock_file(file, fl, NULL);
if (err) {
@@ -311,9 +332,15 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
else
lock_cmd = CEPH_LOCK_UNLOCK;
+ if (F_UNLCK == fl->fl_type) {
+ err = try_unlock_file(file, fl);
+ if (err <= 0)
+ return err;
+ }
+
err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
inode, lock_cmd, wait, fl);
- if (!err) {
+ if (!err && F_UNLCK != fl->fl_type) {
err = locks_lock_file_wait(file, fl);
if (err) {
ceph_lock_message(CEPH_LOCK_FLOCK,
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index bbbbddf71326..486f91f9685b 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -415,21 +415,121 @@ bad:
return -EIO;
}
+
+#if BITS_PER_LONG == 64
+
+#define DELEGATED_INO_AVAILABLE xa_mk_value(1)
+
+static int ceph_parse_deleg_inos(void **p, void *end,
+ struct ceph_mds_session *s)
+{
+ u32 sets;
+
+ ceph_decode_32_safe(p, end, sets, bad);
+ dout("got %u sets of delegated inodes\n", sets);
+ while (sets--) {
+ u64 start, len, ino;
+
+ ceph_decode_64_safe(p, end, start, bad);
+ ceph_decode_64_safe(p, end, len, bad);
+ while (len--) {
+ int err = xa_insert(&s->s_delegated_inos, ino = start++,
+ DELEGATED_INO_AVAILABLE,
+ GFP_KERNEL);
+ if (!err) {
+ dout("added delegated inode 0x%llx\n",
+ start - 1);
+ } else if (err == -EBUSY) {
+ pr_warn("ceph: MDS delegated inode 0x%llx more than once.\n",
+ start - 1);
+ } else {
+ return err;
+ }
+ }
+ }
+ return 0;
+bad:
+ return -EIO;
+}
+
+u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
+{
+ unsigned long ino;
+ void *val;
+
+ xa_for_each(&s->s_delegated_inos, ino, val) {
+ val = xa_erase(&s->s_delegated_inos, ino);
+ if (val == DELEGATED_INO_AVAILABLE)
+ return ino;
+ }
+ return 0;
+}
+
+int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
+{
+ return xa_insert(&s->s_delegated_inos, ino, DELEGATED_INO_AVAILABLE,
+ GFP_KERNEL);
+}
+#else /* BITS_PER_LONG == 64 */
+/*
+ * FIXME: xarrays can't handle 64-bit indexes on a 32-bit arch. For now, just
+ * ignore delegated_inos on 32 bit arch. Maybe eventually add xarrays for top
+ * and bottom words?
+ */
+static int ceph_parse_deleg_inos(void **p, void *end,
+ struct ceph_mds_session *s)
+{
+ u32 sets;
+
+ ceph_decode_32_safe(p, end, sets, bad);
+ if (sets)
+ ceph_decode_skip_n(p, end, sets * 2 * sizeof(__le64), bad);
+ return 0;
+bad:
+ return -EIO;
+}
+
+u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
+{
+ return 0;
+}
+
+int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
+{
+ return 0;
+}
+#endif /* BITS_PER_LONG == 64 */
+
/*
* parse create results
*/
static int parse_reply_info_create(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
- u64 features)
+ u64 features, struct ceph_mds_session *s)
{
+ int ret;
+
if (features == (u64)-1 ||
(features & CEPH_FEATURE_REPLY_CREATE_INODE)) {
- /* Malformed reply? */
if (*p == end) {
+ /* Malformed reply? */
info->has_create_ino = false;
- } else {
+ } else if (test_bit(CEPHFS_FEATURE_DELEG_INO, &s->s_features)) {
+ u8 struct_v, struct_compat;
+ u32 len;
+
info->has_create_ino = true;
+ ceph_decode_8_safe(p, end, struct_v, bad);
+ ceph_decode_8_safe(p, end, struct_compat, bad);
+ ceph_decode_32_safe(p, end, len, bad);
+ ceph_decode_64_safe(p, end, info->ino, bad);
+ ret = ceph_parse_deleg_inos(p, end, s);
+ if (ret)
+ return ret;
+ } else {
+ /* legacy */
ceph_decode_64_safe(p, end, info->ino, bad);
+ info->has_create_ino = true;
}
} else {
if (*p != end)
@@ -448,7 +548,7 @@ bad:
*/
static int parse_reply_info_extra(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
- u64 features)
+ u64 features, struct ceph_mds_session *s)
{
u32 op = le32_to_cpu(info->head->op);
@@ -457,7 +557,7 @@ static int parse_reply_info_extra(void **p, void *end,
else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP)
return parse_reply_info_readdir(p, end, info, features);
else if (op == CEPH_MDS_OP_CREATE)
- return parse_reply_info_create(p, end, info, features);
+ return parse_reply_info_create(p, end, info, features, s);
else
return -EIO;
}
@@ -465,7 +565,7 @@ static int parse_reply_info_extra(void **p, void *end,
/*
* parse entire mds reply
*/
-static int parse_reply_info(struct ceph_msg *msg,
+static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
struct ceph_mds_reply_info_parsed *info,
u64 features)
{
@@ -490,7 +590,7 @@ static int parse_reply_info(struct ceph_msg *msg,
ceph_decode_32_safe(&p, end, len, bad);
if (len > 0) {
ceph_decode_need(&p, end, len, bad);
- err = parse_reply_info_extra(&p, p+len, info, features);
+ err = parse_reply_info_extra(&p, p+len, info, features, s);
if (err < 0)
goto out_bad;
}
@@ -558,6 +658,7 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
if (refcount_dec_and_test(&s->s_ref)) {
if (s->s_auth.authorizer)
ceph_auth_destroy_authorizer(s->s_auth.authorizer);
+ xa_destroy(&s->s_delegated_inos);
kfree(s);
}
}
@@ -645,6 +746,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
refcount_set(&s->s_ref, 1);
INIT_LIST_HEAD(&s->s_waiting);
INIT_LIST_HEAD(&s->s_unsafe);
+ xa_init(&s->s_delegated_inos);
s->s_num_cap_releases = 0;
s->s_cap_reconnect = 0;
s->s_cap_iterator = NULL;
@@ -699,6 +801,7 @@ void ceph_mdsc_release_request(struct kref *kref)
struct ceph_mds_request *req = container_of(kref,
struct ceph_mds_request,
r_kref);
+ ceph_mdsc_release_dir_caps(req);
destroy_reply_info(&req->r_reply_info);
if (req->r_request)
ceph_msg_put(req->r_request);
@@ -736,7 +839,7 @@ void ceph_mdsc_release_request(struct kref *kref)
put_request_session(req);
ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
WARN_ON_ONCE(!list_empty(&req->r_wait));
- kfree(req);
+ kmem_cache_free(ceph_mds_request_cachep, req);
}
DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node)
@@ -793,8 +896,13 @@ static void __register_request(struct ceph_mds_client *mdsc,
mdsc->oldest_tid = req->r_tid;
if (dir) {
+ struct ceph_inode_info *ci = ceph_inode(dir);
+
ihold(dir);
req->r_unsafe_dir = dir;
+ spin_lock(&ci->i_unsafe_lock);
+ list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
+ spin_unlock(&ci->i_unsafe_lock);
}
}
@@ -822,8 +930,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
erase_request(&mdsc->request_tree, req);
- if (req->r_unsafe_dir &&
- test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
+ if (req->r_unsafe_dir) {
struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
spin_lock(&ci->i_unsafe_lock);
list_del_init(&req->r_unsafe_dir_item);
@@ -1407,8 +1514,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
dout("removing cap %p, ci is %p, inode is %p\n",
cap, ci, &ci->vfs_inode);
spin_lock(&ci->i_ceph_lock);
- if (cap->mds_wanted | cap->issued)
- ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
__ceph_remove_cap(cap, false);
if (!ci->i_auth_cap) {
struct ceph_cap_flush *cf;
@@ -1574,9 +1679,6 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
/* mds did not re-issue stale cap */
spin_lock(&ci->i_ceph_lock);
cap->issued = cap->implemented = CEPH_CAP_PIN;
- /* make sure mds knows what we want */
- if (__ceph_caps_file_wanted(ci) & ~cap->mds_wanted)
- ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
spin_unlock(&ci->i_ceph_lock);
}
} else if (ev == FORCE_RO) {
@@ -1772,7 +1874,8 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
}
/* The inode has cached pages, but it's no longer used.
* we can safely drop it */
- if (wanted == 0 && used == CEPH_CAP_FILE_CACHE &&
+ if (S_ISREG(inode->i_mode) &&
+ wanted == 0 && used == CEPH_CAP_FILE_CACHE &&
!(oissued & CEPH_CAP_FILE_CACHE)) {
used = 0;
oissued = 0;
@@ -2089,8 +2192,9 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
struct ceph_mds_request *
ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
{
- struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
+ struct ceph_mds_request *req;
+ req = kmem_cache_zalloc(ceph_mds_request_cachep, GFP_NOFS);
if (!req)
return ERR_PTR(-ENOMEM);
@@ -2368,7 +2472,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
head->op = cpu_to_le32(req->r_op);
head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid));
head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid));
- head->ino = 0;
+ head->ino = cpu_to_le64(req->r_deleg_ino);
head->args = req->r_args;
ceph_encode_filepath(&p, end, ino1, path1);
@@ -2382,7 +2486,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
if (req->r_inode_drop)
releases += ceph_encode_inode_release(&p,
req->r_inode ? req->r_inode : d_inode(req->r_dentry),
- mds, req->r_inode_drop, req->r_inode_unless, 0);
+ mds, req->r_inode_drop, req->r_inode_unless,
+ req->r_op == CEPH_MDS_OP_READDIR);
if (req->r_dentry_drop)
releases += ceph_encode_dentry_release(&p, req->r_dentry,
req->r_parent, mds, req->r_dentry_drop,
@@ -2522,12 +2627,13 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
flags |= CEPH_MDS_FLAG_REPLAY;
+ if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags))
+ flags |= CEPH_MDS_FLAG_ASYNC;
if (req->r_parent)
flags |= CEPH_MDS_FLAG_WANT_DENTRY;
rhead->flags = cpu_to_le32(flags);
rhead->num_fwd = req->r_num_fwd;
rhead->num_retry = req->r_attempts - 1;
- rhead->ino = 0;
dout(" r_parent = %p\n", req->r_parent);
return 0;
@@ -2573,7 +2679,7 @@ static void __do_request(struct ceph_mds_client *mdsc,
if (req->r_timeout &&
time_after_eq(jiffies, req->r_started + req->r_timeout)) {
dout("do_request timed out\n");
- err = -EIO;
+ err = -ETIMEDOUT;
goto finish;
}
if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
@@ -2605,6 +2711,10 @@ static void __do_request(struct ceph_mds_client *mdsc,
mds = __choose_mds(mdsc, req, &random);
if (mds < 0 ||
ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
+ if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) {
+ err = -EJUKEBOX;
+ goto finish;
+ }
dout("do_request no mds or not active, waiting for map\n");
list_add(&req->r_wait, &mdsc->waiting_for_map);
return;
@@ -2629,6 +2739,15 @@ static void __do_request(struct ceph_mds_client *mdsc,
err = -EACCES;
goto out_session;
}
+ /*
+ * We cannot queue async requests since the caps and delegated
+ * inodes are bound to the session. Just return -EJUKEBOX and
+ * let the caller retry a sync request in that case.
+ */
+ if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) {
+ err = -EJUKEBOX;
+ goto out_session;
+ }
if (session->s_state == CEPH_MDS_SESSION_NEW ||
session->s_state == CEPH_MDS_SESSION_CLOSING) {
__open_session(mdsc, session);
@@ -2709,19 +2828,43 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
struct ceph_mds_request *req)
{
- int err;
+ int err = 0;
/* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
if (req->r_inode)
ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
if (req->r_parent) {
- ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
+ struct ceph_inode_info *ci = ceph_inode(req->r_parent);
+ int fmode = (req->r_op & CEPH_MDS_OP_WRITE) ?
+ CEPH_FILE_MODE_WR : CEPH_FILE_MODE_RD;
+ spin_lock(&ci->i_ceph_lock);
+ ceph_take_cap_refs(ci, CEPH_CAP_PIN, false);
+ __ceph_touch_fmode(ci, mdsc, fmode);
+ spin_unlock(&ci->i_ceph_lock);
ihold(req->r_parent);
}
if (req->r_old_dentry_dir)
ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
CEPH_CAP_PIN);
+ if (req->r_inode) {
+ err = ceph_wait_on_async_create(req->r_inode);
+ if (err) {
+ dout("%s: wait for async create returned: %d\n",
+ __func__, err);
+ return err;
+ }
+ }
+
+ if (!err && req->r_old_inode) {
+ err = ceph_wait_on_async_create(req->r_old_inode);
+ if (err) {
+ dout("%s: wait for async create returned: %d\n",
+ __func__, err);
+ return err;
+ }
+ }
+
dout("submit_request on %p for inode %p\n", req, dir);
mutex_lock(&mdsc->mutex);
__register_request(mdsc, req, dir);
@@ -2747,7 +2890,7 @@ static int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
if (timeleft > 0)
err = 0;
else if (!timeleft)
- err = -EIO; /* timed out */
+ err = -ETIMEDOUT; /* timed out */
else
err = timeleft; /* killed */
}
@@ -2935,22 +3078,14 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
} else {
set_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags);
list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
- if (req->r_unsafe_dir) {
- struct ceph_inode_info *ci =
- ceph_inode(req->r_unsafe_dir);
- spin_lock(&ci->i_unsafe_lock);
- list_add_tail(&req->r_unsafe_dir_item,
- &ci->i_unsafe_dirops);
- spin_unlock(&ci->i_unsafe_lock);
- }
}
dout("handle_reply tid %lld result %d\n", tid, result);
rinfo = &req->r_reply_info;
if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features))
- err = parse_reply_info(msg, rinfo, (u64)-1);
+ err = parse_reply_info(session, msg, rinfo, (u64)-1);
else
- err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
+ err = parse_reply_info(session, msg, rinfo, session->s_con.peer_features);
mutex_unlock(&mdsc->mutex);
mutex_lock(&session->s_mutex);
@@ -3249,6 +3384,17 @@ bad:
return;
}
+void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req)
+{
+ int dcaps;
+
+ dcaps = xchg(&req->r_dir_caps, 0);
+ if (dcaps) {
+ dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
+ ceph_put_cap_refs(ceph_inode(req->r_parent), dcaps);
+ }
+}
+
/*
* called under session->mutex.
*/
@@ -3276,9 +3422,14 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
continue;
if (req->r_attempts == 0)
continue; /* only old requests */
- if (req->r_session &&
- req->r_session->s_mds == session->s_mds)
- __send_request(mdsc, session, req, true);
+ if (!req->r_session)
+ continue;
+ if (req->r_session->s_mds != session->s_mds)
+ continue;
+
+ ceph_mdsc_release_dir_caps(req);
+
+ __send_request(mdsc, session, req, true);
}
mutex_unlock(&mdsc->mutex);
}
@@ -3362,7 +3513,7 @@ fail_msg:
/*
* Encode information about a cap for a reconnect with the MDS.
*/
-static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
+static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
void *arg)
{
union {
@@ -3385,6 +3536,15 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
cap->mseq = 0; /* and migrate_seq */
cap->cap_gen = cap->session->s_cap_gen;
+ /* These are lost when the session goes away */
+ if (S_ISDIR(inode->i_mode)) {
+ if (cap->issued & CEPH_CAP_DIR_CREATE) {
+ ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
+ memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
+ }
+ cap->issued &= ~CEPH_CAP_ANY_DIR_OPS;
+ }
+
if (recon_state->msg_version >= 2) {
rec.v2.cap_id = cpu_to_le64(cap->cap_id);
rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
@@ -3626,6 +3786,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
if (!reply)
goto fail_nomsg;
+ xa_destroy(&session->s_delegated_inos);
+
mutex_lock(&session->s_mutex);
session->s_state = CEPH_MDS_SESSION_RECONNECTING;
session->s_seq = 0;
@@ -3681,7 +3843,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
recon_state.msg_version = 2;
}
/* trsaverse this session's caps */
- err = ceph_iterate_session_caps(session, encode_caps_cb, &recon_state);
+ err = ceph_iterate_session_caps(session, reconnect_caps_cb, &recon_state);
spin_lock(&session->s_cap_lock);
session->s_cap_reconnect = 0;
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 27a7446e10d3..4e5be79bf080 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -23,8 +23,9 @@ enum ceph_feature_type {
CEPHFS_FEATURE_RECLAIM_CLIENT,
CEPHFS_FEATURE_LAZY_CAP_WANTED,
CEPHFS_FEATURE_MULTI_RECONNECT,
+ CEPHFS_FEATURE_DELEG_INO,
- CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_MULTI_RECONNECT,
+ CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_DELEG_INO,
};
/*
@@ -37,6 +38,7 @@ enum ceph_feature_type {
CEPHFS_FEATURE_REPLY_ENCODING, \
CEPHFS_FEATURE_LAZY_CAP_WANTED, \
CEPHFS_FEATURE_MULTI_RECONNECT, \
+ CEPHFS_FEATURE_DELEG_INO, \
\
CEPHFS_FEATURE_MAX, \
}
@@ -201,6 +203,7 @@ struct ceph_mds_session {
struct list_head s_waiting; /* waiting requests */
struct list_head s_unsafe; /* unsafe requests */
+ struct xarray s_delegated_inos;
};
/*
@@ -255,6 +258,7 @@ struct ceph_mds_request {
#define CEPH_MDS_R_GOT_RESULT (5) /* got a result */
#define CEPH_MDS_R_DID_PREPOPULATE (6) /* prepopulated readdir */
#define CEPH_MDS_R_PARENT_LOCKED (7) /* is r_parent->i_rwsem wlocked? */
+#define CEPH_MDS_R_ASYNC (8) /* async request */
unsigned long r_req_flags;
struct mutex r_fill_mutex;
@@ -263,6 +267,7 @@ struct ceph_mds_request {
int r_fmode; /* file mode, if expecting cap */
kuid_t r_uid;
kgid_t r_gid;
+ int r_request_release_offset;
struct timespec64 r_stamp;
/* for choosing which mds to send this request to */
@@ -280,12 +285,16 @@ struct ceph_mds_request {
int r_old_inode_drop, r_old_inode_unless;
struct ceph_msg *r_request; /* original request */
- int r_request_release_offset;
struct ceph_msg *r_reply;
struct ceph_mds_reply_info_parsed r_reply_info;
- struct page *r_locked_page;
int r_err;
+
+ struct page *r_locked_page;
+ int r_dir_caps;
+ int r_num_caps;
+ u32 r_readdir_offset;
+
unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */
unsigned long r_started; /* start time to measure timeout against */
unsigned long r_request_started; /* start time for mds request only,
@@ -304,6 +313,7 @@ struct ceph_mds_request {
int r_num_fwd; /* number of forward attempts */
int r_resend_mds; /* mds to resend to next, if any*/
u32 r_sent_on_mseq; /* cap mseq request was sent at*/
+ u64 r_deleg_ino;
struct list_head r_wait;
struct completion r_completion;
@@ -315,10 +325,8 @@ struct ceph_mds_request {
long long r_dir_release_cnt;
long long r_dir_ordered_cnt;
int r_readdir_cache_idx;
- u32 r_readdir_offset;
struct ceph_cap_reservation r_caps_reservation;
- int r_num_caps;
};
struct ceph_pool_perm {
@@ -488,6 +496,7 @@ extern int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
struct inode *dir,
struct ceph_mds_request *req);
+extern void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req);
static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
{
kref_get(&req->r_kref);
@@ -537,4 +546,15 @@ extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
extern int ceph_trim_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
int max_caps);
+
+static inline int ceph_wait_on_async_create(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT,
+ TASK_INTERRUPTIBLE);
+}
+
+extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session);
+extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino);
#endif
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index c7f150686a53..c9784eb1159a 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -155,6 +155,7 @@ enum {
Opt_acl,
Opt_quotadf,
Opt_copyfrom,
+ Opt_wsync,
};
enum ceph_recover_session_mode {
@@ -194,6 +195,7 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = {
fsparam_string ("snapdirname", Opt_snapdirname),
fsparam_string ("source", Opt_source),
fsparam_u32 ("wsize", Opt_wsize),
+ fsparam_flag_no ("wsync", Opt_wsync),
{}
};
@@ -444,6 +446,12 @@ static int ceph_parse_mount_param(struct fs_context *fc,
fc->sb_flags &= ~SB_POSIXACL;
}
break;
+ case Opt_wsync:
+ if (!result.negated)
+ fsopt->flags &= ~CEPH_MOUNT_OPT_ASYNC_DIROPS;
+ else
+ fsopt->flags |= CEPH_MOUNT_OPT_ASYNC_DIROPS;
+ break;
default:
BUG();
}
@@ -567,6 +575,9 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER)
seq_show_option(m, "recover_session", "clean");
+ if (fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)
+ seq_puts(m, ",nowsync");
+
if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
seq_printf(m, ",wsize=%u", fsopt->wsize);
if (fsopt->rsize != CEPH_MAX_READ_SIZE)
@@ -729,6 +740,7 @@ struct kmem_cache *ceph_cap_flush_cachep;
struct kmem_cache *ceph_dentry_cachep;
struct kmem_cache *ceph_file_cachep;
struct kmem_cache *ceph_dir_file_cachep;
+struct kmem_cache *ceph_mds_request_cachep;
static void ceph_inode_init_once(void *foo)
{
@@ -769,6 +781,10 @@ static int __init init_caches(void)
if (!ceph_dir_file_cachep)
goto bad_dir_file;
+ ceph_mds_request_cachep = KMEM_CACHE(ceph_mds_request, SLAB_MEM_SPREAD);
+ if (!ceph_mds_request_cachep)
+ goto bad_mds_req;
+
error = ceph_fscache_register();
if (error)
goto bad_fscache;
@@ -776,6 +792,8 @@ static int __init init_caches(void)
return 0;
bad_fscache:
+ kmem_cache_destroy(ceph_mds_request_cachep);
+bad_mds_req:
kmem_cache_destroy(ceph_dir_file_cachep);
bad_dir_file:
kmem_cache_destroy(ceph_file_cachep);
@@ -804,6 +822,7 @@ static void destroy_caches(void)
kmem_cache_destroy(ceph_dentry_cachep);
kmem_cache_destroy(ceph_file_cachep);
kmem_cache_destroy(ceph_dir_file_cachep);
+ kmem_cache_destroy(ceph_mds_request_cachep);
ceph_fscache_unregister();
}
@@ -1107,6 +1126,15 @@ static void ceph_free_fc(struct fs_context *fc)
static int ceph_reconfigure_fc(struct fs_context *fc)
{
+ struct ceph_parse_opts_ctx *pctx = fc->fs_private;
+ struct ceph_mount_options *fsopt = pctx->opts;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(fc->root->d_sb);
+
+ if (fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)
+ ceph_set_mount_opt(fsc, ASYNC_DIROPS);
+ else
+ ceph_clear_mount_opt(fsc, ASYNC_DIROPS);
+
sync_filesystem(fc->root->d_sb);
return 0;
}
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 037cdfb2ad4f..60aac3aee055 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -43,13 +43,16 @@
#define CEPH_MOUNT_OPT_MOUNTWAIT (1<<12) /* mount waits if no mds is up */
#define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */
#define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */
+#define CEPH_MOUNT_OPT_ASYNC_DIROPS (1<<15) /* allow async directory ops */
#define CEPH_MOUNT_OPT_DEFAULT \
(CEPH_MOUNT_OPT_DCACHE | \
CEPH_MOUNT_OPT_NOCOPYFROM)
#define ceph_set_mount_opt(fsc, opt) \
- (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
+ (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt
+#define ceph_clear_mount_opt(fsc, opt) \
+ (fsc)->mount_options->flags &= ~CEPH_MOUNT_OPT_##opt
#define ceph_test_mount_opt(fsc, opt) \
(!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
@@ -170,9 +173,9 @@ struct ceph_cap {
struct list_head caps_item;
};
-#define CHECK_CAPS_NODELAY 1 /* do not delay any further */
-#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */
-#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */
+#define CHECK_CAPS_AUTHONLY 1 /* only check auth cap */
+#define CHECK_CAPS_FLUSH 2 /* flush any dirty caps */
+#define CHECK_CAPS_NOINVAL 4 /* don't invalidate pagecache */
struct ceph_cap_flush {
u64 tid;
@@ -284,6 +287,7 @@ struct ceph_dentry_info {
#define CEPH_DENTRY_REFERENCED 1
#define CEPH_DENTRY_LEASE_LIST 2
#define CEPH_DENTRY_SHRINK_LIST 4
+#define CEPH_DENTRY_PRIMARY_LINK 8
struct ceph_inode_xattrs_info {
/*
@@ -315,13 +319,14 @@ struct ceph_inode_info {
u64 i_inline_version;
u32 i_time_warp_seq;
- unsigned i_ceph_flags;
+ unsigned long i_ceph_flags;
atomic64_t i_release_count;
atomic64_t i_ordered_count;
atomic64_t i_complete_seq[2];
struct ceph_dir_layout i_dir_layout;
struct ceph_file_layout i_layout;
+ struct ceph_file_layout i_cached_layout; // for async creates
char *i_symlink;
/* for dirs */
@@ -352,7 +357,6 @@ struct ceph_inode_info {
struct ceph_cap_flush *i_prealloc_cap_flush;
struct list_head i_cap_flush_list;
wait_queue_head_t i_cap_wq; /* threads waiting on a capability */
- unsigned long i_hold_caps_min; /* jiffies */
unsigned long i_hold_caps_max; /* jiffies */
struct list_head i_cap_delay_list; /* for delayed cap release to mds */
struct ceph_cap_reservation i_cap_migration_resv;
@@ -361,6 +365,8 @@ struct ceph_inode_info {
dirty|flushing caps */
unsigned i_snap_caps; /* cap bits for snapped files */
+ unsigned long i_last_rd;
+ unsigned long i_last_wr;
int i_nr_by_mode[CEPH_FILE_MODE_BITS]; /* open file counts */
struct mutex i_truncate_mutex;
@@ -375,7 +381,7 @@ struct ceph_inode_info {
/* held references to caps */
int i_pin_ref;
- int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref;
+ int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref, i_fx_ref;
int i_wrbuffer_ref, i_wrbuffer_ref_head;
atomic_t i_filelock_ref;
atomic_t i_shared_gen; /* increment each time we get FILE_SHARED */
@@ -511,18 +517,18 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
* Ceph inode.
*/
#define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */
-#define CEPH_I_NODELAY (1 << 1) /* do not delay cap release */
#define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */
#define CEPH_I_POOL_PERM (1 << 3) /* pool rd/wr bits are valid */
#define CEPH_I_POOL_RD (1 << 4) /* can read from pool */
#define CEPH_I_POOL_WR (1 << 5) /* can write to pool */
#define CEPH_I_SEC_INITED (1 << 6) /* security initialized */
-#define CEPH_I_CAP_DROPPED (1 << 7) /* caps were forcibly dropped */
-#define CEPH_I_KICK_FLUSH (1 << 8) /* kick flushing caps */
-#define CEPH_I_FLUSH_SNAPS (1 << 9) /* need flush snapss */
-#define CEPH_I_ERROR_WRITE (1 << 10) /* have seen write errors */
-#define CEPH_I_ERROR_FILELOCK (1 << 11) /* have seen file lock errors */
-#define CEPH_I_ODIRECT (1 << 12) /* inode in direct I/O mode */
+#define CEPH_I_KICK_FLUSH (1 << 7) /* kick flushing caps */
+#define CEPH_I_FLUSH_SNAPS (1 << 8) /* need flush snapss */
+#define CEPH_I_ERROR_WRITE (1 << 9) /* have seen write errors */
+#define CEPH_I_ERROR_FILELOCK (1 << 10) /* have seen file lock errors */
+#define CEPH_I_ODIRECT (1 << 11) /* inode in direct I/O mode */
+#define CEPH_ASYNC_CREATE_BIT (12) /* async create in flight for this */
+#define CEPH_I_ASYNC_CREATE (1 << CEPH_ASYNC_CREATE_BIT)
/*
* Masks of ceph inode work.
@@ -674,18 +680,12 @@ extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
extern int __ceph_caps_used(struct ceph_inode_info *ci);
-extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
-
-/*
- * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
- */
-static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
+static inline bool __ceph_is_file_opened(struct ceph_inode_info *ci)
{
- int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
- if (w & CEPH_CAP_FILE_BUFFER)
- w |= CEPH_CAP_FILE_EXCL; /* we want EXCL if dirty data */
- return w;
+ return ci->i_nr_by_mode[0];
}
+extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
+extern int __ceph_caps_wanted(struct ceph_inode_info *ci);
/* what the mds thinks we want */
extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check);
@@ -899,6 +899,9 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
}
/* inode.c */
+struct ceph_mds_reply_info_in;
+struct ceph_mds_reply_dirfrag;
+
extern const struct inode_operations ceph_file_iops;
extern struct inode *ceph_alloc_inode(struct super_block *sb);
@@ -914,6 +917,11 @@ extern void ceph_fill_file_time(struct inode *inode, int issued,
u64 time_warp_seq, struct timespec64 *ctime,
struct timespec64 *mtime,
struct timespec64 *atime);
+extern int ceph_fill_inode(struct inode *inode, struct page *locked_page,
+ struct ceph_mds_reply_info_in *iinfo,
+ struct ceph_mds_reply_dirfrag *dirinfo,
+ struct ceph_mds_session *session, int cap_fmode,
+ struct ceph_cap_reservation *caps_reservation);
extern int ceph_fill_trace(struct super_block *sb,
struct ceph_mds_request *req);
extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
@@ -1042,7 +1050,7 @@ extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx);
extern void ceph_add_cap(struct inode *inode,
struct ceph_mds_session *session, u64 cap_id,
- int fmode, unsigned issued, unsigned wanted,
+ unsigned issued, unsigned wanted,
unsigned cap, unsigned seq, u64 realmino, int flags,
struct ceph_cap **new_cap);
extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
@@ -1058,8 +1066,12 @@ extern void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session);
extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session);
+void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
+ struct ceph_inode_info *ci);
extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci,
int mds);
+extern void ceph_take_cap_refs(struct ceph_inode_info *ci, int caps,
+ bool snap_rwsem_locked);
extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
@@ -1084,8 +1096,10 @@ extern int ceph_try_get_caps(struct inode *inode,
int need, int want, bool nonblock, int *got);
/* for counting open files by mode */
-extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
-extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
+extern void ceph_get_fmode(struct ceph_inode_info *ci, int mode, int count);
+extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode, int count);
+extern void __ceph_touch_fmode(struct ceph_inode_info *ci,
+ struct ceph_mds_client *mdsc, int fmode);
/* addr.c */
extern const struct address_space_operations ceph_aops;
@@ -1097,7 +1111,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
/* file.c */
extern const struct file_operations ceph_file_fops;
-extern int ceph_renew_caps(struct inode *inode);
+extern int ceph_renew_caps(struct inode *inode, int fmode);
extern int ceph_open(struct inode *inode, struct file *file);
extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
struct file *file, unsigned flags, umode_t mode);
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 22cf04fb32d3..604f65f4b6c5 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -202,7 +202,7 @@ config CIFS_SMB_DIRECT
help
Enables SMB Direct support for SMB 3.0, 3.02 and 3.1.1.
SMB Direct allows transferring SMB packets over RDMA. If unsure,
- say N.
+ say Y.
config CIFS_FSCACHE
bool "Provide CIFS client caching support"
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 276e4b5ea8e0..916567d770f5 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -323,10 +323,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
atomic_read(&server->smbd_conn->send_credits),
atomic_read(&server->smbd_conn->receive_credits),
server->smbd_conn->receive_credit_target);
- seq_printf(m, "\nPending send_pending: %x "
- "send_payload_pending: %x",
- atomic_read(&server->smbd_conn->send_pending),
- atomic_read(&server->smbd_conn->send_payload_pending));
+ seq_printf(m, "\nPending send_pending: %x ",
+ atomic_read(&server->smbd_conn->send_pending));
seq_printf(m, "\nReceive buffers count_receive_queue: %x "
"count_empty_packet_queue: %x",
server->smbd_conn->count_receive_queue,
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 94e3ed4850b5..c31f362fa098 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1208,6 +1208,10 @@ static ssize_t cifs_copy_file_range(struct file *src_file, loff_t off,
{
unsigned int xid = get_xid();
ssize_t rc;
+ struct cifsFileInfo *cfile = dst_file->private_data;
+
+ if (cfile->swapfile)
+ return -EOPNOTSUPP;
rc = cifs_file_copychunk_range(xid, src_file, off, dst_file, destoff,
len, flags);
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 0d956360e984..05dd3dea684b 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -426,7 +426,8 @@ struct smb_version_operations {
/* generate new lease key */
void (*new_lease_key)(struct cifs_fid *);
int (*generate_signingkey)(struct cifs_ses *);
- int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *);
+ int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *,
+ bool allocate_crypto);
int (*set_integrity)(const unsigned int, struct cifs_tcon *tcon,
struct cifsFileInfo *src_file);
int (*enum_snapshots)(const unsigned int xid, struct cifs_tcon *tcon,
@@ -1312,6 +1313,7 @@ struct cifsFileInfo {
struct tcon_link *tlink;
unsigned int f_flags;
bool invalidHandle:1; /* file closed via session abend */
+ bool swapfile:1;
bool oplock_break_cancelled:1;
unsigned int oplock_epoch; /* epoch from the lease break */
__u32 oplock_level; /* oplock/lease level from the lease break */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 5920820bfbd0..0b1528edebcf 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -4808,6 +4808,60 @@ cifs_direct_io(struct kiocb *iocb, struct iov_iter *iter)
return -EINVAL;
}
+static int cifs_swap_activate(struct swap_info_struct *sis,
+ struct file *swap_file, sector_t *span)
+{
+ struct cifsFileInfo *cfile = swap_file->private_data;
+ struct inode *inode = swap_file->f_mapping->host;
+ unsigned long blocks;
+ long long isize;
+
+ cifs_dbg(FYI, "swap activate\n");
+
+ spin_lock(&inode->i_lock);
+ blocks = inode->i_blocks;
+ isize = inode->i_size;
+ spin_unlock(&inode->i_lock);
+ if (blocks*512 < isize) {
+ pr_warn("swap activate: swapfile has holes\n");
+ return -EINVAL;
+ }
+ *span = sis->pages;
+
+ printk_once(KERN_WARNING "Swap support over SMB3 is experimental\n");
+
+ /*
+ * TODO: consider adding ACL (or documenting how) to prevent other
+ * users (on this or other systems) from reading it
+ */
+
+
+ /* TODO: add sk_set_memalloc(inet) or similar */
+
+ if (cfile)
+ cfile->swapfile = true;
+ /*
+ * TODO: Since file already open, we can't open with DENY_ALL here
+ * but we could add call to grab a byte range lock to prevent others
+ * from reading or writing the file
+ */
+
+ return 0;
+}
+
+static void cifs_swap_deactivate(struct file *file)
+{
+ struct cifsFileInfo *cfile = file->private_data;
+
+ cifs_dbg(FYI, "swap deactivate\n");
+
+ /* TODO: undo sk_set_memalloc(inet) will eventually be needed */
+
+ if (cfile)
+ cfile->swapfile = false;
+
+ /* do we need to unpin (or unlock) the file */
+}
const struct address_space_operations cifs_addr_ops = {
.readpage = cifs_readpage,
@@ -4821,6 +4875,13 @@ const struct address_space_operations cifs_addr_ops = {
.direct_IO = cifs_direct_io,
.invalidatepage = cifs_invalidate_page,
.launder_page = cifs_launder_page,
+ /*
+ * TODO: investigate and if useful we could add an cifs_migratePage
+ * helper (under an CONFIG_MIGRATION) in the future, and also
+ * investigate and add an is_dirty_writeback helper if needed
+ */
+ .swap_activate = cifs_swap_activate,
+ .swap_deactivate = cifs_swap_deactivate,
};
/*
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 8d01ec2dca66..8fbbdcdad8ff 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -2026,6 +2026,10 @@ cifs_revalidate_mapping(struct inode *inode)
int rc;
unsigned long *flags = &CIFS_I(inode)->flags;
+ /* swapfiles are not supposed to be shared */
+ if (IS_SWAPFILE(inode))
+ return 0;
+
rc = wait_on_bit_lock_action(flags, CIFS_INO_LOCK, cifs_wait_bit_killable,
TASK_KILLABLE);
if (rc)
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 19e4a5d3b4ca..50f776a8d4ba 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -246,7 +246,7 @@ cifs_posix_to_fattr(struct cifs_fattr *fattr, struct smb2_posix_info *info,
*/
fattr->cf_mode = le32_to_cpu(info->Mode) & ~S_IFMT;
- cifs_dbg(VFS, "XXX dev %d, reparse %d, mode %o",
+ cifs_dbg(FYI, "posix fattr: dev %d, reparse %d, mode %o",
le32_to_cpu(info->DeviceId),
le32_to_cpu(info->ReparseTag),
le32_to_cpu(info->Mode));
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 0511aaf451d4..497afb0b9960 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -766,6 +766,20 @@ smb2_handle_cancelled_close(struct cifs_tcon *tcon, __u64 persistent_fid,
cifs_dbg(FYI, "%s: tc_count=%d\n", __func__, tcon->tc_count);
spin_lock(&cifs_tcp_ses_lock);
+ if (tcon->tc_count <= 0) {
+ struct TCP_Server_Info *server = NULL;
+
+ WARN_ONCE(tcon->tc_count < 0, "tcon refcount is negative");
+ spin_unlock(&cifs_tcp_ses_lock);
+
+ if (tcon->ses)
+ server = tcon->ses->server;
+
+ cifs_server_dbg(FYI, "tid=%u: tcon is closing, skipping async close retry of fid %llu %llu\n",
+ tcon->tid, persistent_fid, volatile_fid);
+
+ return 0;
+ }
tcon->tc_count++;
spin_unlock(&cifs_tcp_ses_lock);
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 4d1ff7b66fdc..087d5f14320b 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -55,9 +55,11 @@ extern struct cifs_ses *smb2_find_smb_ses(struct TCP_Server_Info *server,
extern struct cifs_tcon *smb2_find_smb_tcon(struct TCP_Server_Info *server,
__u64 ses_id, __u32 tid);
extern int smb2_calc_signature(struct smb_rqst *rqst,
- struct TCP_Server_Info *server);
+ struct TCP_Server_Info *server,
+ bool allocate_crypto);
extern int smb3_calc_signature(struct smb_rqst *rqst,
- struct TCP_Server_Info *server);
+ struct TCP_Server_Info *server,
+ bool allocate_crypto);
extern void smb2_echo_request(struct work_struct *work);
extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode);
extern bool smb2_is_valid_oplock_break(char *buffer,
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 20cc79e5c15d..1a6c227ada8f 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -41,14 +41,6 @@
#include "smb2glob.h"
static int
-smb2_crypto_shash_allocate(struct TCP_Server_Info *server)
-{
- return cifs_alloc_hash("hmac(sha256)",
- &server->secmech.hmacsha256,
- &server->secmech.sdeschmacsha256);
-}
-
-static int
smb3_crypto_shash_allocate(struct TCP_Server_Info *server)
{
struct cifs_secmech *p = &server->secmech;
@@ -219,7 +211,8 @@ smb2_find_smb_tcon(struct TCP_Server_Info *server, __u64 ses_id, __u32 tid)
}
int
-smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
+smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
+ bool allocate_crypto)
{
int rc;
unsigned char smb2_signature[SMB2_HMACSHA256_SIZE];
@@ -228,6 +221,8 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[0].iov_base;
struct cifs_ses *ses;
struct shash_desc *shash;
+ struct crypto_shash *hash;
+ struct sdesc *sdesc = NULL;
struct smb_rqst drqst;
ses = smb2_find_smb_ses(server, shdr->SessionId);
@@ -239,24 +234,32 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE);
memset(shdr->Signature, 0x0, SMB2_SIGNATURE_SIZE);
- rc = smb2_crypto_shash_allocate(server);
- if (rc) {
- cifs_server_dbg(VFS, "%s: sha256 alloc failed\n", __func__);
- return rc;
+ if (allocate_crypto) {
+ rc = cifs_alloc_hash("hmac(sha256)", &hash, &sdesc);
+ if (rc) {
+ cifs_server_dbg(VFS,
+ "%s: sha256 alloc failed\n", __func__);
+ return rc;
+ }
+ shash = &sdesc->shash;
+ } else {
+ hash = server->secmech.hmacsha256;
+ shash = &server->secmech.sdeschmacsha256->shash;
}
- rc = crypto_shash_setkey(server->secmech.hmacsha256,
- ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
+ rc = crypto_shash_setkey(hash, ses->auth_key.response,
+ SMB2_NTLMV2_SESSKEY_SIZE);
if (rc) {
- cifs_server_dbg(VFS, "%s: Could not update with response\n", __func__);
- return rc;
+ cifs_server_dbg(VFS,
+ "%s: Could not update with response\n",
+ __func__);
+ goto out;
}
- shash = &server->secmech.sdeschmacsha256->shash;
rc = crypto_shash_init(shash);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not init sha256", __func__);
- return rc;
+ goto out;
}
/*
@@ -271,9 +274,10 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
rc = crypto_shash_update(shash, iov[0].iov_base,
iov[0].iov_len);
if (rc) {
- cifs_server_dbg(VFS, "%s: Could not update with payload\n",
- __func__);
- return rc;
+ cifs_server_dbg(VFS,
+ "%s: Could not update with payload\n",
+ __func__);
+ goto out;
}
drqst.rq_iov++;
drqst.rq_nvec--;
@@ -283,6 +287,9 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
if (!rc)
memcpy(shdr->Signature, sigptr, SMB2_SIGNATURE_SIZE);
+out:
+ if (allocate_crypto)
+ cifs_free_hash(&hash, &sdesc);
return rc;
}
@@ -504,14 +511,17 @@ generate_smb311signingkey(struct cifs_ses *ses)
}
int
-smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
+smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
+ bool allocate_crypto)
{
int rc;
unsigned char smb3_signature[SMB2_CMACAES_SIZE];
unsigned char *sigptr = smb3_signature;
struct kvec *iov = rqst->rq_iov;
struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[0].iov_base;
- struct shash_desc *shash = &server->secmech.sdesccmacaes->shash;
+ struct shash_desc *shash;
+ struct crypto_shash *hash;
+ struct sdesc *sdesc = NULL;
struct smb_rqst drqst;
u8 key[SMB3_SIGN_KEY_SIZE];
@@ -519,14 +529,24 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
if (rc)
return 0;
+ if (allocate_crypto) {
+ rc = cifs_alloc_hash("cmac(aes)", &hash, &sdesc);
+ if (rc)
+ return rc;
+
+ shash = &sdesc->shash;
+ } else {
+ hash = server->secmech.cmacaes;
+ shash = &server->secmech.sdesccmacaes->shash;
+ }
+
memset(smb3_signature, 0x0, SMB2_CMACAES_SIZE);
memset(shdr->Signature, 0x0, SMB2_SIGNATURE_SIZE);
- rc = crypto_shash_setkey(server->secmech.cmacaes,
- key, SMB2_CMACAES_SIZE);
+ rc = crypto_shash_setkey(hash, key, SMB2_CMACAES_SIZE);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__);
- return rc;
+ goto out;
}
/*
@@ -537,7 +557,7 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
rc = crypto_shash_init(shash);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not init cmac aes\n", __func__);
- return rc;
+ goto out;
}
/*
@@ -554,7 +574,7 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
if (rc) {
cifs_server_dbg(VFS, "%s: Could not update with payload\n",
__func__);
- return rc;
+ goto out;
}
drqst.rq_iov++;
drqst.rq_nvec--;
@@ -564,6 +584,9 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
if (!rc)
memcpy(shdr->Signature, sigptr, SMB2_SIGNATURE_SIZE);
+out:
+ if (allocate_crypto)
+ cifs_free_hash(&hash, &sdesc);
return rc;
}
@@ -593,7 +616,7 @@ smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
return 0;
}
- rc = server->ops->calc_signature(rqst, server);
+ rc = server->ops->calc_signature(rqst, server, false);
return rc;
}
@@ -631,9 +654,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
memset(shdr->Signature, 0, SMB2_SIGNATURE_SIZE);
- mutex_lock(&server->srv_mutex);
- rc = server->ops->calc_signature(rqst, server);
- mutex_unlock(&server->srv_mutex);
+ rc = server->ops->calc_signature(rqst, server, true);
if (rc)
return rc;
diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 8da43a500686..1a5834a5d597 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -284,13 +284,10 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc)
request->sge[i].length,
DMA_TO_DEVICE);
- if (request->has_payload) {
- if (atomic_dec_and_test(&request->info->send_payload_pending))
- wake_up(&request->info->wait_send_payload_pending);
- } else {
- if (atomic_dec_and_test(&request->info->send_pending))
- wake_up(&request->info->wait_send_pending);
- }
+ if (atomic_dec_and_test(&request->info->send_pending))
+ wake_up(&request->info->wait_send_pending);
+
+ wake_up(&request->info->wait_post_send);
mempool_free(request, request->info->request_mempool);
}
@@ -383,27 +380,6 @@ static bool process_negotiation_response(
return true;
}
-/*
- * Check and schedule to send an immediate packet
- * This is used to extend credtis to remote peer to keep the transport busy
- */
-static void check_and_send_immediate(struct smbd_connection *info)
-{
- if (info->transport_status != SMBD_CONNECTED)
- return;
-
- info->send_immediate = true;
-
- /*
- * Promptly send a packet if our peer is running low on receive
- * credits
- */
- if (atomic_read(&info->receive_credits) <
- info->receive_credit_target - 1)
- queue_delayed_work(
- info->workqueue, &info->send_immediate_work, 0);
-}
-
static void smbd_post_send_credits(struct work_struct *work)
{
int ret = 0;
@@ -453,10 +429,16 @@ static void smbd_post_send_credits(struct work_struct *work)
info->new_credits_offered += ret;
spin_unlock(&info->lock_new_credits_offered);
- atomic_add(ret, &info->receive_credits);
-
- /* Check if we can post new receive and grant credits to peer */
- check_and_send_immediate(info);
+ /* Promptly send an immediate packet as defined in [MS-SMBD] 3.1.1.1 */
+ info->send_immediate = true;
+ if (atomic_read(&info->receive_credits) <
+ info->receive_credit_target - 1) {
+ if (info->keep_alive_requested == KEEP_ALIVE_PENDING ||
+ info->send_immediate) {
+ log_keep_alive(INFO, "send an empty message\n");
+ smbd_post_send_empty(info);
+ }
+ }
}
/* Called from softirq, when recv is done */
@@ -551,12 +533,6 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
info->keep_alive_requested = KEEP_ALIVE_PENDING;
}
- /*
- * Check if we need to send something to remote peer to
- * grant more credits or respond to KEEP_ALIVE packet
- */
- check_and_send_immediate(info);
-
return;
default:
@@ -749,7 +725,6 @@ static int smbd_post_send_negotiate_req(struct smbd_connection *info)
request->sge[0].addr,
request->sge[0].length, request->sge[0].lkey);
- request->has_payload = false;
atomic_inc(&info->send_pending);
rc = ib_post_send(info->id->qp, &send_wr, NULL);
if (!rc)
@@ -806,45 +781,96 @@ static int manage_keep_alive_before_sending(struct smbd_connection *info)
return 0;
}
-/*
- * Build and prepare the SMBD packet header
- * This function waits for avaialbe send credits and build a SMBD packet
- * header. The caller then optional append payload to the packet after
- * the header
- * intput values
- * size: the size of the payload
- * remaining_data_length: remaining data to send if this is part of a
- * fragmented packet
- * output values
- * request_out: the request allocated from this function
- * return values: 0 on success, otherwise actual error code returned
- */
-static int smbd_create_header(struct smbd_connection *info,
- int size, int remaining_data_length,
- struct smbd_request **request_out)
+/* Post the send request */
+static int smbd_post_send(struct smbd_connection *info,
+ struct smbd_request *request)
+{
+ struct ib_send_wr send_wr;
+ int rc, i;
+
+ for (i = 0; i < request->num_sge; i++) {
+ log_rdma_send(INFO,
+ "rdma_request sge[%d] addr=%llu length=%u\n",
+ i, request->sge[i].addr, request->sge[i].length);
+ ib_dma_sync_single_for_device(
+ info->id->device,
+ request->sge[i].addr,
+ request->sge[i].length,
+ DMA_TO_DEVICE);
+ }
+
+ request->cqe.done = send_done;
+
+ send_wr.next = NULL;
+ send_wr.wr_cqe = &request->cqe;
+ send_wr.sg_list = request->sge;
+ send_wr.num_sge = request->num_sge;
+ send_wr.opcode = IB_WR_SEND;
+ send_wr.send_flags = IB_SEND_SIGNALED;
+
+ rc = ib_post_send(info->id->qp, &send_wr, NULL);
+ if (rc) {
+ log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
+ smbd_disconnect_rdma_connection(info);
+ rc = -EAGAIN;
+ } else
+ /* Reset timer for idle connection after packet is sent */
+ mod_delayed_work(info->workqueue, &info->idle_timer_work,
+ info->keep_alive_interval*HZ);
+
+ return rc;
+}
+
+static int smbd_post_send_sgl(struct smbd_connection *info,
+ struct scatterlist *sgl, int data_length, int remaining_data_length)
{
+ int num_sgs;
+ int i, rc;
+ int header_length;
struct smbd_request *request;
struct smbd_data_transfer *packet;
- int header_length;
- int rc;
+ int new_credits;
+ struct scatterlist *sg;
+wait_credit:
/* Wait for send credits. A SMBD packet needs one credit */
rc = wait_event_interruptible(info->wait_send_queue,
atomic_read(&info->send_credits) > 0 ||
info->transport_status != SMBD_CONNECTED);
if (rc)
- return rc;
+ goto err_wait_credit;
+
+ if (info->transport_status != SMBD_CONNECTED) {
+ log_outgoing(ERR, "disconnected not sending on wait_credit\n");
+ rc = -EAGAIN;
+ goto err_wait_credit;
+ }
+ if (unlikely(atomic_dec_return(&info->send_credits) < 0)) {
+ atomic_inc(&info->send_credits);
+ goto wait_credit;
+ }
+
+wait_send_queue:
+ wait_event(info->wait_post_send,
+ atomic_read(&info->send_pending) < info->send_credit_target ||
+ info->transport_status != SMBD_CONNECTED);
if (info->transport_status != SMBD_CONNECTED) {
- log_outgoing(ERR, "disconnected not sending\n");
- return -EAGAIN;
+ log_outgoing(ERR, "disconnected not sending on wait_send_queue\n");
+ rc = -EAGAIN;
+ goto err_wait_send_queue;
+ }
+
+ if (unlikely(atomic_inc_return(&info->send_pending) >
+ info->send_credit_target)) {
+ atomic_dec(&info->send_pending);
+ goto wait_send_queue;
}
- atomic_dec(&info->send_credits);
request = mempool_alloc(info->request_mempool, GFP_KERNEL);
if (!request) {
rc = -ENOMEM;
- goto err;
+ goto err_alloc;
}
request->info = info;
@@ -852,8 +878,11 @@ static int smbd_create_header(struct smbd_connection *info,
/* Fill in the packet header */
packet = smbd_request_payload(request);
packet->credits_requested = cpu_to_le16(info->send_credit_target);
- packet->credits_granted =
- cpu_to_le16(manage_credits_prior_sending(info));
+
+ new_credits = manage_credits_prior_sending(info);
+ atomic_add(new_credits, &info->receive_credits);
+ packet->credits_granted = cpu_to_le16(new_credits);
+
info->send_immediate = false;
packet->flags = 0;
@@ -861,11 +890,11 @@ static int smbd_create_header(struct smbd_connection *info,
packet->flags |= cpu_to_le16(SMB_DIRECT_RESPONSE_REQUESTED);
packet->reserved = 0;
- if (!size)
+ if (!data_length)
packet->data_offset = 0;
else
packet->data_offset = cpu_to_le32(24);
- packet->data_length = cpu_to_le32(size);
+ packet->data_length = cpu_to_le32(data_length);
packet->remaining_data_length = cpu_to_le32(remaining_data_length);
packet->padding = 0;
@@ -880,7 +909,7 @@ static int smbd_create_header(struct smbd_connection *info,
/* Map the packet to DMA */
header_length = sizeof(struct smbd_data_transfer);
/* If this is a packet without payload, don't send padding */
- if (!size)
+ if (!data_length)
header_length = offsetof(struct smbd_data_transfer, padding);
request->num_sge = 1;
@@ -889,102 +918,15 @@ static int smbd_create_header(struct smbd_connection *info,
header_length,
DMA_TO_DEVICE);
if (ib_dma_mapping_error(info->id->device, request->sge[0].addr)) {
- mempool_free(request, info->request_mempool);
rc = -EIO;
- goto err;
+ request->sge[0].addr = 0;
+ goto err_dma;
}
request->sge[0].length = header_length;
request->sge[0].lkey = info->pd->local_dma_lkey;
- *request_out = request;
- return 0;
-
-err:
- atomic_inc(&info->send_credits);
- return rc;
-}
-
-static void smbd_destroy_header(struct smbd_connection *info,
- struct smbd_request *request)
-{
-
- ib_dma_unmap_single(info->id->device,
- request->sge[0].addr,
- request->sge[0].length,
- DMA_TO_DEVICE);
- mempool_free(request, info->request_mempool);
- atomic_inc(&info->send_credits);
-}
-
-/* Post the send request */
-static int smbd_post_send(struct smbd_connection *info,
- struct smbd_request *request, bool has_payload)
-{
- struct ib_send_wr send_wr;
- int rc, i;
-
- for (i = 0; i < request->num_sge; i++) {
- log_rdma_send(INFO,
- "rdma_request sge[%d] addr=%llu length=%u\n",
- i, request->sge[i].addr, request->sge[i].length);
- ib_dma_sync_single_for_device(
- info->id->device,
- request->sge[i].addr,
- request->sge[i].length,
- DMA_TO_DEVICE);
- }
-
- request->cqe.done = send_done;
-
- send_wr.next = NULL;
- send_wr.wr_cqe = &request->cqe;
- send_wr.sg_list = request->sge;
- send_wr.num_sge = request->num_sge;
- send_wr.opcode = IB_WR_SEND;
- send_wr.send_flags = IB_SEND_SIGNALED;
-
- if (has_payload) {
- request->has_payload = true;
- atomic_inc(&info->send_payload_pending);
- } else {
- request->has_payload = false;
- atomic_inc(&info->send_pending);
- }
-
- rc = ib_post_send(info->id->qp, &send_wr, NULL);
- if (rc) {
- log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
- if (has_payload) {
- if (atomic_dec_and_test(&info->send_payload_pending))
- wake_up(&info->wait_send_payload_pending);
- } else {
- if (atomic_dec_and_test(&info->send_pending))
- wake_up(&info->wait_send_pending);
- }
- smbd_disconnect_rdma_connection(info);
- rc = -EAGAIN;
- } else
- /* Reset timer for idle connection after packet is sent */
- mod_delayed_work(info->workqueue, &info->idle_timer_work,
- info->keep_alive_interval*HZ);
-
- return rc;
-}
-
-static int smbd_post_send_sgl(struct smbd_connection *info,
- struct scatterlist *sgl, int data_length, int remaining_data_length)
-{
- int num_sgs;
- int i, rc;
- struct smbd_request *request;
- struct scatterlist *sg;
-
- rc = smbd_create_header(
- info, data_length, remaining_data_length, &request);
- if (rc)
- return rc;
-
+ /* Fill in the packet data payload */
num_sgs = sgl ? sg_nents(sgl) : 0;
for_each_sg(sgl, sg, num_sgs, i) {
request->sge[i+1].addr =
@@ -994,25 +936,41 @@ static int smbd_post_send_sgl(struct smbd_connection *info,
info->id->device, request->sge[i+1].addr)) {
rc = -EIO;
request->sge[i+1].addr = 0;
- goto dma_mapping_failure;
+ goto err_dma;
}
request->sge[i+1].length = sg->length;
request->sge[i+1].lkey = info->pd->local_dma_lkey;
request->num_sge++;
}
- rc = smbd_post_send(info, request, data_length);
+ rc = smbd_post_send(info, request);
if (!rc)
return 0;
-dma_mapping_failure:
- for (i = 1; i < request->num_sge; i++)
+err_dma:
+ for (i = 0; i < request->num_sge; i++)
if (request->sge[i].addr)
ib_dma_unmap_single(info->id->device,
request->sge[i].addr,
request->sge[i].length,
DMA_TO_DEVICE);
- smbd_destroy_header(info, request);
+ mempool_free(request, info->request_mempool);
+
+ /* roll back receive credits and credits to be offered */
+ spin_lock(&info->lock_new_credits_offered);
+ info->new_credits_offered += new_credits;
+ spin_unlock(&info->lock_new_credits_offered);
+ atomic_sub(new_credits, &info->receive_credits);
+
+err_alloc:
+ if (atomic_dec_and_test(&info->send_pending))
+ wake_up(&info->wait_send_pending);
+
+err_wait_send_queue:
+ /* roll back send credits and pending */
+ atomic_inc(&info->send_credits);
+
+err_wait_credit:
return rc;
}
@@ -1334,25 +1292,6 @@ static void destroy_receive_buffers(struct smbd_connection *info)
mempool_free(response, info->response_mempool);
}
-/*
- * Check and send an immediate or keep alive packet
- * The condition to send those packets are defined in [MS-SMBD] 3.1.1.1
- * Connection.KeepaliveRequested and Connection.SendImmediate
- * The idea is to extend credits to server as soon as it becomes available
- */
-static void send_immediate_work(struct work_struct *work)
-{
- struct smbd_connection *info = container_of(
- work, struct smbd_connection,
- send_immediate_work.work);
-
- if (info->keep_alive_requested == KEEP_ALIVE_PENDING ||
- info->send_immediate) {
- log_keep_alive(INFO, "send an empty message\n");
- smbd_post_send_empty(info);
- }
-}
-
/* Implement idle connection timer [MS-SMBD] 3.1.6.2 */
static void idle_connection_timer(struct work_struct *work)
{
@@ -1407,14 +1346,10 @@ void smbd_destroy(struct TCP_Server_Info *server)
log_rdma_event(INFO, "cancelling idle timer\n");
cancel_delayed_work_sync(&info->idle_timer_work);
- log_rdma_event(INFO, "cancelling send immediate work\n");
- cancel_delayed_work_sync(&info->send_immediate_work);
log_rdma_event(INFO, "wait for all send posted to IB to finish\n");
wait_event(info->wait_send_pending,
atomic_read(&info->send_pending) == 0);
- wait_event(info->wait_send_payload_pending,
- atomic_read(&info->send_payload_pending) == 0);
/* It's not posssible for upper layer to get to reassembly */
log_rdma_event(INFO, "drain the reassembly queue\n");
@@ -1744,15 +1679,13 @@ static struct smbd_connection *_smbd_get_connection(
init_waitqueue_head(&info->wait_send_queue);
INIT_DELAYED_WORK(&info->idle_timer_work, idle_connection_timer);
- INIT_DELAYED_WORK(&info->send_immediate_work, send_immediate_work);
queue_delayed_work(info->workqueue, &info->idle_timer_work,
info->keep_alive_interval*HZ);
init_waitqueue_head(&info->wait_send_pending);
atomic_set(&info->send_pending, 0);
- init_waitqueue_head(&info->wait_send_payload_pending);
- atomic_set(&info->send_payload_pending, 0);
+ init_waitqueue_head(&info->wait_post_send);
INIT_WORK(&info->disconnect_work, smbd_disconnect_rdma_work);
INIT_WORK(&info->post_send_credits_work, smbd_post_send_credits);
@@ -2226,8 +2159,8 @@ done:
* that means all the I/Os have been out and we are good to return
*/
- wait_event(info->wait_send_payload_pending,
- atomic_read(&info->send_payload_pending) == 0);
+ wait_event(info->wait_send_pending,
+ atomic_read(&info->send_pending) == 0);
return rc;
}
diff --git a/fs/cifs/smbdirect.h b/fs/cifs/smbdirect.h
index 8ede915f2b24..a87fca82a796 100644
--- a/fs/cifs/smbdirect.h
+++ b/fs/cifs/smbdirect.h
@@ -114,8 +114,7 @@ struct smbd_connection {
/* Activity accoutning */
atomic_t send_pending;
wait_queue_head_t wait_send_pending;
- atomic_t send_payload_pending;
- wait_queue_head_t wait_send_payload_pending;
+ wait_queue_head_t wait_post_send;
/* Receive queue */
struct list_head receive_queue;
@@ -154,7 +153,6 @@ struct smbd_connection {
struct workqueue_struct *workqueue;
struct delayed_work idle_timer_work;
- struct delayed_work send_immediate_work;
/* Memory pool for preallocating buffers */
/* request pool for RDMA send */
@@ -234,9 +232,6 @@ struct smbd_request {
struct smbd_connection *info;
struct ib_cqe cqe;
- /* true if this request carries upper layer payload */
- bool has_payload;
-
/* the SGE entries for this packet */
struct ib_sge sge[SMBDIRECT_MAX_SGE];
int num_sge;
diff --git a/fs/dax.c b/fs/dax.c
index 35da144375a0..11b16729b86f 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1038,50 +1038,43 @@ static vm_fault_t dax_load_hole(struct xa_state *xas,
return ret;
}
-static bool dax_range_is_aligned(struct block_device *bdev,
- unsigned int offset, unsigned int length)
+int dax_iomap_zero(loff_t pos, unsigned offset, unsigned size,
+ struct iomap *iomap)
{
- unsigned short sector_size = bdev_logical_block_size(bdev);
+ sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
+ pgoff_t pgoff;
+ long rc, id;
+ void *kaddr;
+ bool page_aligned = false;
- if (!IS_ALIGNED(offset, sector_size))
- return false;
- if (!IS_ALIGNED(length, sector_size))
- return false;
- return true;
-}
+ if (IS_ALIGNED(sector << SECTOR_SHIFT, PAGE_SIZE) &&
+ IS_ALIGNED(size, PAGE_SIZE))
+ page_aligned = true;
-int __dax_zero_page_range(struct block_device *bdev,
- struct dax_device *dax_dev, sector_t sector,
- unsigned int offset, unsigned int size)
-{
- if (dax_range_is_aligned(bdev, offset, size)) {
- sector_t start_sector = sector + (offset >> 9);
+ rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff);
+ if (rc)
+ return rc;
- return blkdev_issue_zeroout(bdev, start_sector,
- size >> 9, GFP_NOFS, 0);
- } else {
- pgoff_t pgoff;
- long rc, id;
- void *kaddr;
+ id = dax_read_lock();
- rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
- if (rc)
- return rc;
+ if (page_aligned)
+ rc = dax_zero_page_range(iomap->dax_dev, pgoff,
+ size >> PAGE_SHIFT);
+ else
+ rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL);
+ if (rc < 0) {
+ dax_read_unlock(id);
+ return rc;
+ }
- id = dax_read_lock();
- rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
- if (rc < 0) {
- dax_read_unlock(id);
- return rc;
- }
+ if (!page_aligned) {
memset(kaddr + offset, 0, size);
- dax_flush(dax_dev, kaddr + offset, size);
- dax_read_unlock(id);
+ dax_flush(iomap->dax_dev, kaddr + offset, size);
}
+ dax_read_unlock(id);
return 0;
}
-EXPORT_SYMBOL_GPL(__dax_zero_page_range);
static loff_t
dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index eee3c92a9ebf..8c596641a72b 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -218,13 +218,18 @@ struct eventpoll {
struct file *file;
/* used to optimize loop detection check */
- int visited;
struct list_head visited_list_link;
+ int visited;
#ifdef CONFIG_NET_RX_BUSY_POLL
/* used to track busy poll napi_id */
unsigned int napi_id;
#endif
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /* tracks wakeup nests for lockdep validation */
+ u8 nests;
+#endif
};
/* Wait structure used by the poll hooks */
@@ -545,30 +550,47 @@ out_unlock:
*/
#ifdef CONFIG_DEBUG_LOCK_ALLOC
-static DEFINE_PER_CPU(int, wakeup_nest);
-
-static void ep_poll_safewake(wait_queue_head_t *wq)
+static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
{
+ struct eventpoll *ep_src;
unsigned long flags;
- int subclass;
+ u8 nests = 0;
- local_irq_save(flags);
- preempt_disable();
- subclass = __this_cpu_read(wakeup_nest);
- spin_lock_nested(&wq->lock, subclass + 1);
- __this_cpu_inc(wakeup_nest);
- wake_up_locked_poll(wq, POLLIN);
- __this_cpu_dec(wakeup_nest);
- spin_unlock(&wq->lock);
- local_irq_restore(flags);
- preempt_enable();
+ /*
+ * To set the subclass or nesting level for spin_lock_irqsave_nested()
+ * it might be natural to create a per-cpu nest count. However, since
+ * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can
+ * schedule() in the -rt kernel, the per-cpu variable are no longer
+ * protected. Thus, we are introducing a per eventpoll nest field.
+ * If we are not being call from ep_poll_callback(), epi is NULL and
+ * we are at the first level of nesting, 0. Otherwise, we are being
+ * called from ep_poll_callback() and if a previous wakeup source is
+ * not an epoll file itself, we are at depth 1 since the wakeup source
+ * is depth 0. If the wakeup source is a previous epoll file in the
+ * wakeup chain then we use its nests value and record ours as
+ * nests + 1. The previous epoll file nests value is stable since its
+ * already holding its own poll_wait.lock.
+ */
+ if (epi) {
+ if ((is_file_epoll(epi->ffd.file))) {
+ ep_src = epi->ffd.file->private_data;
+ nests = ep_src->nests;
+ } else {
+ nests = 1;
+ }
+ }
+ spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
+ ep->nests = nests + 1;
+ wake_up_locked_poll(&ep->poll_wait, EPOLLIN);
+ ep->nests = 0;
+ spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
}
#else
-static void ep_poll_safewake(wait_queue_head_t *wq)
+static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
{
- wake_up_poll(wq, EPOLLIN);
+ wake_up_poll(&ep->poll_wait, EPOLLIN);
}
#endif
@@ -789,7 +811,7 @@ static void ep_free(struct eventpoll *ep)
/* We need to release all tasks waiting for these file */
if (waitqueue_active(&ep->poll_wait))
- ep_poll_safewake(&ep->poll_wait);
+ ep_poll_safewake(ep, NULL);
/*
* We need to lock this because we could be hit by
@@ -1258,7 +1280,7 @@ out_unlock:
/* We have to call this outside the lock */
if (pwake)
- ep_poll_safewake(&ep->poll_wait);
+ ep_poll_safewake(ep, epi);
if (!(epi->event.events & EPOLLEXCLUSIVE))
ewake = 1;
@@ -1562,7 +1584,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
/* We have to call this outside the lock */
if (pwake)
- ep_poll_safewake(&ep->poll_wait);
+ ep_poll_safewake(ep, NULL);
return 0;
@@ -1666,7 +1688,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
/* We have to call this outside the lock */
if (pwake)
- ep_poll_safewake(&ep->poll_wait);
+ ep_poll_safewake(ep, NULL);
return 0;
}
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index f0faada30f30..bb68d21e1f8c 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -118,3 +118,12 @@ config F2FS_FS_LZ4
default y
help
Support LZ4 compress algorithm, if unsure, say Y.
+
+config F2FS_FS_ZSTD
+ bool "ZSTD compression support"
+ depends on F2FS_FS_COMPRESSION
+ select ZSTD_COMPRESS
+ select ZSTD_DECOMPRESS
+ default y
+ help
+ Support ZSTD compress algorithm, if unsure, say Y.
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 44e84ac5c941..852890b72d6a 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -50,9 +50,6 @@ repeat:
return page;
}
-/*
- * We guarantee no failure on the returned page.
- */
static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index,
bool is_meta)
{
@@ -206,7 +203,7 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
}
/*
- * Readahead CP/NAT/SIT/SSA pages
+ * Readahead CP/NAT/SIT/SSA/POR pages
*/
int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
int type, bool sync)
@@ -898,7 +895,7 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi)
return -ENOMEM;
/*
* Finding out valid cp block involves read both
- * sets( cp pack1 and cp pack 2)
+ * sets( cp pack 1 and cp pack 2)
*/
cp_start_blk_no = le32_to_cpu(fsb->cp_blkaddr);
cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version);
@@ -1250,20 +1247,20 @@ static void unblock_operations(struct f2fs_sb_info *sbi)
f2fs_unlock_all(sbi);
}
-void f2fs_wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
+void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type)
{
DEFINE_WAIT(wait);
for (;;) {
prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE);
- if (!get_pages(sbi, F2FS_WB_CP_DATA))
+ if (!get_pages(sbi, type))
break;
if (unlikely(f2fs_cp_error(sbi)))
break;
- io_schedule_timeout(5*HZ);
+ io_schedule_timeout(DEFAULT_IO_TIMEOUT);
}
finish_wait(&sbi->cp_wait, &wait);
}
@@ -1301,10 +1298,14 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
else
__clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
- if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) ||
- is_sbi_flag_set(sbi, SBI_IS_RESIZEFS))
+ if (is_sbi_flag_set(sbi, SBI_NEED_FSCK))
__set_ckpt_flags(ckpt, CP_FSCK_FLAG);
+ if (is_sbi_flag_set(sbi, SBI_IS_RESIZEFS))
+ __set_ckpt_flags(ckpt, CP_RESIZEFS_FLAG);
+ else
+ __clear_ckpt_flags(ckpt, CP_RESIZEFS_FLAG);
+
if (is_sbi_flag_set(sbi, SBI_CP_DISABLED))
__set_ckpt_flags(ckpt, CP_DISABLED_FLAG);
else
@@ -1384,13 +1385,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* Flush all the NAT/SIT pages */
f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
- f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_META) &&
- !f2fs_cp_error(sbi));
- /*
- * modify checkpoint
- * version number is already updated
- */
+ /* start to update checkpoint, cp ver is already updated previously */
ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi, true));
ckpt->free_segment_count = cpu_to_le32(free_segments(sbi));
for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) {
@@ -1493,11 +1489,11 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* Here, we have one bio having CP pack except cp pack 2 page */
f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
- f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_META) &&
- !f2fs_cp_error(sbi));
+ /* Wait for all dirty meta pages to be submitted for IO */
+ f2fs_wait_on_all_pages(sbi, F2FS_DIRTY_META);
/* wait for previous submitted meta pages writeback */
- f2fs_wait_on_all_pages_writeback(sbi);
+ f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA);
/* flush all device cache */
err = f2fs_flush_device_cache(sbi);
@@ -1506,7 +1502,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* barrier and flush checkpoint cp pack 2 page if it can */
commit_checkpoint(sbi, ckpt, start_blk);
- f2fs_wait_on_all_pages_writeback(sbi);
+ f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA);
/*
* invalidate intermediate page cache borrowed from meta inode which are
@@ -1543,9 +1539,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
return unlikely(f2fs_cp_error(sbi)) ? -EIO : 0;
}
-/*
- * We guarantee that this checkpoint procedure will not fail.
- */
int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
@@ -1613,7 +1606,6 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
f2fs_flush_sit_entries(sbi, cpc);
- /* unlock all the fs_lock[] in do_checkpoint() */
err = do_checkpoint(sbi, cpc);
if (err)
f2fs_release_discard_addrs(sbi);
@@ -1626,7 +1618,7 @@ stop:
if (cpc->reason & CP_RECOVERY)
f2fs_notice(sbi, "checkpoint: version = %llx", ckpt_ver);
- /* do checkpoint periodically */
+ /* update CP_TIME to trigger checkpoint periodically */
f2fs_update_time(sbi, CP_TIME);
trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
out:
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index d8a64be90a50..df7b2d15eacd 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -11,6 +11,7 @@
#include <linux/backing-dev.h>
#include <linux/lzo.h>
#include <linux/lz4.h>
+#include <linux/zstd.h>
#include "f2fs.h"
#include "node.h"
@@ -20,6 +21,8 @@ struct f2fs_compress_ops {
int (*init_compress_ctx)(struct compress_ctx *cc);
void (*destroy_compress_ctx)(struct compress_ctx *cc);
int (*compress_pages)(struct compress_ctx *cc);
+ int (*init_decompress_ctx)(struct decompress_io_ctx *dic);
+ void (*destroy_decompress_ctx)(struct decompress_io_ctx *dic);
int (*decompress_pages)(struct decompress_io_ctx *dic);
};
@@ -52,7 +55,7 @@ bool f2fs_is_compressed_page(struct page *page)
}
static void f2fs_set_compressed_page(struct page *page,
- struct inode *inode, pgoff_t index, void *data, refcount_t *r)
+ struct inode *inode, pgoff_t index, void *data)
{
SetPagePrivate(page);
set_page_private(page, (unsigned long)data);
@@ -60,8 +63,6 @@ static void f2fs_set_compressed_page(struct page *page,
/* i_crypto_info and iv index */
page->index = index;
page->mapping = inode->i_mapping;
- if (r)
- refcount_inc(r);
}
static void f2fs_put_compressed_page(struct page *page)
@@ -291,6 +292,165 @@ static const struct f2fs_compress_ops f2fs_lz4_ops = {
};
#endif
+#ifdef CONFIG_F2FS_FS_ZSTD
+#define F2FS_ZSTD_DEFAULT_CLEVEL 1
+
+static int zstd_init_compress_ctx(struct compress_ctx *cc)
+{
+ ZSTD_parameters params;
+ ZSTD_CStream *stream;
+ void *workspace;
+ unsigned int workspace_size;
+
+ params = ZSTD_getParams(F2FS_ZSTD_DEFAULT_CLEVEL, cc->rlen, 0);
+ workspace_size = ZSTD_CStreamWorkspaceBound(params.cParams);
+
+ workspace = f2fs_kvmalloc(F2FS_I_SB(cc->inode),
+ workspace_size, GFP_NOFS);
+ if (!workspace)
+ return -ENOMEM;
+
+ stream = ZSTD_initCStream(params, 0, workspace, workspace_size);
+ if (!stream) {
+ printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_initCStream failed\n",
+ KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id,
+ __func__);
+ kvfree(workspace);
+ return -EIO;
+ }
+
+ cc->private = workspace;
+ cc->private2 = stream;
+
+ cc->clen = cc->rlen - PAGE_SIZE - COMPRESS_HEADER_SIZE;
+ return 0;
+}
+
+static void zstd_destroy_compress_ctx(struct compress_ctx *cc)
+{
+ kvfree(cc->private);
+ cc->private = NULL;
+ cc->private2 = NULL;
+}
+
+static int zstd_compress_pages(struct compress_ctx *cc)
+{
+ ZSTD_CStream *stream = cc->private2;
+ ZSTD_inBuffer inbuf;
+ ZSTD_outBuffer outbuf;
+ int src_size = cc->rlen;
+ int dst_size = src_size - PAGE_SIZE - COMPRESS_HEADER_SIZE;
+ int ret;
+
+ inbuf.pos = 0;
+ inbuf.src = cc->rbuf;
+ inbuf.size = src_size;
+
+ outbuf.pos = 0;
+ outbuf.dst = cc->cbuf->cdata;
+ outbuf.size = dst_size;
+
+ ret = ZSTD_compressStream(stream, &outbuf, &inbuf);
+ if (ZSTD_isError(ret)) {
+ printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_compressStream failed, ret: %d\n",
+ KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id,
+ __func__, ZSTD_getErrorCode(ret));
+ return -EIO;
+ }
+
+ ret = ZSTD_endStream(stream, &outbuf);
+ if (ZSTD_isError(ret)) {
+ printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_endStream returned %d\n",
+ KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id,
+ __func__, ZSTD_getErrorCode(ret));
+ return -EIO;
+ }
+
+ cc->clen = outbuf.pos;
+ return 0;
+}
+
+static int zstd_init_decompress_ctx(struct decompress_io_ctx *dic)
+{
+ ZSTD_DStream *stream;
+ void *workspace;
+ unsigned int workspace_size;
+
+ workspace_size = ZSTD_DStreamWorkspaceBound(MAX_COMPRESS_WINDOW_SIZE);
+
+ workspace = f2fs_kvmalloc(F2FS_I_SB(dic->inode),
+ workspace_size, GFP_NOFS);
+ if (!workspace)
+ return -ENOMEM;
+
+ stream = ZSTD_initDStream(MAX_COMPRESS_WINDOW_SIZE,
+ workspace, workspace_size);
+ if (!stream) {
+ printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_initDStream failed\n",
+ KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id,
+ __func__);
+ kvfree(workspace);
+ return -EIO;
+ }
+
+ dic->private = workspace;
+ dic->private2 = stream;
+
+ return 0;
+}
+
+static void zstd_destroy_decompress_ctx(struct decompress_io_ctx *dic)
+{
+ kvfree(dic->private);
+ dic->private = NULL;
+ dic->private2 = NULL;
+}
+
+static int zstd_decompress_pages(struct decompress_io_ctx *dic)
+{
+ ZSTD_DStream *stream = dic->private2;
+ ZSTD_inBuffer inbuf;
+ ZSTD_outBuffer outbuf;
+ int ret;
+
+ inbuf.pos = 0;
+ inbuf.src = dic->cbuf->cdata;
+ inbuf.size = dic->clen;
+
+ outbuf.pos = 0;
+ outbuf.dst = dic->rbuf;
+ outbuf.size = dic->rlen;
+
+ ret = ZSTD_decompressStream(stream, &outbuf, &inbuf);
+ if (ZSTD_isError(ret)) {
+ printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_compressStream failed, ret: %d\n",
+ KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id,
+ __func__, ZSTD_getErrorCode(ret));
+ return -EIO;
+ }
+
+ if (dic->rlen != outbuf.pos) {
+ printk_ratelimited("%sF2FS-fs (%s): %s ZSTD invalid rlen:%zu, "
+ "expected:%lu\n", KERN_ERR,
+ F2FS_I_SB(dic->inode)->sb->s_id,
+ __func__, dic->rlen,
+ PAGE_SIZE << dic->log_cluster_size);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static const struct f2fs_compress_ops f2fs_zstd_ops = {
+ .init_compress_ctx = zstd_init_compress_ctx,
+ .destroy_compress_ctx = zstd_destroy_compress_ctx,
+ .compress_pages = zstd_compress_pages,
+ .init_decompress_ctx = zstd_init_decompress_ctx,
+ .destroy_decompress_ctx = zstd_destroy_decompress_ctx,
+ .decompress_pages = zstd_decompress_pages,
+};
+#endif
+
static const struct f2fs_compress_ops *f2fs_cops[COMPRESS_MAX] = {
#ifdef CONFIG_F2FS_FS_LZO
&f2fs_lzo_ops,
@@ -302,6 +462,11 @@ static const struct f2fs_compress_ops *f2fs_cops[COMPRESS_MAX] = {
#else
NULL,
#endif
+#ifdef CONFIG_F2FS_FS_ZSTD
+ &f2fs_zstd_ops,
+#else
+ NULL,
+#endif
};
bool f2fs_is_compress_backend_ready(struct inode *inode)
@@ -334,9 +499,11 @@ static int f2fs_compress_pages(struct compress_ctx *cc)
trace_f2fs_compress_pages_start(cc->inode, cc->cluster_idx,
cc->cluster_size, fi->i_compress_algorithm);
- ret = cops->init_compress_ctx(cc);
- if (ret)
- goto out;
+ if (cops->init_compress_ctx) {
+ ret = cops->init_compress_ctx(cc);
+ if (ret)
+ goto out;
+ }
max_len = COMPRESS_HEADER_SIZE + cc->clen;
cc->nr_cpages = DIV_ROUND_UP(max_len, PAGE_SIZE);
@@ -380,21 +547,27 @@ static int f2fs_compress_pages(struct compress_ctx *cc)
}
cc->cbuf->clen = cpu_to_le32(cc->clen);
- cc->cbuf->chksum = cpu_to_le32(0);
for (i = 0; i < COMPRESS_DATA_RESERVED_SIZE; i++)
cc->cbuf->reserved[i] = cpu_to_le32(0);
+ nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE);
+
+ /* zero out any unused part of the last page */
+ memset(&cc->cbuf->cdata[cc->clen], 0,
+ (nr_cpages * PAGE_SIZE) - (cc->clen + COMPRESS_HEADER_SIZE));
+
vunmap(cc->cbuf);
vunmap(cc->rbuf);
- nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE);
-
for (i = nr_cpages; i < cc->nr_cpages; i++) {
f2fs_put_compressed_page(cc->cpages[i]);
cc->cpages[i] = NULL;
}
+ if (cops->destroy_compress_ctx)
+ cops->destroy_compress_ctx(cc);
+
cc->nr_cpages = nr_cpages;
trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx,
@@ -413,7 +586,8 @@ out_free_cpages:
kfree(cc->cpages);
cc->cpages = NULL;
destroy_compress_ctx:
- cops->destroy_compress_ctx(cc);
+ if (cops->destroy_compress_ctx)
+ cops->destroy_compress_ctx(cc);
out:
trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx,
cc->clen, ret);
@@ -447,10 +621,16 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity)
goto out_free_dic;
}
+ if (cops->init_decompress_ctx) {
+ ret = cops->init_decompress_ctx(dic);
+ if (ret)
+ goto out_free_dic;
+ }
+
dic->rbuf = vmap(dic->tpages, dic->cluster_size, VM_MAP, PAGE_KERNEL);
if (!dic->rbuf) {
ret = -ENOMEM;
- goto out_free_dic;
+ goto destroy_decompress_ctx;
}
dic->cbuf = vmap(dic->cpages, dic->nr_cpages, VM_MAP, PAGE_KERNEL_RO);
@@ -473,7 +653,12 @@ out_vunmap_cbuf:
vunmap(dic->cbuf);
out_vunmap_rbuf:
vunmap(dic->rbuf);
+destroy_decompress_ctx:
+ if (cops->destroy_decompress_ctx)
+ cops->destroy_decompress_ctx(dic);
out_free_dic:
+ if (verity)
+ refcount_set(&dic->ref, dic->nr_cpages);
if (!verity)
f2fs_decompress_end_io(dic->rpages, dic->cluster_size,
ret, false);
@@ -532,8 +717,7 @@ static bool __cluster_may_compress(struct compress_ctx *cc)
return true;
}
-/* return # of compressed block addresses */
-static int f2fs_compressed_blocks(struct compress_ctx *cc)
+static int __f2fs_cluster_blocks(struct compress_ctx *cc, bool compr)
{
struct dnode_of_data dn;
int ret;
@@ -554,10 +738,15 @@ static int f2fs_compressed_blocks(struct compress_ctx *cc)
for (i = 1; i < cc->cluster_size; i++) {
block_t blkaddr;
- blkaddr = datablock_addr(dn.inode,
+ blkaddr = data_blkaddr(dn.inode,
dn.node_page, dn.ofs_in_node + i);
- if (blkaddr != NULL_ADDR)
- ret++;
+ if (compr) {
+ if (__is_valid_data_blkaddr(blkaddr))
+ ret++;
+ } else {
+ if (blkaddr != NULL_ADDR)
+ ret++;
+ }
}
}
fail:
@@ -565,6 +754,18 @@ fail:
return ret;
}
+/* return # of compressed blocks in compressed cluster */
+static int f2fs_compressed_blocks(struct compress_ctx *cc)
+{
+ return __f2fs_cluster_blocks(cc, true);
+}
+
+/* return # of valid blocks in compressed cluster */
+static int f2fs_cluster_blocks(struct compress_ctx *cc, bool compr)
+{
+ return __f2fs_cluster_blocks(cc, false);
+}
+
int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index)
{
struct compress_ctx cc = {
@@ -574,7 +775,7 @@ int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index)
.cluster_idx = index >> F2FS_I(inode)->i_log_cluster_size,
};
- return f2fs_compressed_blocks(&cc);
+ return f2fs_cluster_blocks(&cc, false);
}
static bool cluster_may_compress(struct compress_ctx *cc)
@@ -623,7 +824,7 @@ static int prepare_compress_overwrite(struct compress_ctx *cc,
bool prealloc;
retry:
- ret = f2fs_compressed_blocks(cc);
+ ret = f2fs_cluster_blocks(cc, false);
if (ret <= 0)
return ret;
@@ -653,7 +854,7 @@ retry:
struct bio *bio = NULL;
ret = f2fs_read_multi_pages(cc, &bio, cc->cluster_size,
- &last_block_in_bio, false);
+ &last_block_in_bio, false, true);
f2fs_destroy_compress_ctx(cc);
if (ret)
goto release_pages;
@@ -772,7 +973,6 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
.encrypted_page = NULL,
.compressed_page = NULL,
.submitted = false,
- .need_lock = LOCK_RETRY,
.io_type = io_type,
.io_wbc = wbc,
.encrypted = f2fs_encrypted_file(cc->inode),
@@ -785,16 +985,17 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
loff_t psize;
int i, err;
- set_new_dnode(&dn, cc->inode, NULL, NULL, 0);
+ if (!f2fs_trylock_op(sbi))
+ return -EAGAIN;
- f2fs_lock_op(sbi);
+ set_new_dnode(&dn, cc->inode, NULL, NULL, 0);
err = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE);
if (err)
goto out_unlock_op;
for (i = 0; i < cc->cluster_size; i++) {
- if (datablock_addr(dn.inode, dn.node_page,
+ if (data_blkaddr(dn.inode, dn.node_page,
dn.ofs_in_node + i) == NULL_ADDR)
goto out_put_dnode;
}
@@ -813,7 +1014,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
cic->magic = F2FS_COMPRESSED_PAGE_MAGIC;
cic->inode = inode;
- refcount_set(&cic->ref, 1);
+ refcount_set(&cic->ref, cc->nr_cpages);
cic->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) <<
cc->log_cluster_size, GFP_NOFS);
if (!cic->rpages)
@@ -823,8 +1024,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
for (i = 0; i < cc->nr_cpages; i++) {
f2fs_set_compressed_page(cc->cpages[i], inode,
- cc->rpages[i + 1]->index,
- cic, i ? &cic->ref : NULL);
+ cc->rpages[i + 1]->index, cic);
fio.compressed_page = cc->cpages[i];
if (fio.encrypted) {
fio.page = cc->rpages[i + 1];
@@ -843,9 +1043,8 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
for (i = 0; i < cc->cluster_size; i++, dn.ofs_in_node++) {
block_t blkaddr;
- blkaddr = datablock_addr(dn.inode, dn.node_page,
- dn.ofs_in_node);
- fio.page = cic->rpages[i];
+ blkaddr = f2fs_data_blkaddr(&dn);
+ fio.page = cc->rpages[i];
fio.old_blkaddr = blkaddr;
/* cluster header */
@@ -895,10 +1094,10 @@ unlock_continue:
f2fs_put_dnode(&dn);
f2fs_unlock_op(sbi);
- down_write(&fi->i_sem);
+ spin_lock(&fi->i_size_lock);
if (fi->last_disk_size < psize)
fi->last_disk_size = psize;
- up_write(&fi->i_sem);
+ spin_unlock(&fi->i_size_lock);
f2fs_put_rpages(cc);
f2fs_destroy_compress_ctx(cc);
@@ -984,24 +1183,30 @@ retry_write:
unlock_page(cc->rpages[i]);
ret = 0;
} else if (ret == -EAGAIN) {
+ /*
+ * for quota file, just redirty left pages to
+ * avoid deadlock caused by cluster update race
+ * from foreground operation.
+ */
+ if (IS_NOQUOTA(cc->inode)) {
+ err = 0;
+ goto out_err;
+ }
ret = 0;
cond_resched();
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC,
+ DEFAULT_IO_TIMEOUT);
lock_page(cc->rpages[i]);
clear_page_dirty_for_io(cc->rpages[i]);
goto retry_write;
}
err = ret;
- goto out_fail;
+ goto out_err;
}
*submitted += _submitted;
}
return 0;
-
-out_fail:
- /* TODO: revoke partially updated block addresses */
- BUG_ON(compr_blocks);
out_err:
for (++i; i < cc->cluster_size; i++) {
if (!cc->rpages[i])
@@ -1069,7 +1274,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
dic->magic = F2FS_COMPRESSED_PAGE_MAGIC;
dic->inode = cc->inode;
- refcount_set(&dic->ref, 1);
+ refcount_set(&dic->ref, cc->nr_cpages);
dic->cluster_idx = cc->cluster_idx;
dic->cluster_size = cc->cluster_size;
dic->log_cluster_size = cc->log_cluster_size;
@@ -1093,8 +1298,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
goto out_free;
f2fs_set_compressed_page(page, cc->inode,
- start_idx + i + 1,
- dic, i ? &dic->ref : NULL);
+ start_idx + i + 1, dic);
dic->cpages[i] = page;
}
@@ -1104,20 +1308,16 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
goto out_free;
for (i = 0; i < dic->cluster_size; i++) {
- if (cc->rpages[i])
+ if (cc->rpages[i]) {
+ dic->tpages[i] = cc->rpages[i];
continue;
+ }
dic->tpages[i] = f2fs_grab_page();
if (!dic->tpages[i])
goto out_free;
}
- for (i = 0; i < dic->cluster_size; i++) {
- if (dic->tpages[i])
- continue;
- dic->tpages[i] = cc->rpages[i];
- }
-
return dic;
out_free:
@@ -1133,7 +1333,10 @@ void f2fs_free_dic(struct decompress_io_ctx *dic)
for (i = 0; i < dic->cluster_size; i++) {
if (dic->rpages[i])
continue;
- f2fs_put_page(dic->tpages[i], 1);
+ if (!dic->tpages[i])
+ continue;
+ unlock_page(dic->tpages[i]);
+ put_page(dic->tpages[i]);
}
kfree(dic->tpages);
}
@@ -1162,15 +1365,17 @@ void f2fs_decompress_end_io(struct page **rpages,
if (!rpage)
continue;
- if (err || PageError(rpage)) {
- ClearPageUptodate(rpage);
- ClearPageError(rpage);
- } else {
- if (!verity || fsverity_verify_page(rpage))
- SetPageUptodate(rpage);
- else
- SetPageError(rpage);
+ if (err || PageError(rpage))
+ goto clear_uptodate;
+
+ if (!verity || fsverity_verify_page(rpage)) {
+ SetPageUptodate(rpage);
+ goto unlock;
}
+clear_uptodate:
+ ClearPageUptodate(rpage);
+ ClearPageError(rpage);
+unlock:
unlock_page(rpage);
}
}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index b27b72107911..cdf2f626bea7 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -54,17 +54,13 @@ static inline struct bio *__f2fs_bio_alloc(gfp_t gfp_mask,
return bio_alloc_bioset(gfp_mask, nr_iovecs, &f2fs_bioset);
}
-struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool no_fail)
+struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool noio)
{
- struct bio *bio;
-
- if (no_fail) {
+ if (noio) {
/* No failure on bio allocation */
- bio = __f2fs_bio_alloc(GFP_NOIO, npages);
- if (!bio)
- bio = __f2fs_bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages);
- return bio;
+ return __f2fs_bio_alloc(GFP_NOIO, npages);
}
+
if (time_to_inject(sbi, FAULT_ALLOC_BIO)) {
f2fs_show_injection_info(sbi, FAULT_ALLOC_BIO);
return NULL;
@@ -143,6 +139,8 @@ static void __read_end_io(struct bio *bio, bool compr, bool verity)
f2fs_decompress_pages(bio, page, verity);
continue;
}
+ if (verity)
+ continue;
#endif
/* PG_error was set if any post_read step failed */
@@ -191,12 +189,38 @@ static void f2fs_verify_pages(struct page **rpages, unsigned int cluster_size)
static void f2fs_verify_bio(struct bio *bio)
{
- struct page *page = bio_first_page_all(bio);
- struct decompress_io_ctx *dic =
- (struct decompress_io_ctx *)page_private(page);
+ struct bio_vec *bv;
+ struct bvec_iter_all iter_all;
- f2fs_verify_pages(dic->rpages, dic->cluster_size);
- f2fs_free_dic(dic);
+ bio_for_each_segment_all(bv, bio, iter_all) {
+ struct page *page = bv->bv_page;
+ struct decompress_io_ctx *dic;
+
+ dic = (struct decompress_io_ctx *)page_private(page);
+
+ if (dic) {
+ if (refcount_dec_not_one(&dic->ref))
+ continue;
+ f2fs_verify_pages(dic->rpages,
+ dic->cluster_size);
+ f2fs_free_dic(dic);
+ continue;
+ }
+
+ if (bio->bi_status || PageError(page))
+ goto clear_uptodate;
+
+ if (fsverity_verify_page(page)) {
+ SetPageUptodate(page);
+ goto unlock;
+ }
+clear_uptodate:
+ ClearPageUptodate(page);
+ ClearPageError(page);
+unlock:
+ dec_page_count(F2FS_P_SB(page), __read_io_type(page));
+ unlock_page(page);
+ }
}
#endif
@@ -364,9 +388,6 @@ static void f2fs_write_end_io(struct bio *bio)
bio_put(bio);
}
-/*
- * Return true, if pre_bio's bdev is same as its target device.
- */
struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi,
block_t blk_addr, struct bio *bio)
{
@@ -403,6 +424,9 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr)
return 0;
}
+/*
+ * Return true, if pre_bio's bdev is same as its target device.
+ */
static bool __same_bdev(struct f2fs_sb_info *sbi,
block_t blk_addr, struct bio *bio)
{
@@ -410,9 +434,6 @@ static bool __same_bdev(struct f2fs_sb_info *sbi,
return bio->bi_disk == b->bd_disk && bio->bi_partno == b->bd_partno;
}
-/*
- * Low-level block read/write IO operations.
- */
static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages)
{
struct f2fs_sb_info *sbi = fio->sbi;
@@ -445,7 +466,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi,
if (type != DATA && type != NODE)
goto submit_io;
- if (test_opt(sbi, LFS) && current->plug)
+ if (f2fs_lfs_mode(sbi) && current->plug)
blk_finish_plug(current->plug);
if (F2FS_IO_ALIGNED(sbi))
@@ -928,14 +949,15 @@ static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx)
static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
unsigned nr_pages, unsigned op_flag,
- pgoff_t first_idx)
+ pgoff_t first_idx, bool for_write)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct bio *bio;
struct bio_post_read_ctx *ctx;
unsigned int post_read_steps = 0;
- bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES), false);
+ bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES),
+ for_write);
if (!bio)
return ERR_PTR(-ENOMEM);
f2fs_target_device(sbi, blkaddr, bio);
@@ -970,12 +992,12 @@ static void f2fs_release_read_bio(struct bio *bio)
/* This can handle encryption stuffs */
static int f2fs_submit_page_read(struct inode *inode, struct page *page,
- block_t blkaddr)
+ block_t blkaddr, bool for_write)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct bio *bio;
- bio = f2fs_grab_read_bio(inode, blkaddr, 1, 0, page->index);
+ bio = f2fs_grab_read_bio(inode, blkaddr, 1, 0, page->index, for_write);
if (IS_ERR(bio))
return PTR_ERR(bio);
@@ -1047,8 +1069,7 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count)
f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true);
for (; count > 0; dn->ofs_in_node++) {
- block_t blkaddr = datablock_addr(dn->inode,
- dn->node_page, dn->ofs_in_node);
+ block_t blkaddr = f2fs_data_blkaddr(dn);
if (blkaddr == NULL_ADDR) {
dn->data_blkaddr = NEW_ADDR;
__set_data_blkaddr(dn);
@@ -1162,7 +1183,7 @@ got_it:
return page;
}
- err = f2fs_submit_page_read(inode, page, dn.data_blkaddr);
+ err = f2fs_submit_page_read(inode, page, dn.data_blkaddr, for_write);
if (err)
goto put_err;
return page;
@@ -1300,8 +1321,7 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type)
if (err)
return err;
- dn->data_blkaddr = datablock_addr(dn->inode,
- dn->node_page, dn->ofs_in_node);
+ dn->data_blkaddr = f2fs_data_blkaddr(dn);
if (dn->data_blkaddr != NULL_ADDR)
goto alloc;
@@ -1388,13 +1408,9 @@ void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock)
}
/*
- * f2fs_map_blocks() now supported readahead/bmap/rw direct_IO with
- * f2fs_map_blocks structure.
- * If original data blocks are allocated, then give them to blockdev.
- * Otherwise,
- * a. preallocate requested block addresses
- * b. do not use extent cache for better performance
- * c. give the block addresses to blockdev
+ * f2fs_map_blocks() tries to find or build mapping relationship which
+ * maps continuous logical blocks to physical blocks, and return such
+ * info via f2fs_map_blocks structure.
*/
int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
int create, int flag)
@@ -1422,7 +1438,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
end = pgofs + maxblocks;
if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) {
- if (test_opt(sbi, LFS) && flag == F2FS_GET_BLOCK_DIO &&
+ if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO &&
map->m_may_create)
goto next_dnode;
@@ -1467,7 +1483,7 @@ next_dnode:
end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
next_block:
- blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node);
+ blkaddr = f2fs_data_blkaddr(&dn);
if (__is_valid_data_blkaddr(blkaddr) &&
!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) {
@@ -1477,7 +1493,7 @@ next_block:
if (__is_valid_data_blkaddr(blkaddr)) {
/* use out-place-update for driect IO under LFS mode */
- if (test_opt(sbi, LFS) && flag == F2FS_GET_BLOCK_DIO &&
+ if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO &&
map->m_may_create) {
err = __allocate_data_block(&dn, map->m_seg_type);
if (err)
@@ -1980,7 +1996,8 @@ submit_and_realloc:
}
if (bio == NULL) {
bio = f2fs_grab_read_bio(inode, block_nr, nr_pages,
- is_readahead ? REQ_RAHEAD : 0, page->index);
+ is_readahead ? REQ_RAHEAD : 0, page->index,
+ false);
if (IS_ERR(bio)) {
ret = PTR_ERR(bio);
bio = NULL;
@@ -2015,7 +2032,7 @@ out:
#ifdef CONFIG_F2FS_FS_COMPRESSION
int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
unsigned nr_pages, sector_t *last_block_in_bio,
- bool is_readahead)
+ bool is_readahead, bool for_write)
{
struct dnode_of_data dn;
struct inode *inode = cc->inode;
@@ -2031,7 +2048,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
f2fs_bug_on(sbi, f2fs_cluster_is_empty(cc));
- last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
+ last_block_in_file = (f2fs_readpage_limit(inode) +
+ blocksize - 1) >> blkbits;
/* get rid of pages beyond EOF */
for (i = 0; i < cc->cluster_size; i++) {
@@ -2067,7 +2085,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
for (i = 1; i < cc->cluster_size; i++) {
block_t blkaddr;
- blkaddr = datablock_addr(dn.inode, dn.node_page,
+ blkaddr = data_blkaddr(dn.inode, dn.node_page,
dn.ofs_in_node + i);
if (!__is_valid_data_blkaddr(blkaddr))
@@ -2096,7 +2114,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
struct page *page = dic->cpages[i];
block_t blkaddr;
- blkaddr = datablock_addr(dn.inode, dn.node_page,
+ blkaddr = data_blkaddr(dn.inode, dn.node_page,
dn.ofs_in_node + i + 1);
if (bio && !page_is_mergeable(sbi, bio,
@@ -2109,7 +2127,7 @@ submit_and_realloc:
if (!bio) {
bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages,
is_readahead ? REQ_RAHEAD : 0,
- page->index);
+ page->index, for_write);
if (IS_ERR(bio)) {
ret = PTR_ERR(bio);
bio = NULL;
@@ -2210,7 +2228,7 @@ int f2fs_mpage_readpages(struct address_space *mapping,
ret = f2fs_read_multi_pages(&cc, &bio,
max_nr_pages,
&last_block_in_bio,
- is_readahead);
+ is_readahead, false);
f2fs_destroy_compress_ctx(&cc);
if (ret)
goto set_error_page;
@@ -2253,7 +2271,7 @@ next_page:
ret = f2fs_read_multi_pages(&cc, &bio,
max_nr_pages,
&last_block_in_bio,
- is_readahead);
+ is_readahead, false);
f2fs_destroy_compress_ctx(&cc);
}
}
@@ -2326,7 +2344,7 @@ retry_encrypt:
/* flush pending IOs and wait for a while in the ENOMEM case */
if (PTR_ERR(fio->encrypted_page) == -ENOMEM) {
f2fs_flush_merged_writes(fio->sbi);
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
gfp_flags |= __GFP_NOFAIL;
goto retry_encrypt;
}
@@ -2397,7 +2415,7 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- if (test_opt(sbi, LFS))
+ if (f2fs_lfs_mode(sbi))
return true;
if (S_ISDIR(inode->i_mode))
return true;
@@ -2647,10 +2665,10 @@ write:
if (err) {
file_set_keep_isize(inode);
} else {
- down_write(&F2FS_I(inode)->i_sem);
+ spin_lock(&F2FS_I(inode)->i_size_lock);
if (F2FS_I(inode)->last_disk_size < psize)
F2FS_I(inode)->last_disk_size = psize;
- up_write(&F2FS_I(inode)->i_sem);
+ spin_unlock(&F2FS_I(inode)->i_size_lock);
}
done:
@@ -2917,7 +2935,7 @@ result:
if (wbc->sync_mode == WB_SYNC_ALL) {
cond_resched();
congestion_wait(BLK_RW_ASYNC,
- HZ/50);
+ DEFAULT_IO_TIMEOUT);
goto retry_write;
}
goto next;
@@ -2973,15 +2991,17 @@ next:
static inline bool __should_serialize_io(struct inode *inode,
struct writeback_control *wbc)
{
+ /* to avoid deadlock in path of data flush */
+ if (F2FS_I(inode)->cp_task)
+ return false;
+
if (!S_ISREG(inode->i_mode))
return false;
- if (f2fs_compressed_file(inode))
- return true;
if (IS_NOQUOTA(inode))
return false;
- /* to avoid deadlock in path of data flush */
- if (F2FS_I(inode)->cp_task)
- return false;
+
+ if (f2fs_compressed_file(inode))
+ return true;
if (wbc->sync_mode != WB_SYNC_ALL)
return true;
if (get_dirty_pages(inode) >= SM_I(F2FS_I_SB(inode))->min_seq_blocks)
@@ -3283,7 +3303,7 @@ repeat:
err = -EFSCORRUPTED;
goto fail;
}
- err = f2fs_submit_page_read(inode, page, blkaddr);
+ err = f2fs_submit_page_read(inode, page, blkaddr, true);
if (err)
goto fail;
@@ -3464,7 +3484,8 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
err = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
iter, rw == WRITE ? get_data_block_dio_write :
get_data_block_dio, NULL, f2fs_dio_submit_bio,
- DIO_LOCKING | DIO_SKIP_HOLES);
+ rw == WRITE ? DIO_LOCKING | DIO_SKIP_HOLES :
+ DIO_SKIP_HOLES);
if (do_opu)
up_read(&fi->i_gc_rwsem[READ]);
@@ -3861,7 +3882,7 @@ void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi)
int __init f2fs_init_bio_entry_cache(void)
{
- bio_entry_slab = f2fs_kmem_cache_create("bio_entry_slab",
+ bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab",
sizeof(struct bio_entry));
if (!bio_entry_slab)
return -ENOMEM;
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 6b89eae5e4ca..0dbcb0f9c019 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -301,6 +301,9 @@ static int stat_show(struct seq_file *s, void *v)
si->ssa_area_segs, si->main_area_segs);
seq_printf(s, "(OverProv:%d Resv:%d)]\n\n",
si->overp_segs, si->rsvd_segs);
+ seq_printf(s, "Current Time Sec: %llu / Mounted Time Sec: %llu\n\n",
+ ktime_get_boottime_seconds(),
+ SIT_I(si->sbi)->mounted_time);
if (test_opt(si->sbi, DISCARD))
seq_printf(s, "Utilization: %u%% (%u valid blocks, %u discard blocks)\n",
si->utilization, si->valid_count, si->discard_blks);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 27d0dd7a16d6..44bfc464df78 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -471,7 +471,6 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir,
struct page *dpage)
{
struct page *page;
- int dummy_encrypt = DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(dir));
int err;
if (is_inode_flag_set(inode, FI_NEW_INODE)) {
@@ -498,8 +497,7 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir,
if (err)
goto put_error;
- if ((IS_ENCRYPTED(dir) || dummy_encrypt) &&
- f2fs_may_encrypt(inode)) {
+ if (IS_ENCRYPTED(inode)) {
err = fscrypt_inherit_context(dir, inode, page, false);
if (err)
goto put_error;
@@ -850,12 +848,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
0);
set_page_dirty(page);
- dir->i_ctime = dir->i_mtime = current_time(dir);
- f2fs_mark_inode_dirty_sync(dir, false);
-
- if (inode)
- f2fs_drop_nlink(dir, inode);
-
if (bit_pos == NR_DENTRY_IN_BLOCK &&
!f2fs_truncate_hole(dir, page->index, page->index + 1)) {
f2fs_clear_page_cache_dirty_tag(page);
@@ -867,6 +859,12 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
f2fs_remove_dirty_inode(dir);
}
f2fs_put_page(page, 1);
+
+ dir->i_ctime = dir->i_mtime = current_time(dir);
+ f2fs_mark_inode_dirty_sync(dir, false);
+
+ if (inode)
+ f2fs_drop_nlink(dir, inode);
}
bool f2fs_empty_dir(struct inode *dir)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 088c3e7a1080..ba470d5687fe 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -75,7 +75,6 @@ extern const char *f2fs_fault_name[FAULT_MAX];
/*
* For mount options
*/
-#define F2FS_MOUNT_BG_GC 0x00000001
#define F2FS_MOUNT_DISABLE_ROLL_FORWARD 0x00000002
#define F2FS_MOUNT_DISCARD 0x00000004
#define F2FS_MOUNT_NOHEAP 0x00000008
@@ -89,11 +88,8 @@ extern const char *f2fs_fault_name[FAULT_MAX];
#define F2FS_MOUNT_NOBARRIER 0x00000800
#define F2FS_MOUNT_FASTBOOT 0x00001000
#define F2FS_MOUNT_EXTENT_CACHE 0x00002000
-#define F2FS_MOUNT_FORCE_FG_GC 0x00004000
#define F2FS_MOUNT_DATA_FLUSH 0x00008000
#define F2FS_MOUNT_FAULT_INJECTION 0x00010000
-#define F2FS_MOUNT_ADAPTIVE 0x00020000
-#define F2FS_MOUNT_LFS 0x00040000
#define F2FS_MOUNT_USRQUOTA 0x00080000
#define F2FS_MOUNT_GRPQUOTA 0x00100000
#define F2FS_MOUNT_PRJQUOTA 0x00200000
@@ -101,6 +97,7 @@ extern const char *f2fs_fault_name[FAULT_MAX];
#define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00800000
#define F2FS_MOUNT_RESERVE_ROOT 0x01000000
#define F2FS_MOUNT_DISABLE_CHECKPOINT 0x02000000
+#define F2FS_MOUNT_NORECOVERY 0x04000000
#define F2FS_OPTION(sbi) ((sbi)->mount_opt)
#define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option)
@@ -139,6 +136,8 @@ struct f2fs_mount_info {
int whint_mode;
int alloc_mode; /* segment allocation policy */
int fsync_mode; /* fsync policy */
+ int fs_mode; /* fs mode: LFS or ADAPTIVE */
+ int bggc_mode; /* bggc mode: off, on or sync */
bool test_dummy_encryption; /* test dummy encryption */
block_t unusable_cap; /* Amount of space allowed to be
* unusable when disabling checkpoint
@@ -332,8 +331,8 @@ struct discard_policy {
bool io_aware; /* issue discard in idle time */
bool sync; /* submit discard with REQ_SYNC flag */
bool ordered; /* issue discard by lba order */
+ bool timeout; /* discard timeout for put_super */
unsigned int granularity; /* discard granularity */
- int timeout; /* discard timeout for put_super */
};
struct discard_cmd_control {
@@ -428,6 +427,7 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal,
#define F2FS_IOC_GET_PIN_FILE _IOR(F2FS_IOCTL_MAGIC, 14, __u32)
#define F2FS_IOC_PRECACHE_EXTENTS _IO(F2FS_IOCTL_MAGIC, 15)
#define F2FS_IOC_RESIZE_FS _IOW(F2FS_IOCTL_MAGIC, 16, __u64)
+#define F2FS_IOC_GET_COMPRESS_BLOCKS _IOR(F2FS_IOCTL_MAGIC, 17, __u64)
#define F2FS_IOC_GET_VOLUME_NAME FS_IOC_GETFSLABEL
#define F2FS_IOC_SET_VOLUME_NAME FS_IOC_SETFSLABEL
@@ -560,6 +560,9 @@ enum {
#define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO count */
+/* congestion wait timeout value, default: 20ms */
+#define DEFAULT_IO_TIMEOUT (msecs_to_jiffies(20))
+
/* maximum retry quota flush count */
#define DEFAULT_RETRY_QUOTA_FLUSH_COUNT 8
@@ -676,6 +679,44 @@ enum {
MAX_GC_FAILURE
};
+/* used for f2fs_inode_info->flags */
+enum {
+ FI_NEW_INODE, /* indicate newly allocated inode */
+ FI_DIRTY_INODE, /* indicate inode is dirty or not */
+ FI_AUTO_RECOVER, /* indicate inode is recoverable */
+ FI_DIRTY_DIR, /* indicate directory has dirty pages */
+ FI_INC_LINK, /* need to increment i_nlink */
+ FI_ACL_MODE, /* indicate acl mode */
+ FI_NO_ALLOC, /* should not allocate any blocks */
+ FI_FREE_NID, /* free allocated nide */
+ FI_NO_EXTENT, /* not to use the extent cache */
+ FI_INLINE_XATTR, /* used for inline xattr */
+ FI_INLINE_DATA, /* used for inline data*/
+ FI_INLINE_DENTRY, /* used for inline dentry */
+ FI_APPEND_WRITE, /* inode has appended data */
+ FI_UPDATE_WRITE, /* inode has in-place-update data */
+ FI_NEED_IPU, /* used for ipu per file */
+ FI_ATOMIC_FILE, /* indicate atomic file */
+ FI_ATOMIC_COMMIT, /* indicate the state of atomical committing */
+ FI_VOLATILE_FILE, /* indicate volatile file */
+ FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */
+ FI_DROP_CACHE, /* drop dirty page cache */
+ FI_DATA_EXIST, /* indicate data exists */
+ FI_INLINE_DOTS, /* indicate inline dot dentries */
+ FI_DO_DEFRAG, /* indicate defragment is running */
+ FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */
+ FI_NO_PREALLOC, /* indicate skipped preallocated blocks */
+ FI_HOT_DATA, /* indicate file is hot */
+ FI_EXTRA_ATTR, /* indicate file has extra attribute */
+ FI_PROJ_INHERIT, /* indicate file inherits projectid */
+ FI_PIN_FILE, /* indicate file should not be gced */
+ FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */
+ FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */
+ FI_COMPRESSED_FILE, /* indicate file's data can be compressed */
+ FI_MMAP_FILE, /* indicate file was mmapped */
+ FI_MAX, /* max flag, never be used */
+};
+
struct f2fs_inode_info {
struct inode vfs_inode; /* serve a vfs inode */
unsigned long i_flags; /* keep an inode flags for ioctl */
@@ -688,7 +729,7 @@ struct f2fs_inode_info {
umode_t i_acl_mode; /* keep file acl mode temporarily */
/* Use below internally in f2fs*/
- unsigned long flags; /* use to pass per-file flags */
+ unsigned long flags[BITS_TO_LONGS(FI_MAX)]; /* use to pass per-file flags */
struct rw_semaphore i_sem; /* protect fi info */
atomic_t dirty_pages; /* # of dirty pages */
f2fs_hash_t chash; /* hash value of given file name */
@@ -697,6 +738,7 @@ struct f2fs_inode_info {
struct task_struct *cp_task; /* separate cp/wb IO stats*/
nid_t i_xattr_nid; /* node id that contains xattrs */
loff_t last_disk_size; /* lastly written file size */
+ spinlock_t i_size_lock; /* protect last_disk_size */
#ifdef CONFIG_QUOTA
struct dquot *i_dquot[MAXQUOTAS];
@@ -1173,6 +1215,20 @@ enum {
};
enum {
+ BGGC_MODE_ON, /* background gc is on */
+ BGGC_MODE_OFF, /* background gc is off */
+ BGGC_MODE_SYNC, /*
+ * background gc is on, migrating blocks
+ * like foreground gc
+ */
+};
+
+enum {
+ FS_MODE_ADAPTIVE, /* use both lfs/ssr allocation */
+ FS_MODE_LFS, /* use lfs allocation only */
+};
+
+enum {
WHINT_MODE_OFF, /* not pass down write hints */
WHINT_MODE_USER, /* try to pass down hints given by users */
WHINT_MODE_FS, /* pass down hints with F2FS policy */
@@ -1212,13 +1268,13 @@ enum fsync_mode {
enum compress_algorithm_type {
COMPRESS_LZO,
COMPRESS_LZ4,
+ COMPRESS_ZSTD,
COMPRESS_MAX,
};
-#define COMPRESS_DATA_RESERVED_SIZE 4
+#define COMPRESS_DATA_RESERVED_SIZE 5
struct compress_data {
__le32 clen; /* compressed data size */
- __le32 chksum; /* checksum of compressed data */
__le32 reserved[COMPRESS_DATA_RESERVED_SIZE]; /* reserved */
u8 cdata[]; /* compressed data */
};
@@ -1242,6 +1298,7 @@ struct compress_ctx {
size_t rlen; /* valid data length in rbuf */
size_t clen; /* valid data length in cbuf */
void *private; /* payload buffer for specified compression algorithm */
+ void *private2; /* extra payload buffer */
};
/* compress context for write IO path */
@@ -1271,11 +1328,14 @@ struct decompress_io_ctx {
size_t clen; /* valid data length in cbuf */
refcount_t ref; /* referrence count of compressed page */
bool failed; /* indicate IO error during decompression */
+ void *private; /* payload buffer for specified decompression algorithm */
+ void *private2; /* extra payload buffer */
};
#define NULL_CLUSTER ((unsigned int)(~0))
#define MIN_COMPRESS_LOG_SIZE 2
#define MAX_COMPRESS_LOG_SIZE 8
+#define MAX_COMPRESS_WINDOW_SIZE ((PAGE_SIZE) << MAX_COMPRESS_LOG_SIZE)
struct f2fs_sb_info {
struct super_block *sb; /* pointer to VFS super block */
@@ -1471,6 +1531,9 @@ struct f2fs_sb_info {
__u32 s_chksum_seed;
struct workqueue_struct *post_read_wq; /* post read workqueue */
+
+ struct kmem_cache *inline_xattr_slab; /* inline xattr entry */
+ unsigned int inline_xattr_slab_size; /* default inline xattr slab size */
};
struct f2fs_private_dio {
@@ -2211,7 +2274,7 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
dquot_free_inode(inode);
} else {
if (unlikely(inode->i_blocks == 0)) {
- f2fs_warn(sbi, "Inconsistent i_blocks, ino:%lu, iblocks:%llu",
+ f2fs_warn(sbi, "dec_valid_node_count: inconsistent i_blocks, ino:%lu, iblocks:%llu",
inode->i_ino,
(unsigned long long)inode->i_blocks);
set_sbi_flag(sbi, SBI_NEED_FSCK);
@@ -2379,7 +2442,7 @@ static inline __le32 *blkaddr_in_node(struct f2fs_node *node)
}
static inline int f2fs_has_extra_attr(struct inode *inode);
-static inline block_t datablock_addr(struct inode *inode,
+static inline block_t data_blkaddr(struct inode *inode,
struct page *node_page, unsigned int offset)
{
struct f2fs_node *raw_node;
@@ -2389,9 +2452,9 @@ static inline block_t datablock_addr(struct inode *inode,
raw_node = F2FS_NODE(node_page);
- /* from GC path only */
if (is_inode) {
if (!inode)
+ /* from GC path only */
base = offset_in_addr(&raw_node->i);
else if (f2fs_has_extra_attr(inode))
base = get_extra_isize(inode);
@@ -2401,6 +2464,11 @@ static inline block_t datablock_addr(struct inode *inode,
return le32_to_cpu(addr_array[base + offset]);
}
+static inline block_t f2fs_data_blkaddr(struct dnode_of_data *dn)
+{
+ return data_blkaddr(dn->inode, dn->node_page, dn->ofs_in_node);
+}
+
static inline int f2fs_test_bit(unsigned int nr, char *addr)
{
int mask;
@@ -2498,43 +2566,6 @@ static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags)
return flags & F2FS_OTHER_FLMASK;
}
-/* used for f2fs_inode_info->flags */
-enum {
- FI_NEW_INODE, /* indicate newly allocated inode */
- FI_DIRTY_INODE, /* indicate inode is dirty or not */
- FI_AUTO_RECOVER, /* indicate inode is recoverable */
- FI_DIRTY_DIR, /* indicate directory has dirty pages */
- FI_INC_LINK, /* need to increment i_nlink */
- FI_ACL_MODE, /* indicate acl mode */
- FI_NO_ALLOC, /* should not allocate any blocks */
- FI_FREE_NID, /* free allocated nide */
- FI_NO_EXTENT, /* not to use the extent cache */
- FI_INLINE_XATTR, /* used for inline xattr */
- FI_INLINE_DATA, /* used for inline data*/
- FI_INLINE_DENTRY, /* used for inline dentry */
- FI_APPEND_WRITE, /* inode has appended data */
- FI_UPDATE_WRITE, /* inode has in-place-update data */
- FI_NEED_IPU, /* used for ipu per file */
- FI_ATOMIC_FILE, /* indicate atomic file */
- FI_ATOMIC_COMMIT, /* indicate the state of atomical committing */
- FI_VOLATILE_FILE, /* indicate volatile file */
- FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */
- FI_DROP_CACHE, /* drop dirty page cache */
- FI_DATA_EXIST, /* indicate data exists */
- FI_INLINE_DOTS, /* indicate inline dot dentries */
- FI_DO_DEFRAG, /* indicate defragment is running */
- FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */
- FI_NO_PREALLOC, /* indicate skipped preallocated blocks */
- FI_HOT_DATA, /* indicate file is hot */
- FI_EXTRA_ATTR, /* indicate file has extra attribute */
- FI_PROJ_INHERIT, /* indicate file inherits projectid */
- FI_PIN_FILE, /* indicate file should not be gced */
- FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */
- FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */
- FI_COMPRESSED_FILE, /* indicate file's data can be compressed */
- FI_MMAP_FILE, /* indicate file was mmapped */
-};
-
static inline void __mark_inode_dirty_flag(struct inode *inode,
int flag, bool set)
{
@@ -2549,27 +2580,24 @@ static inline void __mark_inode_dirty_flag(struct inode *inode,
case FI_DATA_EXIST:
case FI_INLINE_DOTS:
case FI_PIN_FILE:
- case FI_COMPRESSED_FILE:
f2fs_mark_inode_dirty_sync(inode, true);
}
}
static inline void set_inode_flag(struct inode *inode, int flag)
{
- if (!test_bit(flag, &F2FS_I(inode)->flags))
- set_bit(flag, &F2FS_I(inode)->flags);
+ test_and_set_bit(flag, F2FS_I(inode)->flags);
__mark_inode_dirty_flag(inode, flag, true);
}
static inline int is_inode_flag_set(struct inode *inode, int flag)
{
- return test_bit(flag, &F2FS_I(inode)->flags);
+ return test_bit(flag, F2FS_I(inode)->flags);
}
static inline void clear_inode_flag(struct inode *inode, int flag)
{
- if (test_bit(flag, &F2FS_I(inode)->flags))
- clear_bit(flag, &F2FS_I(inode)->flags);
+ test_and_clear_bit(flag, F2FS_I(inode)->flags);
__mark_inode_dirty_flag(inode, flag, false);
}
@@ -2660,19 +2688,19 @@ static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri)
struct f2fs_inode_info *fi = F2FS_I(inode);
if (ri->i_inline & F2FS_INLINE_XATTR)
- set_bit(FI_INLINE_XATTR, &fi->flags);
+ set_bit(FI_INLINE_XATTR, fi->flags);
if (ri->i_inline & F2FS_INLINE_DATA)
- set_bit(FI_INLINE_DATA, &fi->flags);
+ set_bit(FI_INLINE_DATA, fi->flags);
if (ri->i_inline & F2FS_INLINE_DENTRY)
- set_bit(FI_INLINE_DENTRY, &fi->flags);
+ set_bit(FI_INLINE_DENTRY, fi->flags);
if (ri->i_inline & F2FS_DATA_EXIST)
- set_bit(FI_DATA_EXIST, &fi->flags);
+ set_bit(FI_DATA_EXIST, fi->flags);
if (ri->i_inline & F2FS_INLINE_DOTS)
- set_bit(FI_INLINE_DOTS, &fi->flags);
+ set_bit(FI_INLINE_DOTS, fi->flags);
if (ri->i_inline & F2FS_EXTRA_ATTR)
- set_bit(FI_EXTRA_ATTR, &fi->flags);
+ set_bit(FI_EXTRA_ATTR, fi->flags);
if (ri->i_inline & F2FS_PIN_FILE)
- set_bit(FI_PIN_FILE, &fi->flags);
+ set_bit(FI_PIN_FILE, fi->flags);
}
static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri)
@@ -2857,9 +2885,9 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync)
if (!f2fs_is_time_consistent(inode))
return false;
- down_read(&F2FS_I(inode)->i_sem);
+ spin_lock(&F2FS_I(inode)->i_size_lock);
ret = F2FS_I(inode)->last_disk_size == i_size_read(inode);
- up_read(&F2FS_I(inode)->i_sem);
+ spin_unlock(&F2FS_I(inode)->i_size_lock);
return ret;
}
@@ -3213,7 +3241,7 @@ void f2fs_drop_inmem_pages(struct inode *inode);
void f2fs_drop_inmem_page(struct inode *inode, struct page *page);
int f2fs_commit_inmem_pages(struct inode *inode);
void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need);
-void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi);
+void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg);
int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino);
int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi);
int f2fs_flush_device_cache(struct f2fs_sb_info *sbi);
@@ -3309,7 +3337,7 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi);
void f2fs_update_dirty_page(struct inode *inode, struct page *page);
void f2fs_remove_dirty_inode(struct inode *inode);
int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type);
-void f2fs_wait_on_all_pages_writeback(struct f2fs_sb_info *sbi);
+void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type);
int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc);
void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi);
int __init f2fs_create_checkpoint_caches(void);
@@ -3320,7 +3348,7 @@ void f2fs_destroy_checkpoint_caches(void);
*/
int __init f2fs_init_bioset(void);
void f2fs_destroy_bioset(void);
-struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool no_fail);
+struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool noio);
int f2fs_init_bio_entry_cache(void);
void f2fs_destroy_bio_entry_cache(void);
void f2fs_submit_bio(struct f2fs_sb_info *sbi,
@@ -3776,7 +3804,7 @@ int f2fs_write_multi_pages(struct compress_ctx *cc,
int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index);
int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
unsigned nr_pages, sector_t *last_block_in_bio,
- bool is_readahead);
+ bool is_readahead, bool for_write);
struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc);
void f2fs_free_dic(struct decompress_io_ctx *dic);
void f2fs_decompress_end_io(struct page **rpages,
@@ -3813,6 +3841,7 @@ static inline void set_compress_context(struct inode *inode)
F2FS_I(inode)->i_flags |= F2FS_COMPR_FL;
set_inode_flag(inode, FI_COMPRESSED_FILE);
stat_inc_compr_inode(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
}
static inline u64 f2fs_disable_compressed_file(struct inode *inode)
@@ -3821,12 +3850,17 @@ static inline u64 f2fs_disable_compressed_file(struct inode *inode)
if (!f2fs_compressed_file(inode))
return 0;
- if (fi->i_compr_blocks)
- return fi->i_compr_blocks;
+ if (S_ISREG(inode->i_mode)) {
+ if (get_dirty_pages(inode))
+ return 1;
+ if (fi->i_compr_blocks)
+ return fi->i_compr_blocks;
+ }
fi->i_flags &= ~F2FS_COMPR_FL;
- clear_inode_flag(inode, FI_COMPRESSED_FILE);
stat_dec_compr_inode(inode);
+ clear_inode_flag(inode, FI_COMPRESSED_FILE);
+ f2fs_mark_inode_dirty_sync(inode, true);
return 0;
}
@@ -3903,31 +3937,25 @@ static inline bool f2fs_hw_is_readonly(struct f2fs_sb_info *sbi)
return false;
}
-
-static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt)
+static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi)
{
- clear_opt(sbi, ADAPTIVE);
- clear_opt(sbi, LFS);
-
- switch (mt) {
- case F2FS_MOUNT_ADAPTIVE:
- set_opt(sbi, ADAPTIVE);
- break;
- case F2FS_MOUNT_LFS:
- set_opt(sbi, LFS);
- break;
- }
+ return F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS;
}
-static inline bool f2fs_may_encrypt(struct inode *inode)
+static inline bool f2fs_may_encrypt(struct inode *dir, struct inode *inode)
{
#ifdef CONFIG_FS_ENCRYPTION
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
umode_t mode = inode->i_mode;
- return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode));
-#else
- return false;
+ /*
+ * If the directory encrypted or dummy encryption enabled,
+ * then we should encrypt the inode.
+ */
+ if (IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi))
+ return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode));
#endif
+ return false;
}
static inline bool f2fs_may_compress(struct inode *inode)
@@ -3971,7 +3999,7 @@ static inline int allow_outplace_dio(struct inode *inode,
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
int rw = iov_iter_rw(iter);
- return (test_opt(sbi, LFS) && (rw == WRITE) &&
+ return (f2fs_lfs_mode(sbi) && (rw == WRITE) &&
!block_unaligned_IO(inode, iocb, iter));
}
@@ -3993,7 +4021,7 @@ static inline bool f2fs_force_buffered_io(struct inode *inode,
*/
if (f2fs_sb_has_blkzoned(sbi))
return true;
- if (test_opt(sbi, LFS) && (rw == WRITE)) {
+ if (f2fs_lfs_mode(sbi) && (rw == WRITE)) {
if (block_unaligned_IO(inode, iocb, iter))
return true;
if (F2FS_IO_ALIGNED(sbi))
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 351762f77840..6ab8f621a3c5 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -106,13 +106,20 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
err = f2fs_get_block(&dn, page->index);
f2fs_put_dnode(&dn);
__do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false);
- if (err) {
- unlock_page(page);
- goto out_sem;
- }
}
- /* fill the page */
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ if (!need_alloc) {
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ err = f2fs_get_dnode_of_data(&dn, page->index, LOOKUP_NODE);
+ f2fs_put_dnode(&dn);
+ }
+#endif
+ if (err) {
+ unlock_page(page);
+ goto out_sem;
+ }
+
f2fs_wait_on_page_writeback(page, DATA, false, true);
/* wait for GCed page writeback via META_MAPPING */
@@ -448,8 +455,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
data_ofs = (loff_t)pgofs << PAGE_SHIFT) {
block_t blkaddr;
- blkaddr = datablock_addr(dn.inode,
- dn.node_page, dn.ofs_in_node);
+ blkaddr = f2fs_data_blkaddr(&dn);
if (__is_valid_data_blkaddr(blkaddr) &&
!f2fs_is_valid_blkaddr(F2FS_I_SB(inode),
@@ -793,6 +799,8 @@ int f2fs_getattr(const struct path *path, struct kstat *stat,
}
flags = fi->i_flags;
+ if (flags & F2FS_COMPR_FL)
+ stat->attributes |= STATX_ATTR_COMPRESSED;
if (flags & F2FS_APPEND_FL)
stat->attributes |= STATX_ATTR_APPEND;
if (IS_ENCRYPTED(inode))
@@ -804,7 +812,8 @@ int f2fs_getattr(const struct path *path, struct kstat *stat,
if (IS_VERITY(inode))
stat->attributes |= STATX_ATTR_VERITY;
- stat->attributes_mask |= (STATX_ATTR_APPEND |
+ stat->attributes_mask |= (STATX_ATTR_COMPRESSED |
+ STATX_ATTR_APPEND |
STATX_ATTR_ENCRYPTED |
STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP |
@@ -929,10 +938,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
if (err)
return err;
- down_write(&F2FS_I(inode)->i_sem);
+ spin_lock(&F2FS_I(inode)->i_size_lock);
inode->i_mtime = inode->i_ctime = current_time(inode);
F2FS_I(inode)->last_disk_size = i_size_read(inode);
- up_write(&F2FS_I(inode)->i_sem);
+ spin_unlock(&F2FS_I(inode)->i_size_lock);
}
__setattr_copy(inode, attr);
@@ -1109,8 +1118,7 @@ next_dnode:
done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, inode) -
dn.ofs_in_node, len);
for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) {
- *blkaddr = datablock_addr(dn.inode,
- dn.node_page, dn.ofs_in_node);
+ *blkaddr = f2fs_data_blkaddr(&dn);
if (__is_valid_data_blkaddr(*blkaddr) &&
!f2fs_is_valid_blkaddr(sbi, *blkaddr,
@@ -1121,7 +1129,7 @@ next_dnode:
if (!f2fs_is_checkpointed_data(sbi, *blkaddr)) {
- if (test_opt(sbi, LFS)) {
+ if (f2fs_lfs_mode(sbi)) {
f2fs_put_dnode(&dn);
return -EOPNOTSUPP;
}
@@ -1199,8 +1207,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode,
ADDRS_PER_PAGE(dn.node_page, dst_inode) -
dn.ofs_in_node, len - i);
do {
- dn.data_blkaddr = datablock_addr(dn.inode,
- dn.node_page, dn.ofs_in_node);
+ dn.data_blkaddr = f2fs_data_blkaddr(&dn);
f2fs_truncate_data_blocks_range(&dn, 1);
if (do_replace[i]) {
@@ -1376,8 +1383,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start,
int ret;
for (; index < end; index++, dn->ofs_in_node++) {
- if (datablock_addr(dn->inode, dn->node_page,
- dn->ofs_in_node) == NULL_ADDR)
+ if (f2fs_data_blkaddr(dn) == NULL_ADDR)
count++;
}
@@ -1388,8 +1394,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start,
dn->ofs_in_node = ofs_in_node;
for (index = start; index < end; index++, dn->ofs_in_node++) {
- dn->data_blkaddr = datablock_addr(dn->inode,
- dn->node_page, dn->ofs_in_node);
+ dn->data_blkaddr = f2fs_data_blkaddr(dn);
/*
* f2fs_reserve_new_blocks will not guarantee entire block
* allocation.
@@ -1787,12 +1792,15 @@ static int f2fs_file_flush(struct file *file, fl_owner_t id)
static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask)
{
struct f2fs_inode_info *fi = F2FS_I(inode);
+ u32 masked_flags = fi->i_flags & mask;
+
+ f2fs_bug_on(F2FS_I_SB(inode), (iflags & ~mask));
/* Is it quota file? Do not allow user to mess with it */
if (IS_NOQUOTA(inode))
return -EPERM;
- if ((iflags ^ fi->i_flags) & F2FS_CASEFOLD_FL) {
+ if ((iflags ^ masked_flags) & F2FS_CASEFOLD_FL) {
if (!f2fs_sb_has_casefold(F2FS_I_SB(inode)))
return -EOPNOTSUPP;
if (!f2fs_empty_dir(inode))
@@ -1806,27 +1814,22 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask)
return -EINVAL;
}
- if ((iflags ^ fi->i_flags) & F2FS_COMPR_FL) {
- if (S_ISREG(inode->i_mode) &&
- (fi->i_flags & F2FS_COMPR_FL || i_size_read(inode) ||
- F2FS_HAS_BLOCKS(inode)))
- return -EINVAL;
+ if ((iflags ^ masked_flags) & F2FS_COMPR_FL) {
+ if (masked_flags & F2FS_COMPR_FL) {
+ if (f2fs_disable_compressed_file(inode))
+ return -EINVAL;
+ }
if (iflags & F2FS_NOCOMP_FL)
return -EINVAL;
if (iflags & F2FS_COMPR_FL) {
- int err = f2fs_convert_inline_inode(inode);
-
- if (err)
- return err;
-
if (!f2fs_may_compress(inode))
return -EINVAL;
set_compress_context(inode);
}
}
- if ((iflags ^ fi->i_flags) & F2FS_NOCOMP_FL) {
- if (fi->i_flags & F2FS_COMPR_FL)
+ if ((iflags ^ masked_flags) & F2FS_NOCOMP_FL) {
+ if (masked_flags & F2FS_COMPR_FL)
return -EINVAL;
}
@@ -3401,6 +3404,21 @@ out:
return err;
}
+static int f2fs_get_compress_blocks(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ __u64 blocks;
+
+ if (!f2fs_sb_has_compression(F2FS_I_SB(inode)))
+ return -EOPNOTSUPP;
+
+ if (!f2fs_compressed_file(inode))
+ return -EINVAL;
+
+ blocks = F2FS_I(inode)->i_compr_blocks;
+ return put_user(blocks, (u64 __user *)arg);
+}
+
long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp)))))
@@ -3481,6 +3499,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return f2fs_get_volume_name(filp, arg);
case F2FS_IOC_SET_VOLUME_NAME:
return f2fs_set_volume_name(filp, arg);
+ case F2FS_IOC_GET_COMPRESS_BLOCKS:
+ return f2fs_get_compress_blocks(filp, arg);
default:
return -ENOTTY;
}
@@ -3508,8 +3528,10 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
goto out;
}
- if (!f2fs_is_compress_backend_ready(inode))
- return -EOPNOTSUPP;
+ if (!f2fs_is_compress_backend_ready(inode)) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
if (iocb->ki_flags & IOCB_NOWAIT) {
if (!inode_trylock(inode)) {
@@ -3639,6 +3661,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case FS_IOC_MEASURE_VERITY:
case F2FS_IOC_GET_VOLUME_NAME:
case F2FS_IOC_SET_VOLUME_NAME:
+ case F2FS_IOC_GET_COMPRESS_BLOCKS:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index db8725d473b5..26248c8936db 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -31,6 +31,8 @@ static int gc_thread_func(void *data)
set_freezable();
do {
+ bool sync_mode;
+
wait_event_interruptible_timeout(*wq,
kthread_should_stop() || freezing(current) ||
gc_th->gc_wake,
@@ -101,15 +103,17 @@ static int gc_thread_func(void *data)
do_gc:
stat_inc_bggc_count(sbi->stat_info);
+ sync_mode = F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC;
+
/* if return value is not zero, no victim was selected */
- if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true, NULL_SEGNO))
+ if (f2fs_gc(sbi, sync_mode, true, NULL_SEGNO))
wait_ms = gc_th->no_gc_sleep_time;
trace_f2fs_background_gc(sbi->sb, wait_ms,
prefree_segments(sbi), free_segments(sbi));
/* balancing f2fs's metadata periodically */
- f2fs_balance_fs_bg(sbi);
+ f2fs_balance_fs_bg(sbi, true);
next:
sb_end_write(sbi->sb);
@@ -192,7 +196,10 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
p->ofs_unit = sbi->segs_per_sec;
}
- /* we need to check every dirty segments in the FG_GC case */
+ /*
+ * adjust candidates range, should select all dirty segments for
+ * foreground GC and urgent GC cases.
+ */
if (gc_type != FG_GC &&
(sbi->gc_mode != GC_URGENT) &&
p->max_search > sbi->max_victim_search)
@@ -634,7 +641,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
}
*nofs = ofs_of_node(node_page);
- source_blkaddr = datablock_addr(NULL, node_page, ofs_in_node);
+ source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node);
f2fs_put_page(node_page, 1);
if (source_blkaddr != blkaddr) {
@@ -762,7 +769,7 @@ static int move_data_block(struct inode *inode, block_t bidx,
struct page *page, *mpage;
block_t newaddr;
int err = 0;
- bool lfs_mode = test_opt(fio.sbi, LFS);
+ bool lfs_mode = f2fs_lfs_mode(fio.sbi);
/* do not read out */
page = f2fs_grab_cache_page(inode->i_mapping, bidx, false);
@@ -970,7 +977,8 @@ retry:
if (err) {
clear_cold_data(page);
if (err == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC,
+ DEFAULT_IO_TIMEOUT);
goto retry;
}
if (is_dirty)
@@ -1018,8 +1026,8 @@ next_step:
* race condition along with SSR block allocation.
*/
if ((gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) ||
- get_valid_blocks(sbi, segno, false) ==
- sbi->blocks_per_seg)
+ get_valid_blocks(sbi, segno, true) ==
+ BLKS_PER_SEC(sbi))
return submitted;
if (check_valid_map(sbi, segno, off) == 0)
@@ -1203,7 +1211,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
if (get_valid_blocks(sbi, segno, false) == 0)
goto freed;
- if (__is_large_section(sbi) &&
+ if (gc_type == BG_GC && __is_large_section(sbi) &&
migrated >= sbi->migration_granularity)
goto skip;
if (!PageUptodate(sum_page) || unlikely(f2fs_cp_error(sbi)))
@@ -1233,12 +1241,12 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
segno, gc_type);
stat_inc_seg_count(sbi, type, gc_type);
+ migrated++;
freed:
if (gc_type == FG_GC &&
get_valid_blocks(sbi, segno, false) == 0)
seg_freed++;
- migrated++;
if (__is_large_section(sbi) && segno + 1 < end_segno)
sbi->next_victim_seg[gc_type] = segno + 1;
@@ -1434,12 +1442,19 @@ static int free_segment_range(struct f2fs_sb_info *sbi, unsigned int start,
static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs)
{
struct f2fs_super_block *raw_sb = F2FS_RAW_SUPER(sbi);
- int section_count = le32_to_cpu(raw_sb->section_count);
- int segment_count = le32_to_cpu(raw_sb->segment_count);
- int segment_count_main = le32_to_cpu(raw_sb->segment_count_main);
- long long block_count = le64_to_cpu(raw_sb->block_count);
+ int section_count;
+ int segment_count;
+ int segment_count_main;
+ long long block_count;
int segs = secs * sbi->segs_per_sec;
+ down_write(&sbi->sb_lock);
+
+ section_count = le32_to_cpu(raw_sb->section_count);
+ segment_count = le32_to_cpu(raw_sb->segment_count);
+ segment_count_main = le32_to_cpu(raw_sb->segment_count_main);
+ block_count = le64_to_cpu(raw_sb->block_count);
+
raw_sb->section_count = cpu_to_le32(section_count + secs);
raw_sb->segment_count = cpu_to_le32(segment_count + segs);
raw_sb->segment_count_main = cpu_to_le32(segment_count_main + segs);
@@ -1453,6 +1468,8 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs)
raw_sb->devs[last_dev].total_segments =
cpu_to_le32(dev_segs + segs);
}
+
+ up_write(&sbi->sb_lock);
}
static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs)
@@ -1570,11 +1587,17 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count)
goto out;
}
+ mutex_lock(&sbi->cp_mutex);
update_fs_metadata(sbi, -secs);
clear_sbi_flag(sbi, SBI_IS_RESIZEFS);
+ set_sbi_flag(sbi, SBI_IS_DIRTY);
+ mutex_unlock(&sbi->cp_mutex);
+
err = f2fs_sync_fs(sbi->sb, 1);
if (err) {
+ mutex_lock(&sbi->cp_mutex);
update_fs_metadata(sbi, secs);
+ mutex_unlock(&sbi->cp_mutex);
update_sb_metadata(sbi, secs);
f2fs_commit_super(sbi, false);
}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 78c3f1d70f1d..44582a4db513 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -291,13 +291,30 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
fi->i_flags & F2FS_COMPR_FL &&
F2FS_FITS_IN_INODE(ri, fi->i_extra_isize,
i_log_cluster_size)) {
- if (ri->i_compress_algorithm >= COMPRESS_MAX)
+ if (ri->i_compress_algorithm >= COMPRESS_MAX) {
+ f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported "
+ "compress algorithm: %u, run fsck to fix",
+ __func__, inode->i_ino,
+ ri->i_compress_algorithm);
return false;
- if (le64_to_cpu(ri->i_compr_blocks) > inode->i_blocks)
+ }
+ if (le64_to_cpu(ri->i_compr_blocks) >
+ SECTOR_TO_BLOCK(inode->i_blocks)) {
+ f2fs_warn(sbi, "%s: inode (ino=%lx) has inconsistent "
+ "i_compr_blocks:%llu, i_blocks:%llu, run fsck to fix",
+ __func__, inode->i_ino,
+ le64_to_cpu(ri->i_compr_blocks),
+ SECTOR_TO_BLOCK(inode->i_blocks));
return false;
+ }
if (ri->i_log_cluster_size < MIN_COMPRESS_LOG_SIZE ||
- ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE)
+ ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE) {
+ f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported "
+ "log cluster size: %u, run fsck to fix",
+ __func__, inode->i_ino,
+ ri->i_log_cluster_size);
return false;
+ }
}
return true;
@@ -345,7 +362,7 @@ static int do_read_inode(struct inode *inode)
fi->i_flags = le32_to_cpu(ri->i_flags);
if (S_ISREG(inode->i_mode))
fi->i_flags &= ~F2FS_PROJINHERIT_FL;
- fi->flags = 0;
+ bitmap_zero(fi->flags, FI_MAX);
fi->i_advise = ri->i_advise;
fi->i_pino = le32_to_cpu(ri->i_pino);
fi->i_dir_level = ri->i_dir_level;
@@ -518,7 +535,7 @@ retry:
inode = f2fs_iget(sb, ino);
if (IS_ERR(inode)) {
if (PTR_ERR(inode) == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
goto retry;
}
}
@@ -759,7 +776,7 @@ no_delete:
else
f2fs_inode_synced(inode);
- /* ino == 0, if f2fs_new_inode() was failed t*/
+ /* for the case f2fs_new_inode() was failed, .i_ino is zero, skip it */
if (inode->i_ino)
invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino,
inode->i_ino);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 2aa035422c0f..f54119da2217 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -75,9 +75,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
set_inode_flag(inode, FI_NEW_INODE);
- /* If the directory encrypted, then we should encrypt the inode. */
- if ((IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) &&
- f2fs_may_encrypt(inode))
+ if (f2fs_may_encrypt(dir, inode))
f2fs_set_encrypted_inode(inode);
if (f2fs_sb_has_extra_attr(sbi)) {
@@ -177,7 +175,7 @@ static inline int is_extension_exist(const unsigned char *s, const char *sub)
}
/*
- * Set multimedia files as cold files for hot/cold data separation
+ * Set file's temperature for hot/cold data separation
*/
static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode,
const unsigned char *name)
@@ -876,12 +874,6 @@ static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
if (!f2fs_is_checkpoint_ready(sbi))
return -ENOSPC;
- if (IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) {
- int err = fscrypt_get_encryption_info(dir);
- if (err)
- return err;
- }
-
return __f2fs_tmpfile(dir, dentry, mode, NULL);
}
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 9d02cdcdbb07..ecbd6bd14a49 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -510,9 +510,6 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
return nr - nr_shrink;
}
-/*
- * This function always returns success
- */
int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
struct node_info *ni)
{
@@ -716,8 +713,7 @@ got:
/*
* Caller should call f2fs_put_dnode(dn).
* Also, it should grab and release a rwsem by calling f2fs_lock_op() and
- * f2fs_unlock_op() only if ro is not set RDONLY_NODE.
- * In the case of RDONLY_NODE, we don't need to care about mutex.
+ * f2fs_unlock_op() only if mode is set with ALLOC_NODE.
*/
int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
{
@@ -809,8 +805,7 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
dn->nid = nids[level];
dn->ofs_in_node = offset[level];
dn->node_page = npage[level];
- dn->data_blkaddr = datablock_addr(dn->inode,
- dn->node_page, dn->ofs_in_node);
+ dn->data_blkaddr = f2fs_data_blkaddr(dn);
return 0;
release_pages:
@@ -1188,8 +1183,9 @@ int f2fs_remove_inode_page(struct inode *inode)
}
if (unlikely(inode->i_blocks != 0 && inode->i_blocks != 8)) {
- f2fs_warn(F2FS_I_SB(inode), "Inconsistent i_blocks, ino:%lu, iblocks:%llu",
- inode->i_ino, (unsigned long long)inode->i_blocks);
+ f2fs_warn(F2FS_I_SB(inode),
+ "f2fs_remove_inode_page: inconsistent i_blocks, ino:%lu, iblocks:%llu",
+ inode->i_ino, (unsigned long long)inode->i_blocks);
set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
}
@@ -1562,15 +1558,16 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
if (atomic && !test_opt(sbi, NOBARRIER))
fio.op_flags |= REQ_PREFLUSH | REQ_FUA;
- set_page_writeback(page);
- ClearPageError(page);
-
+ /* should add to global list before clearing PAGECACHE status */
if (f2fs_in_warm_node_list(sbi, page)) {
seq = f2fs_add_fsync_node_entry(sbi, page);
if (seq_id)
*seq_id = seq;
}
+ set_page_writeback(page);
+ ClearPageError(page);
+
fio.old_blkaddr = ni.blk_addr;
f2fs_do_write_node_page(nid, &fio);
set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
@@ -1979,7 +1976,7 @@ static int f2fs_write_node_pages(struct address_space *mapping,
goto skip_write;
/* balancing f2fs's metadata in background */
- f2fs_balance_fs_bg(sbi);
+ f2fs_balance_fs_bg(sbi, true);
/* collect a number of dirty node pages and write together */
if (wbc->sync_mode != WB_SYNC_ALL &&
@@ -2602,7 +2599,7 @@ int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
retry:
ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false);
if (!ipage) {
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
goto retry;
}
@@ -3193,22 +3190,22 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi)
int __init f2fs_create_node_manager_caches(void)
{
- nat_entry_slab = f2fs_kmem_cache_create("nat_entry",
+ nat_entry_slab = f2fs_kmem_cache_create("f2fs_nat_entry",
sizeof(struct nat_entry));
if (!nat_entry_slab)
goto fail;
- free_nid_slab = f2fs_kmem_cache_create("free_nid",
+ free_nid_slab = f2fs_kmem_cache_create("f2fs_free_nid",
sizeof(struct free_nid));
if (!free_nid_slab)
goto destroy_nat_entry;
- nat_entry_set_slab = f2fs_kmem_cache_create("nat_entry_set",
+ nat_entry_set_slab = f2fs_kmem_cache_create("f2fs_nat_entry_set",
sizeof(struct nat_entry_set));
if (!nat_entry_set_slab)
goto destroy_free_nid;
- fsync_node_entry_slab = f2fs_kmem_cache_create("fsync_node_entry",
+ fsync_node_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_node_entry",
sizeof(struct fsync_node_entry));
if (!fsync_node_entry_slab)
goto destroy_nat_entry_set;
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 763d5c0951d1..dd804c07eeb0 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -496,8 +496,7 @@ out:
return 0;
truncate_out:
- if (datablock_addr(tdn.inode, tdn.node_page,
- tdn.ofs_in_node) == blkaddr)
+ if (f2fs_data_blkaddr(&tdn) == blkaddr)
f2fs_truncate_data_blocks_range(&tdn, 1);
if (dn->inode->i_ino == nid && !dn->inode_page_locked)
unlock_page(dn->inode_page);
@@ -535,7 +534,7 @@ retry_dn:
err = f2fs_get_dnode_of_data(&dn, start, ALLOC_NODE);
if (err) {
if (err == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
goto retry_dn;
}
goto out;
@@ -560,8 +559,8 @@ retry_dn:
for (; start < end; start++, dn.ofs_in_node++) {
block_t src, dest;
- src = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node);
- dest = datablock_addr(dn.inode, page, dn.ofs_in_node);
+ src = f2fs_data_blkaddr(&dn);
+ dest = data_blkaddr(dn.inode, page, dn.ofs_in_node);
if (__is_valid_data_blkaddr(src) &&
!f2fs_is_valid_blkaddr(sbi, src, META_POR)) {
@@ -618,7 +617,8 @@ retry_prev:
err = check_index_in_prev_nodes(sbi, dest, &dn);
if (err) {
if (err == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC,
+ DEFAULT_IO_TIMEOUT);
goto retry_prev;
}
goto err;
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index cf0eb002cfd4..b7a9421472a7 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -172,7 +172,7 @@ bool f2fs_need_SSR(struct f2fs_sb_info *sbi)
int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA);
- if (test_opt(sbi, LFS))
+ if (f2fs_lfs_mode(sbi))
return false;
if (sbi->gc_mode == GC_URGENT)
return true;
@@ -245,7 +245,8 @@ retry:
LOOKUP_NODE);
if (err) {
if (err == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC,
+ DEFAULT_IO_TIMEOUT);
cond_resched();
goto retry;
}
@@ -312,7 +313,7 @@ next:
skip:
iput(inode);
}
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
cond_resched();
if (gc_failure) {
if (++looped >= count)
@@ -415,7 +416,8 @@ retry:
err = f2fs_do_write_data_page(&fio);
if (err) {
if (err == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC,
+ DEFAULT_IO_TIMEOUT);
cond_resched();
goto retry;
}
@@ -494,7 +496,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
/* balance_fs_bg is able to be pending */
if (need && excess_cached_nats(sbi))
- f2fs_balance_fs_bg(sbi);
+ f2fs_balance_fs_bg(sbi, false);
if (!f2fs_is_checkpoint_ready(sbi))
return;
@@ -509,7 +511,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
}
}
-void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
+void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg)
{
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
return;
@@ -538,7 +540,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
excess_dirty_nats(sbi) ||
excess_dirty_nodes(sbi) ||
f2fs_time_over(sbi, CP_TIME)) {
- if (test_opt(sbi, DATA_FLUSH)) {
+ if (test_opt(sbi, DATA_FLUSH) && from_bg) {
struct blk_plug plug;
mutex_lock(&sbi->flush_lock);
@@ -1078,7 +1080,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST;
dpolicy->io_aware_gran = MAX_PLIST_NUM;
- dpolicy->timeout = 0;
+ dpolicy->timeout = false;
if (discard_type == DPOLICY_BG) {
dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME;
@@ -1103,6 +1105,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
dpolicy->io_aware = false;
/* we need to issue all to keep CP_TRIMMED_FLAG */
dpolicy->granularity = 1;
+ dpolicy->timeout = true;
}
}
@@ -1471,12 +1474,12 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi,
int i, issued = 0;
bool io_interrupted = false;
- if (dpolicy->timeout != 0)
- f2fs_update_time(sbi, dpolicy->timeout);
+ if (dpolicy->timeout)
+ f2fs_update_time(sbi, UMOUNT_DISCARD_TIMEOUT);
for (i = MAX_PLIST_NUM - 1; i >= 0; i--) {
- if (dpolicy->timeout != 0 &&
- f2fs_time_over(sbi, dpolicy->timeout))
+ if (dpolicy->timeout &&
+ f2fs_time_over(sbi, UMOUNT_DISCARD_TIMEOUT))
break;
if (i + 1 < dpolicy->granularity)
@@ -1497,8 +1500,8 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi,
list_for_each_entry_safe(dc, tmp, pend_list, list) {
f2fs_bug_on(sbi, dc->state != D_PREP);
- if (dpolicy->timeout != 0 &&
- f2fs_time_over(sbi, dpolicy->timeout))
+ if (dpolicy->timeout &&
+ f2fs_time_over(sbi, UMOUNT_DISCARD_TIMEOUT))
break;
if (dpolicy->io_aware && i < dpolicy->io_aware_gran &&
@@ -1677,7 +1680,6 @@ bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi)
__init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT,
dcc->discard_granularity);
- dpolicy.timeout = UMOUNT_DISCARD_TIMEOUT;
__issue_discard_cmd(sbi, &dpolicy);
dropped = __drop_discard_cmd(sbi);
@@ -1940,7 +1942,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
unsigned int start = 0, end = -1;
unsigned int secno, start_segno;
bool force = (cpc->reason & CP_DISCARD);
- bool need_align = test_opt(sbi, LFS) && __is_large_section(sbi);
+ bool need_align = f2fs_lfs_mode(sbi) && __is_large_section(sbi);
mutex_lock(&dirty_i->seglist_lock);
@@ -1972,7 +1974,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
(end - 1) <= cpc->trim_end)
continue;
- if (!test_opt(sbi, LFS) || !__is_large_section(sbi)) {
+ if (!f2fs_lfs_mode(sbi) || !__is_large_section(sbi)) {
f2fs_issue_discard(sbi, START_BLOCK(sbi, start),
(end - start) << sbi->log_blocks_per_seg);
continue;
@@ -2801,7 +2803,7 @@ next:
blk_finish_plug(&plug);
mutex_unlock(&dcc->cmd_lock);
trimmed += __wait_all_discard_cmd(sbi, NULL);
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
goto next;
}
skip:
@@ -2830,7 +2832,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
struct discard_policy dpolicy;
unsigned long long trimmed = 0;
int err = 0;
- bool need_align = test_opt(sbi, LFS) && __is_large_section(sbi);
+ bool need_align = f2fs_lfs_mode(sbi) && __is_large_section(sbi);
if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize)
return -EINVAL;
@@ -3193,7 +3195,7 @@ static void update_device_state(struct f2fs_io_info *fio)
static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
{
int type = __get_segment_type(fio);
- bool keep_order = (test_opt(fio->sbi, LFS) && type == CURSEG_COLD_DATA);
+ bool keep_order = (f2fs_lfs_mode(fio->sbi) && type == CURSEG_COLD_DATA);
if (keep_order)
down_read(&fio->sbi->io_order_lock);
@@ -4071,7 +4073,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
sit_i->dirty_sentries = 0;
sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK;
sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time);
- sit_i->mounted_time = ktime_get_real_seconds();
+ sit_i->mounted_time = ktime_get_boottime_seconds();
init_rwsem(&sit_i->sentry_lock);
return 0;
}
@@ -4678,7 +4680,7 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi)
if (sm_info->rec_prefree_segments > DEF_MAX_RECLAIM_PREFREE_SEGMENTS)
sm_info->rec_prefree_segments = DEF_MAX_RECLAIM_PREFREE_SEGMENTS;
- if (!test_opt(sbi, LFS))
+ if (!f2fs_lfs_mode(sbi))
sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC;
sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS;
@@ -4830,22 +4832,22 @@ void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi)
int __init f2fs_create_segment_manager_caches(void)
{
- discard_entry_slab = f2fs_kmem_cache_create("discard_entry",
+ discard_entry_slab = f2fs_kmem_cache_create("f2fs_discard_entry",
sizeof(struct discard_entry));
if (!discard_entry_slab)
goto fail;
- discard_cmd_slab = f2fs_kmem_cache_create("discard_cmd",
+ discard_cmd_slab = f2fs_kmem_cache_create("f2fs_discard_cmd",
sizeof(struct discard_cmd));
if (!discard_cmd_slab)
goto destroy_discard_entry;
- sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set",
+ sit_entry_set_slab = f2fs_kmem_cache_create("f2fs_sit_entry_set",
sizeof(struct sit_entry_set));
if (!sit_entry_set_slab)
goto destroy_discard_cmd;
- inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry",
+ inmem_entry_slab = f2fs_kmem_cache_create("f2fs_inmem_page_entry",
sizeof(struct inmem_pages));
if (!inmem_entry_slab)
goto destroy_sit_entry_set;
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 459dc3901a57..7a83bd530812 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -756,7 +756,7 @@ static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi,
bool base_time)
{
struct sit_info *sit_i = SIT_I(sbi);
- time64_t diff, now = ktime_get_real_seconds();
+ time64_t diff, now = ktime_get_boottime_seconds();
if (now >= sit_i->mounted_time)
return sit_i->elapsed_time + now - sit_i->mounted_time;
diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c
index a467aca29cfe..d66de5999a26 100644
--- a/fs/f2fs/shrinker.c
+++ b/fs/f2fs/shrinker.c
@@ -58,7 +58,7 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink,
/* count extent cache entries */
count += __count_extent_cache(sbi);
- /* shrink clean nat cache entries */
+ /* count clean nat cache entries */
count += __count_nat_entries(sbi);
/* count free nids cache entries */
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index d398b2d90c6c..f2dfc21c6abb 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -428,14 +428,11 @@ static int parse_options(struct super_block *sb, char *options)
if (!name)
return -ENOMEM;
if (strlen(name) == 2 && !strncmp(name, "on", 2)) {
- set_opt(sbi, BG_GC);
- clear_opt(sbi, FORCE_FG_GC);
+ F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON;
} else if (strlen(name) == 3 && !strncmp(name, "off", 3)) {
- clear_opt(sbi, BG_GC);
- clear_opt(sbi, FORCE_FG_GC);
+ F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_OFF;
} else if (strlen(name) == 4 && !strncmp(name, "sync", 4)) {
- set_opt(sbi, BG_GC);
- set_opt(sbi, FORCE_FG_GC);
+ F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_SYNC;
} else {
kvfree(name);
return -EINVAL;
@@ -447,7 +444,7 @@ static int parse_options(struct super_block *sb, char *options)
break;
case Opt_norecovery:
/* this option mounts f2fs with ro */
- set_opt(sbi, DISABLE_ROLL_FORWARD);
+ set_opt(sbi, NORECOVERY);
if (!f2fs_readonly(sb))
return -EINVAL;
break;
@@ -601,10 +598,10 @@ static int parse_options(struct super_block *sb, char *options)
kvfree(name);
return -EINVAL;
}
- set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE);
+ F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE;
} else if (strlen(name) == 3 &&
!strncmp(name, "lfs", 3)) {
- set_opt_mode(sbi, F2FS_MOUNT_LFS);
+ F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS;
} else {
kvfree(name);
return -EINVAL;
@@ -833,6 +830,10 @@ static int parse_options(struct super_block *sb, char *options)
!strcmp(name, "lz4")) {
F2FS_OPTION(sbi).compress_algorithm =
COMPRESS_LZ4;
+ } else if (strlen(name) == 4 &&
+ !strcmp(name, "zstd")) {
+ F2FS_OPTION(sbi).compress_algorithm =
+ COMPRESS_ZSTD;
} else {
kfree(name);
return -EINVAL;
@@ -905,7 +906,7 @@ static int parse_options(struct super_block *sb, char *options)
}
#endif
- if (F2FS_IO_SIZE_BITS(sbi) && !test_opt(sbi, LFS)) {
+ if (F2FS_IO_SIZE_BITS(sbi) && !f2fs_lfs_mode(sbi)) {
f2fs_err(sbi, "Should set mode=lfs with %uKB-sized IO",
F2FS_IO_SIZE_KB(sbi));
return -EINVAL;
@@ -935,7 +936,7 @@ static int parse_options(struct super_block *sb, char *options)
}
}
- if (test_opt(sbi, DISABLE_CHECKPOINT) && test_opt(sbi, LFS)) {
+ if (test_opt(sbi, DISABLE_CHECKPOINT) && f2fs_lfs_mode(sbi)) {
f2fs_err(sbi, "LFS not compatible with checkpoint=disable\n");
return -EINVAL;
}
@@ -961,6 +962,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
/* Initialize f2fs-specific inode info */
atomic_set(&fi->dirty_pages, 0);
init_rwsem(&fi->i_sem);
+ spin_lock_init(&fi->i_size_lock);
INIT_LIST_HEAD(&fi->dirty_list);
INIT_LIST_HEAD(&fi->gdirty_list);
INIT_LIST_HEAD(&fi->inmem_ilist);
@@ -1173,7 +1175,7 @@ static void f2fs_put_super(struct super_block *sb)
/* our cp_error case, we can wait for any writeback page */
f2fs_flush_merged_writes(sbi);
- f2fs_wait_on_all_pages_writeback(sbi);
+ f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA);
f2fs_bug_on(sbi, sbi->fsync_node_num);
@@ -1205,6 +1207,7 @@ static void f2fs_put_super(struct super_block *sb)
kvfree(sbi->raw_super);
destroy_device_list(sbi);
+ f2fs_destroy_xattr_caches(sbi);
mempool_destroy(sbi->write_io_dummy);
#ifdef CONFIG_QUOTA
for (i = 0; i < MAXQUOTAS; i++)
@@ -1421,6 +1424,9 @@ static inline void f2fs_show_compress_options(struct seq_file *seq,
case COMPRESS_LZ4:
algtype = "lz4";
break;
+ case COMPRESS_ZSTD:
+ algtype = "zstd";
+ break;
}
seq_printf(seq, ",compress_algorithm=%s", algtype);
@@ -1437,16 +1443,17 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
{
struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb);
- if (!f2fs_readonly(sbi->sb) && test_opt(sbi, BG_GC)) {
- if (test_opt(sbi, FORCE_FG_GC))
- seq_printf(seq, ",background_gc=%s", "sync");
- else
- seq_printf(seq, ",background_gc=%s", "on");
- } else {
+ if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC)
+ seq_printf(seq, ",background_gc=%s", "sync");
+ else if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_ON)
+ seq_printf(seq, ",background_gc=%s", "on");
+ else if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF)
seq_printf(seq, ",background_gc=%s", "off");
- }
+
if (test_opt(sbi, DISABLE_ROLL_FORWARD))
seq_puts(seq, ",disable_roll_forward");
+ if (test_opt(sbi, NORECOVERY))
+ seq_puts(seq, ",norecovery");
if (test_opt(sbi, DISCARD))
seq_puts(seq, ",discard");
else
@@ -1498,9 +1505,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",data_flush");
seq_puts(seq, ",mode=");
- if (test_opt(sbi, ADAPTIVE))
+ if (F2FS_OPTION(sbi).fs_mode == FS_MODE_ADAPTIVE)
seq_puts(seq, "adaptive");
- else if (test_opt(sbi, LFS))
+ else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS)
seq_puts(seq, "lfs");
seq_printf(seq, ",active_logs=%u", F2FS_OPTION(sbi).active_logs);
if (test_opt(sbi, RESERVE_ROOT))
@@ -1571,11 +1578,11 @@ static void default_options(struct f2fs_sb_info *sbi)
F2FS_OPTION(sbi).test_dummy_encryption = false;
F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID);
F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID);
- F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZO;
+ F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4;
F2FS_OPTION(sbi).compress_log_size = MIN_COMPRESS_LOG_SIZE;
F2FS_OPTION(sbi).compress_ext_cnt = 0;
+ F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON;
- set_opt(sbi, BG_GC);
set_opt(sbi, INLINE_XATTR);
set_opt(sbi, INLINE_DATA);
set_opt(sbi, INLINE_DENTRY);
@@ -1587,9 +1594,9 @@ static void default_options(struct f2fs_sb_info *sbi)
set_opt(sbi, FLUSH_MERGE);
set_opt(sbi, DISCARD);
if (f2fs_sb_has_blkzoned(sbi))
- set_opt_mode(sbi, F2FS_MOUNT_LFS);
+ F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS;
else
- set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE);
+ F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE;
#ifdef CONFIG_F2FS_FS_XATTR
set_opt(sbi, XATTR_USER);
@@ -1658,7 +1665,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
out_unlock:
up_write(&sbi->gc_lock);
restore_flag:
- sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */
+ sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */
return err;
}
@@ -1781,7 +1788,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
* or if background_gc = off is passed in mount
* option. Also sync the filesystem.
*/
- if ((*flags & SB_RDONLY) || !test_opt(sbi, BG_GC)) {
+ if ((*flags & SB_RDONLY) ||
+ F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF) {
if (sbi->gc_thread) {
f2fs_stop_gc_thread(sbi);
need_restart_gc = true;
@@ -1886,7 +1894,8 @@ repeat:
page = read_cache_page_gfp(mapping, blkidx, GFP_NOFS);
if (IS_ERR(page)) {
if (PTR_ERR(page) == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC,
+ DEFAULT_IO_TIMEOUT);
goto repeat;
}
set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
@@ -1928,6 +1937,7 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type,
int offset = off & (sb->s_blocksize - 1);
size_t towrite = len;
struct page *page;
+ void *fsdata = NULL;
char *kaddr;
int err = 0;
int tocopy;
@@ -1937,10 +1947,11 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type,
towrite);
retry:
err = a_ops->write_begin(NULL, mapping, off, tocopy, 0,
- &page, NULL);
+ &page, &fsdata);
if (unlikely(err)) {
if (err == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC,
+ DEFAULT_IO_TIMEOUT);
goto retry;
}
set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
@@ -1953,7 +1964,7 @@ retry:
flush_dcache_page(page);
a_ops->write_end(NULL, mapping, off, tocopy, tocopy,
- page, NULL);
+ page, fsdata);
offset = 0;
towrite -= tocopy;
off += tocopy;
@@ -3457,12 +3468,17 @@ try_onemore:
}
}
+ /* init per sbi slab cache */
+ err = f2fs_init_xattr_caches(sbi);
+ if (err)
+ goto free_io_dummy;
+
/* get an inode for meta space */
sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi));
if (IS_ERR(sbi->meta_inode)) {
f2fs_err(sbi, "Failed to read F2FS meta data inode");
err = PTR_ERR(sbi->meta_inode);
- goto free_io_dummy;
+ goto free_xattr_cache;
}
err = f2fs_get_valid_checkpoint(sbi);
@@ -3590,7 +3606,7 @@ try_onemore:
f2fs_err(sbi, "Cannot turn on quotas: error %d", err);
}
#endif
- /* if there are nt orphan nodes free them */
+ /* if there are any orphan inodes, free them */
err = f2fs_recover_orphan_inodes(sbi);
if (err)
goto free_meta;
@@ -3599,7 +3615,8 @@ try_onemore:
goto reset_checkpoint;
/* recover fsynced data */
- if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
+ if (!test_opt(sbi, DISABLE_ROLL_FORWARD) &&
+ !test_opt(sbi, NORECOVERY)) {
/*
* mount should be failed, when device has readonly mode, and
* previous checkpoint was not done by clean system shutdown.
@@ -3665,7 +3682,7 @@ reset_checkpoint:
* If filesystem is not mounted as read-only then
* do start the gc_thread.
*/
- if (test_opt(sbi, BG_GC) && !f2fs_readonly(sb)) {
+ if (F2FS_OPTION(sbi).bggc_mode != BGGC_MODE_OFF && !f2fs_readonly(sb)) {
/* After POR, we can run background GC thread.*/
err = f2fs_start_gc_thread(sbi);
if (err)
@@ -3734,6 +3751,8 @@ free_meta_inode:
make_bad_inode(sbi->meta_inode);
iput(sbi->meta_inode);
sbi->meta_inode = NULL;
+free_xattr_cache:
+ f2fs_destroy_xattr_caches(sbi);
free_io_dummy:
mempool_destroy(sbi->write_io_dummy);
free_percpu:
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 91d649790b1b..e3bbbef9b4f0 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -109,47 +109,47 @@ static ssize_t features_show(struct f2fs_attr *a,
return sprintf(buf, "0\n");
if (f2fs_sb_has_encrypt(sbi))
- len += snprintf(buf, PAGE_SIZE - len, "%s",
+ len += scnprintf(buf, PAGE_SIZE - len, "%s",
"encryption");
if (f2fs_sb_has_blkzoned(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "blkzoned");
if (f2fs_sb_has_extra_attr(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "extra_attr");
if (f2fs_sb_has_project_quota(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "projquota");
if (f2fs_sb_has_inode_chksum(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "inode_checksum");
if (f2fs_sb_has_flexible_inline_xattr(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "flexible_inline_xattr");
if (f2fs_sb_has_quota_ino(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "quota_ino");
if (f2fs_sb_has_inode_crtime(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "inode_crtime");
if (f2fs_sb_has_lost_found(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "lost_found");
if (f2fs_sb_has_verity(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "verity");
if (f2fs_sb_has_sb_chksum(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "sb_checksum");
if (f2fs_sb_has_casefold(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "casefold");
if (f2fs_sb_has_compression(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "compression");
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "pin_file");
- len += snprintf(buf + len, PAGE_SIZE - len, "\n");
+ len += scnprintf(buf + len, PAGE_SIZE - len, "\n");
return len;
}
@@ -185,6 +185,12 @@ static ssize_t encoding_show(struct f2fs_attr *a,
return sprintf(buf, "(none)");
}
+static ssize_t mounted_time_sec_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ return sprintf(buf, "%llu", SIT_I(sbi)->mounted_time);
+}
+
#ifdef CONFIG_F2FS_STAT_FS
static ssize_t moved_blocks_foreground_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
@@ -233,16 +239,16 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
int hot_count = sbi->raw_super->hot_ext_count;
int len = 0, i;
- len += snprintf(buf + len, PAGE_SIZE - len,
+ len += scnprintf(buf + len, PAGE_SIZE - len,
"cold file extension:\n");
for (i = 0; i < cold_count; i++)
- len += snprintf(buf + len, PAGE_SIZE - len, "%s\n",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s\n",
extlist[i]);
- len += snprintf(buf + len, PAGE_SIZE - len,
+ len += scnprintf(buf + len, PAGE_SIZE - len,
"hot file extension:\n");
for (i = cold_count; i < cold_count + hot_count; i++)
- len += snprintf(buf + len, PAGE_SIZE - len, "%s\n",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s\n",
extlist[i]);
return len;
}
@@ -544,6 +550,7 @@ F2FS_GENERAL_RO_ATTR(features);
F2FS_GENERAL_RO_ATTR(current_reserved_blocks);
F2FS_GENERAL_RO_ATTR(unusable);
F2FS_GENERAL_RO_ATTR(encoding);
+F2FS_GENERAL_RO_ATTR(mounted_time_sec);
#ifdef CONFIG_F2FS_STAT_FS
F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_foreground_calls, cp_count);
F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_background_calls, bg_cp_count);
@@ -573,7 +580,9 @@ F2FS_FEATURE_RO_ATTR(verity, FEAT_VERITY);
#endif
F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM);
F2FS_FEATURE_RO_ATTR(casefold, FEAT_CASEFOLD);
+#ifdef CONFIG_F2FS_FS_COMPRESSION
F2FS_FEATURE_RO_ATTR(compression, FEAT_COMPRESSION);
+#endif
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
static struct attribute *f2fs_attrs[] = {
@@ -621,6 +630,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(reserved_blocks),
ATTR_LIST(current_reserved_blocks),
ATTR_LIST(encoding),
+ ATTR_LIST(mounted_time_sec),
#ifdef CONFIG_F2FS_STAT_FS
ATTR_LIST(cp_foreground_calls),
ATTR_LIST(cp_background_calls),
@@ -654,7 +664,9 @@ static struct attribute *f2fs_feat_attrs[] = {
#endif
ATTR_LIST(sb_checksum),
ATTR_LIST(casefold),
+#ifdef CONFIG_F2FS_FS_COMPRESSION
ATTR_LIST(compression),
+#endif
NULL,
};
ATTRIBUTE_GROUPS(f2fs_feat);
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 296b3189448a..4f6582ef7ee3 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -23,6 +23,25 @@
#include "xattr.h"
#include "segment.h"
+static void *xattr_alloc(struct f2fs_sb_info *sbi, int size, bool *is_inline)
+{
+ if (likely(size == sbi->inline_xattr_slab_size)) {
+ *is_inline = true;
+ return kmem_cache_zalloc(sbi->inline_xattr_slab, GFP_NOFS);
+ }
+ *is_inline = false;
+ return f2fs_kzalloc(sbi, size, GFP_NOFS);
+}
+
+static void xattr_free(struct f2fs_sb_info *sbi, void *xattr_addr,
+ bool is_inline)
+{
+ if (is_inline)
+ kmem_cache_free(sbi->inline_xattr_slab, xattr_addr);
+ else
+ kvfree(xattr_addr);
+}
+
static int f2fs_xattr_generic_get(const struct xattr_handler *handler,
struct dentry *unused, struct inode *inode,
const char *name, void *buffer, size_t size)
@@ -301,7 +320,8 @@ static int read_xattr_block(struct inode *inode, void *txattr_addr)
static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
unsigned int index, unsigned int len,
const char *name, struct f2fs_xattr_entry **xe,
- void **base_addr, int *base_size)
+ void **base_addr, int *base_size,
+ bool *is_inline)
{
void *cur_addr, *txattr_addr, *last_txattr_addr;
void *last_addr = NULL;
@@ -312,12 +332,12 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
if (!xnid && !inline_size)
return -ENODATA;
- *base_size = XATTR_SIZE(xnid, inode) + XATTR_PADDING_SIZE;
- txattr_addr = f2fs_kzalloc(F2FS_I_SB(inode), *base_size, GFP_NOFS);
+ *base_size = XATTR_SIZE(inode) + XATTR_PADDING_SIZE;
+ txattr_addr = xattr_alloc(F2FS_I_SB(inode), *base_size, is_inline);
if (!txattr_addr)
return -ENOMEM;
- last_txattr_addr = (void *)txattr_addr + XATTR_SIZE(xnid, inode);
+ last_txattr_addr = (void *)txattr_addr + XATTR_SIZE(inode);
/* read from inline xattr */
if (inline_size) {
@@ -362,7 +382,7 @@ check:
*base_addr = txattr_addr;
return 0;
out:
- kvfree(txattr_addr);
+ xattr_free(F2FS_I_SB(inode), txattr_addr, *is_inline);
return err;
}
@@ -499,6 +519,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
unsigned int size, len;
void *base_addr = NULL;
int base_size;
+ bool is_inline;
if (name == NULL)
return -EINVAL;
@@ -509,7 +530,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
down_read(&F2FS_I(inode)->i_xattr_sem);
error = lookup_all_xattrs(inode, ipage, index, len, name,
- &entry, &base_addr, &base_size);
+ &entry, &base_addr, &base_size, &is_inline);
up_read(&F2FS_I(inode)->i_xattr_sem);
if (error)
return error;
@@ -532,14 +553,13 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
}
error = size;
out:
- kvfree(base_addr);
+ xattr_free(F2FS_I_SB(inode), base_addr, is_inline);
return error;
}
ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
{
struct inode *inode = d_inode(dentry);
- nid_t xnid = F2FS_I(inode)->i_xattr_nid;
struct f2fs_xattr_entry *entry;
void *base_addr, *last_base_addr;
int error = 0;
@@ -551,7 +571,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
if (error)
return error;
- last_base_addr = (void *)base_addr + XATTR_SIZE(xnid, inode);
+ last_base_addr = (void *)base_addr + XATTR_SIZE(inode);
list_for_each_xattr(entry, base_addr) {
const struct xattr_handler *handler =
@@ -609,7 +629,6 @@ static int __f2fs_setxattr(struct inode *inode, int index,
{
struct f2fs_xattr_entry *here, *last;
void *base_addr, *last_base_addr;
- nid_t xnid = F2FS_I(inode)->i_xattr_nid;
int found, newsize;
size_t len;
__u32 new_hsize;
@@ -633,7 +652,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
if (error)
return error;
- last_base_addr = (void *)base_addr + XATTR_SIZE(xnid, inode);
+ last_base_addr = (void *)base_addr + XATTR_SIZE(inode);
/* find entry with wanted name. */
here = __find_xattr(base_addr, last_base_addr, index, len, name);
@@ -758,14 +777,34 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
f2fs_balance_fs(sbi, true);
f2fs_lock_op(sbi);
- /* protect xattr_ver */
- down_write(&F2FS_I(inode)->i_sem);
down_write(&F2FS_I(inode)->i_xattr_sem);
err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags);
up_write(&F2FS_I(inode)->i_xattr_sem);
- up_write(&F2FS_I(inode)->i_sem);
f2fs_unlock_op(sbi);
f2fs_update_time(sbi, REQ_TIME);
return err;
}
+
+int f2fs_init_xattr_caches(struct f2fs_sb_info *sbi)
+{
+ dev_t dev = sbi->sb->s_bdev->bd_dev;
+ char slab_name[32];
+
+ sprintf(slab_name, "f2fs_xattr_entry-%u:%u", MAJOR(dev), MINOR(dev));
+
+ sbi->inline_xattr_slab_size = F2FS_OPTION(sbi).inline_xattr_size *
+ sizeof(__le32) + XATTR_PADDING_SIZE;
+
+ sbi->inline_xattr_slab = f2fs_kmem_cache_create(slab_name,
+ sbi->inline_xattr_slab_size);
+ if (!sbi->inline_xattr_slab)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void f2fs_destroy_xattr_caches(struct f2fs_sb_info *sbi)
+{
+ kmem_cache_destroy(sbi->inline_xattr_slab);
+}
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index de0c600b9cab..938fcd20565d 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -49,7 +49,7 @@ struct f2fs_xattr_entry {
__u8 e_name_index;
__u8 e_name_len;
__le16 e_value_size; /* size of attribute value */
- char e_name[0]; /* attribute name */
+ char e_name[]; /* attribute name */
};
#define XATTR_HDR(ptr) ((struct f2fs_xattr_header *)(ptr))
@@ -73,7 +73,8 @@ struct f2fs_xattr_entry {
entry = XATTR_NEXT_ENTRY(entry))
#define VALID_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer))
#define XATTR_PADDING_SIZE (sizeof(__u32))
-#define XATTR_SIZE(x,i) (((x) ? VALID_XATTR_BLOCK_SIZE : 0) + \
+#define XATTR_SIZE(i) ((F2FS_I(i)->i_xattr_nid ? \
+ VALID_XATTR_BLOCK_SIZE : 0) + \
(inline_xattr_size(i)))
#define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + \
VALID_XATTR_BLOCK_SIZE)
@@ -130,6 +131,8 @@ extern int f2fs_setxattr(struct inode *, int, const char *,
extern int f2fs_getxattr(struct inode *, int, const char *, void *,
size_t, struct page *);
extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t);
+extern int f2fs_init_xattr_caches(struct f2fs_sb_info *);
+extern void f2fs_destroy_xattr_caches(struct f2fs_sb_info *);
#else
#define f2fs_xattr_handlers NULL
@@ -150,6 +153,8 @@ static inline ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer,
{
return -EOPNOTSUPP;
}
+static inline int f2fs_init_xattr_caches(struct f2fs_sb_info *sbi) { return 0; }
+static inline void f2fs_destroy_xattr_caches(struct f2fs_sb_info *sbi) { }
#endif
#ifdef CONFIG_F2FS_FS_SECURITY
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 77bf5f95362d..90b8d879fbaf 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -272,7 +272,9 @@ struct file_system_type *get_fs_type(const char *name)
fs = __get_fs_type(name, len);
if (!fs && (request_module("fs-%.*s", len, name) == 0)) {
fs = __get_fs_type(name, len);
- WARN_ONCE(!fs, "request_module fs-%.*s succeeded, but still no fs?\n", len, name);
+ if (!fs)
+ pr_warn_once("request_module fs-%.*s succeeded, but still no fs?\n",
+ len, name);
}
if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) {
diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c
index e6d554476db4..eeebe80c6be4 100644
--- a/fs/hfsplus/attributes.c
+++ b/fs/hfsplus/attributes.c
@@ -292,6 +292,10 @@ static int __hfsplus_delete_attr(struct inode *inode, u32 cnid,
return -ENOENT;
}
+ /* Avoid btree corruption */
+ hfs_bnode_read(fd->bnode, fd->search_key,
+ fd->keyoffset, fd->keylength);
+
err = hfs_brec_remove(fd);
if (err)
return err;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index e6b8c49076bb..c070c0d8e3e9 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -139,8 +139,8 @@ static char *inode_name(struct inode *ino)
static char *follow_link(char *link)
{
- int len, n;
char *name, *resolved, *end;
+ int n;
name = __getname();
if (!name) {
@@ -164,15 +164,13 @@ static char *follow_link(char *link)
return name;
*(end + 1) = '\0';
- len = strlen(link) + strlen(name) + 1;
- resolved = kmalloc(len, GFP_KERNEL);
+ resolved = kasprintf(GFP_KERNEL, "%s%s", link, name);
if (resolved == NULL) {
n = -ENOMEM;
goto out_free;
}
- sprintf(resolved, "%s%s", link, name);
__putname(name);
kfree(link);
return resolved;
@@ -921,18 +919,16 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
sb->s_d_op = &simple_dentry_operations;
sb->s_maxbytes = MAX_LFS_FILESIZE;
- /* NULL is printed as <NULL> by sprintf: avoid that. */
+ /* NULL is printed as '(null)' by printf(): avoid that. */
if (req_root == NULL)
req_root = "";
err = -ENOMEM;
sb->s_fs_info = host_root_path =
- kmalloc(strlen(root_ino) + strlen(req_root) + 2, GFP_KERNEL);
+ kasprintf(GFP_KERNEL, "%s/%s", root_ino, req_root);
if (host_root_path == NULL)
goto out;
- sprintf(host_root_path, "%s/%s", root_ino, req_root);
-
root_inode = new_inode(sb);
if (!root_inode)
goto out;
diff --git a/fs/io-wq.c b/fs/io-wq.c
index cc5cf2209fb0..4023c9846860 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -17,6 +17,7 @@
#include <linux/kthread.h>
#include <linux/rculist_nulls.h>
#include <linux/fs_struct.h>
+#include <linux/task_work.h>
#include "io-wq.h"
@@ -716,6 +717,9 @@ static int io_wq_manager(void *data)
complete(&wq->done);
while (!kthread_should_stop()) {
+ if (current->task_works)
+ task_work_run();
+
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
bool fork_worker[2] = { false, false };
@@ -738,6 +742,9 @@ static int io_wq_manager(void *data)
schedule_timeout(HZ);
}
+ if (current->task_works)
+ task_work_run();
+
return 0;
err:
set_bit(IO_WQ_BIT_ERROR, &wq->state);
@@ -1124,3 +1131,8 @@ void io_wq_destroy(struct io_wq *wq)
if (refcount_dec_and_test(&wq->use_refs))
__io_wq_destroy(wq);
}
+
+struct task_struct *io_wq_get_task(struct io_wq *wq)
+{
+ return wq->manager;
+}
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 3ee7356d6be5..5ba12de7572f 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -136,6 +136,8 @@ typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
void *data);
+struct task_struct *io_wq_get_task(struct io_wq *wq);
+
#if defined(CONFIG_IO_WQ)
extern void io_wq_worker_sleeping(struct task_struct *);
extern void io_wq_worker_running(struct task_struct *);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 358f97be9c7b..5190bfb6a665 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -186,14 +186,23 @@ struct fixed_file_table {
struct file **files;
};
+struct fixed_file_ref_node {
+ struct percpu_ref refs;
+ struct list_head node;
+ struct list_head file_list;
+ struct fixed_file_data *file_data;
+ struct work_struct work;
+};
+
struct fixed_file_data {
struct fixed_file_table *table;
struct io_ring_ctx *ctx;
+ struct percpu_ref *cur_refs;
struct percpu_ref refs;
- struct llist_head put_llist;
- struct work_struct ref_work;
struct completion done;
+ struct list_head ref_list;
+ spinlock_t lock;
};
struct io_buffer {
@@ -317,6 +326,8 @@ struct io_ring_ctx {
spinlock_t inflight_lock;
struct list_head inflight_list;
} ____cacheline_aligned_in_smp;
+
+ struct work_struct exit_work;
};
/*
@@ -599,6 +610,7 @@ struct io_kiocb {
};
struct io_async_ctx *io;
+ int cflags;
bool needs_fixed_file;
u8 opcode;
@@ -606,10 +618,8 @@ struct io_kiocb {
struct list_head list;
unsigned int flags;
refcount_t refs;
- union {
- struct task_struct *task;
- unsigned long fsize;
- };
+ struct task_struct *task;
+ unsigned long fsize;
u64 user_data;
u32 result;
u32 sequence;
@@ -618,6 +628,8 @@ struct io_kiocb {
struct list_head inflight_entry;
+ struct percpu_ref *fixed_file_refs;
+
union {
/*
* Only commands that never go async can use the below fields,
@@ -629,7 +641,6 @@ struct io_kiocb {
struct callback_head task_work;
struct hlist_node hash_node;
struct async_poll *apoll;
- int cflags;
};
struct io_wq_work work;
};
@@ -848,7 +859,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
struct io_uring_files_update *ip,
unsigned nr_args);
static int io_grab_files(struct io_kiocb *req);
-static void io_ring_file_ref_flush(struct fixed_file_data *data);
static void io_cleanup_req(struct io_kiocb *req);
static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
int fd, struct file **out_file, bool fixed);
@@ -1285,8 +1295,8 @@ static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
return NULL;
}
-static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
- struct io_submit_state *state)
+static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
+ struct io_submit_state *state)
{
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
struct io_kiocb *req;
@@ -1319,41 +1329,20 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
req = state->reqs[state->free_reqs];
}
-got_it:
- req->io = NULL;
- req->file = NULL;
- req->ctx = ctx;
- req->flags = 0;
- /* one is dropped after submission, the other at completion */
- refcount_set(&req->refs, 2);
- req->result = 0;
- INIT_IO_WORK(&req->work, io_wq_submit_work);
return req;
fallback:
- req = io_get_fallback_req(ctx);
- if (req)
- goto got_it;
- percpu_ref_put(&ctx->refs);
- return NULL;
+ return io_get_fallback_req(ctx);
}
static inline void io_put_file(struct io_kiocb *req, struct file *file,
bool fixed)
{
if (fixed)
- percpu_ref_put(&req->ctx->file_data->refs);
+ percpu_ref_put(req->fixed_file_refs);
else
fput(file);
}
-static void __io_req_do_free(struct io_kiocb *req)
-{
- if (likely(!io_is_fallback_req(req)))
- kmem_cache_free(req_cachep, req);
- else
- clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req);
-}
-
static void __io_req_aux_free(struct io_kiocb *req)
{
if (req->flags & REQ_F_NEED_CLEANUP)
@@ -1362,6 +1351,8 @@ static void __io_req_aux_free(struct io_kiocb *req)
kfree(req->io);
if (req->file)
io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
+ if (req->task)
+ put_task_struct(req->task);
io_req_work_drop_env(req);
}
@@ -1382,7 +1373,10 @@ static void __io_free_req(struct io_kiocb *req)
}
percpu_ref_put(&req->ctx->refs);
- __io_req_do_free(req);
+ if (likely(!io_is_fallback_req(req)))
+ kmem_cache_free(req_cachep, req);
+ else
+ clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req);
}
struct req_batch {
@@ -1393,21 +1387,18 @@ struct req_batch {
static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
{
- int fixed_refs = rb->to_free;
-
if (!rb->to_free)
return;
if (rb->need_iter) {
int i, inflight = 0;
unsigned long flags;
- fixed_refs = 0;
for (i = 0; i < rb->to_free; i++) {
struct io_kiocb *req = rb->reqs[i];
if (req->flags & REQ_F_FIXED_FILE) {
req->file = NULL;
- fixed_refs++;
+ percpu_ref_put(req->fixed_file_refs);
}
if (req->flags & REQ_F_INFLIGHT)
inflight++;
@@ -1433,8 +1424,6 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
}
do_free:
kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
- if (fixed_refs)
- percpu_ref_put_many(&ctx->file_data->refs, fixed_refs);
percpu_ref_put_many(&ctx->refs, rb->to_free);
rb->to_free = rb->need_iter = 0;
}
@@ -1738,11 +1727,24 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
io_free_req_many(ctx, &rb);
}
+static void io_iopoll_queue(struct list_head *again)
+{
+ struct io_kiocb *req;
+
+ do {
+ req = list_first_entry(again, struct io_kiocb, list);
+ list_del(&req->list);
+ refcount_inc(&req->refs);
+ io_queue_async_work(req);
+ } while (!list_empty(again));
+}
+
static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
long min)
{
struct io_kiocb *req, *tmp;
LIST_HEAD(done);
+ LIST_HEAD(again);
bool spin;
int ret;
@@ -1757,9 +1759,9 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
struct kiocb *kiocb = &req->rw.kiocb;
/*
- * Move completed entries to our local list. If we find a
- * request that requires polling, break out and complete
- * the done list first, if we have entries there.
+ * Move completed and retryable entries to our local lists.
+ * If we find a request that requires polling, break out
+ * and complete those lists first, if we have entries there.
*/
if (req->flags & REQ_F_IOPOLL_COMPLETED) {
list_move_tail(&req->list, &done);
@@ -1768,6 +1770,13 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
if (!list_empty(&done))
break;
+ if (req->result == -EAGAIN) {
+ list_move_tail(&req->list, &again);
+ continue;
+ }
+ if (!list_empty(&again))
+ break;
+
ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
if (ret < 0)
break;
@@ -1780,6 +1789,9 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
if (!list_empty(&done))
io_iopoll_complete(ctx, nr_events, &done);
+ if (!list_empty(&again))
+ io_iopoll_queue(&again);
+
return ret;
}
@@ -2465,8 +2477,9 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
req->io->rw.iov = iovec;
if (!req->io->rw.iov) {
req->io->rw.iov = req->io->rw.fast_iov;
- memcpy(req->io->rw.iov, fast_iov,
- sizeof(struct iovec) * iter->nr_segs);
+ if (req->io->rw.iov != fast_iov)
+ memcpy(req->io->rw.iov, fast_iov,
+ sizeof(struct iovec) * iter->nr_segs);
} else {
req->flags |= REQ_F_NEED_CLEANUP;
}
@@ -2920,7 +2933,7 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
- if (sqe->flags & IOSQE_FIXED_FILE)
+ if (req->flags & REQ_F_FIXED_FILE)
return -EBADF;
if (req->flags & REQ_F_NEED_CLEANUP)
return 0;
@@ -2929,6 +2942,8 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
req->open.how.mode = READ_ONCE(sqe->len);
fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
req->open.how.flags = READ_ONCE(sqe->open_flags);
+ if (force_o_largefile())
+ req->open.how.flags |= O_LARGEFILE;
req->open.filename = getname(fname);
if (IS_ERR(req->open.filename)) {
@@ -2951,7 +2966,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
- if (sqe->flags & IOSQE_FIXED_FILE)
+ if (req->flags & REQ_F_FIXED_FILE)
return -EBADF;
if (req->flags & REQ_F_NEED_CLEANUP)
return 0;
@@ -3305,7 +3320,7 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
- if (sqe->flags & IOSQE_FIXED_FILE)
+ if (req->flags & REQ_F_FIXED_FILE)
return -EBADF;
if (req->flags & REQ_F_NEED_CLEANUP)
return 0;
@@ -3382,7 +3397,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
sqe->rw_flags || sqe->buf_index)
return -EINVAL;
- if (sqe->flags & IOSQE_FIXED_FILE)
+ if (req->flags & REQ_F_FIXED_FILE)
return -EBADF;
req->close.fd = READ_ONCE(sqe->fd);
@@ -3481,14 +3496,11 @@ static void __io_sync_file_range(struct io_kiocb *req)
static void io_sync_file_range_finish(struct io_wq_work **workptr)
{
struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
- struct io_kiocb *nxt = NULL;
if (io_req_cancelled(req))
return;
__io_sync_file_range(req);
io_put_req(req); /* put submission ref */
- if (nxt)
- io_wq_assign_next(workptr, nxt);
}
static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
@@ -4114,6 +4126,7 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
__poll_t mask, task_work_func_t func)
{
struct task_struct *tsk;
+ int ret;
/* for instances that support it check for an event match first: */
if (mask && !(mask & poll->events))
@@ -4127,11 +4140,15 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
req->result = mask;
init_task_work(&req->task_work, func);
/*
- * If this fails, then the task is exiting. If that is the case, then
- * the exit check will ultimately cancel these work items. Hence we
- * don't need to check here and handle it specifically.
+ * If this fails, then the task is exiting. Punt to one of the io-wq
+ * threads to ensure the work gets run, we can't always rely on exit
+ * cancelation taking care of this.
*/
- task_work_add(tsk, &req->task_work, true);
+ ret = task_work_add(tsk, &req->task_work, true);
+ if (unlikely(ret)) {
+ tsk = io_wq_get_task(req->ctx->io_wq);
+ task_work_add(tsk, &req->task_work, true);
+ }
wake_up_process(tsk);
return 1;
}
@@ -4251,10 +4268,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
req->flags |= REQ_F_POLLED;
memcpy(&apoll->work, &req->work, sizeof(req->work));
- /*
- * Don't need a reference here, as we're adding it to the task
- * task_works list. If the task exits, the list is pruned.
- */
+ get_task_struct(current);
req->task = current;
req->apoll = apoll;
INIT_HLIST_NODE(&req->hash_node);
@@ -4407,8 +4421,20 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
{
struct io_ring_ctx *ctx = req->ctx;
+ struct io_poll_iocb *poll = &req->poll;
+
+ if (!req->result && !READ_ONCE(poll->canceled)) {
+ struct poll_table_struct pt = { ._key = poll->events };
+
+ req->result = vfs_poll(req->file, &pt) & poll->events;
+ }
spin_lock_irq(&ctx->completion_lock);
+ if (!req->result && !READ_ONCE(poll->canceled)) {
+ add_wait_queue(poll->head, &poll->wait);
+ spin_unlock_irq(&ctx->completion_lock);
+ return;
+ }
hash_del(&req->hash_node);
io_poll_complete(req, req->result, 0);
req->flags |= REQ_F_COMP_LOCKED;
@@ -4465,10 +4491,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
events = READ_ONCE(sqe->poll_events);
poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
- /*
- * Don't need a reference here, as we're adding it to the task
- * task_works list. If the task exits, the list is pruned.
- */
+ get_task_struct(current);
req->task = current;
return 0;
}
@@ -5331,7 +5354,8 @@ static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
file = io_file_from_index(ctx, fd);
if (!file)
return -EBADF;
- percpu_ref_get(&ctx->file_data->refs);
+ req->fixed_file_refs = ctx->file_data->cur_refs;
+ percpu_ref_get(req->fixed_file_refs);
} else {
trace_io_uring_file_get(ctx, fd);
file = __io_file_get(state, fd);
@@ -5344,15 +5368,10 @@ static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
}
static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
+ int fd, unsigned int flags)
{
- unsigned flags;
- int fd;
bool fixed;
- flags = READ_ONCE(sqe->flags);
- fd = READ_ONCE(sqe->fd);
-
if (!io_req_needs_file(req, fd))
return 0;
@@ -5594,7 +5613,7 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
{
struct io_ring_ctx *ctx = req->ctx;
unsigned int sqe_flags;
- int ret, id;
+ int ret, id, fd;
sqe_flags = READ_ONCE(sqe->flags);
@@ -5625,7 +5644,8 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
IOSQE_ASYNC | IOSQE_FIXED_FILE |
IOSQE_BUFFER_SELECT);
- ret = io_req_set_file(state, req, sqe);
+ fd = READ_ONCE(sqe->fd);
+ ret = io_req_set_file(state, req, fd, sqe_flags);
if (unlikely(ret)) {
err_req:
io_cqring_add_event(req, ret);
@@ -5741,8 +5761,7 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
* used, it's important that those reads are done through READ_ONCE() to
* prevent a re-load down the line.
*/
-static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req,
- const struct io_uring_sqe **sqe_ptr)
+static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
{
u32 *sq_array = ctx->sq_array;
unsigned head;
@@ -5756,25 +5775,40 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req,
* though the application is the one updating it.
*/
head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]);
- if (likely(head < ctx->sq_entries)) {
- /*
- * All io need record the previous position, if LINK vs DARIN,
- * it can be used to mark the position of the first IO in the
- * link list.
- */
- req->sequence = ctx->cached_sq_head;
- *sqe_ptr = &ctx->sq_sqes[head];
- req->opcode = READ_ONCE((*sqe_ptr)->opcode);
- req->user_data = READ_ONCE((*sqe_ptr)->user_data);
- ctx->cached_sq_head++;
- return true;
- }
+ if (likely(head < ctx->sq_entries))
+ return &ctx->sq_sqes[head];
/* drop invalid entries */
- ctx->cached_sq_head++;
ctx->cached_sq_dropped++;
WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
- return false;
+ return NULL;
+}
+
+static inline void io_consume_sqe(struct io_ring_ctx *ctx)
+{
+ ctx->cached_sq_head++;
+}
+
+static void io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ /*
+ * All io need record the previous position, if LINK vs DARIN,
+ * it can be used to mark the position of the first IO in the
+ * link list.
+ */
+ req->sequence = ctx->cached_sq_head;
+ req->opcode = READ_ONCE(sqe->opcode);
+ req->user_data = READ_ONCE(sqe->user_data);
+ req->io = NULL;
+ req->file = NULL;
+ req->ctx = ctx;
+ req->flags = 0;
+ /* one is dropped after submission, the other at completion */
+ refcount_set(&req->refs, 2);
+ req->task = NULL;
+ req->result = 0;
+ INIT_IO_WORK(&req->work, io_wq_submit_work);
}
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
@@ -5812,17 +5846,20 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
struct io_kiocb *req;
int err;
- req = io_get_req(ctx, statep);
+ sqe = io_get_sqe(ctx);
+ if (unlikely(!sqe)) {
+ io_consume_sqe(ctx);
+ break;
+ }
+ req = io_alloc_req(ctx, statep);
if (unlikely(!req)) {
if (!submitted)
submitted = -EAGAIN;
break;
}
- if (!io_get_sqring(ctx, req, &sqe)) {
- __io_req_do_free(req);
- break;
- }
+ io_init_req(ctx, req, sqe);
+ io_consume_sqe(ctx);
/* will complete beyond this point, count as submitted */
submitted++;
@@ -5962,6 +5999,7 @@ static int io_sq_thread(void *data)
}
if (current->task_works) {
task_work_run();
+ finish_wait(&ctx->sqo_wait, &wait);
continue;
}
if (signal_pending(current))
@@ -6124,43 +6162,36 @@ static void io_file_ref_kill(struct percpu_ref *ref)
complete(&data->done);
}
-static void io_file_ref_exit_and_free(struct work_struct *work)
-{
- struct fixed_file_data *data;
-
- data = container_of(work, struct fixed_file_data, ref_work);
-
- /*
- * Ensure any percpu-ref atomic switch callback has run, it could have
- * been in progress when the files were being unregistered. Once
- * that's done, we can safely exit and free the ref and containing
- * data structure.
- */
- rcu_barrier();
- percpu_ref_exit(&data->refs);
- kfree(data);
-}
-
static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
struct fixed_file_data *data = ctx->file_data;
+ struct fixed_file_ref_node *ref_node = NULL;
unsigned nr_tables, i;
+ unsigned long flags;
if (!data)
return -ENXIO;
- percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill);
- flush_work(&data->ref_work);
+ spin_lock_irqsave(&data->lock, flags);
+ if (!list_empty(&data->ref_list))
+ ref_node = list_first_entry(&data->ref_list,
+ struct fixed_file_ref_node, node);
+ spin_unlock_irqrestore(&data->lock, flags);
+ if (ref_node)
+ percpu_ref_kill(&ref_node->refs);
+
+ percpu_ref_kill(&data->refs);
+
+ /* wait for all refs nodes to complete */
wait_for_completion(&data->done);
- io_ring_file_ref_flush(data);
__io_sqe_files_unregister(ctx);
nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
for (i = 0; i < nr_tables; i++)
kfree(data->table[i].files);
kfree(data->table);
- INIT_WORK(&data->ref_work, io_file_ref_exit_and_free);
- queue_work(system_wq, &data->ref_work);
+ percpu_ref_exit(&data->refs);
+ kfree(data);
ctx->file_data = NULL;
ctx->nr_user_files = 0;
return 0;
@@ -6204,13 +6235,6 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
struct sk_buff *skb;
int i, nr_files;
- if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
- unsigned long inflight = ctx->user->unix_inflight + nr;
-
- if (inflight > task_rlimit(current, RLIMIT_NOFILE))
- return -EMFILE;
- }
-
fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
if (!fpl)
return -ENOMEM;
@@ -6385,46 +6409,72 @@ static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
}
struct io_file_put {
- struct llist_node llist;
+ struct list_head list;
struct file *file;
};
-static void io_ring_file_ref_flush(struct fixed_file_data *data)
+static void io_file_put_work(struct work_struct *work)
{
+ struct fixed_file_ref_node *ref_node;
+ struct fixed_file_data *file_data;
+ struct io_ring_ctx *ctx;
struct io_file_put *pfile, *tmp;
- struct llist_node *node;
+ unsigned long flags;
- while ((node = llist_del_all(&data->put_llist)) != NULL) {
- llist_for_each_entry_safe(pfile, tmp, node, llist) {
- io_ring_file_put(data->ctx, pfile->file);
- kfree(pfile);
- }
+ ref_node = container_of(work, struct fixed_file_ref_node, work);
+ file_data = ref_node->file_data;
+ ctx = file_data->ctx;
+
+ list_for_each_entry_safe(pfile, tmp, &ref_node->file_list, list) {
+ list_del_init(&pfile->list);
+ io_ring_file_put(ctx, pfile->file);
+ kfree(pfile);
}
+
+ spin_lock_irqsave(&file_data->lock, flags);
+ list_del_init(&ref_node->node);
+ spin_unlock_irqrestore(&file_data->lock, flags);
+
+ percpu_ref_exit(&ref_node->refs);
+ kfree(ref_node);
+ percpu_ref_put(&file_data->refs);
}
-static void io_ring_file_ref_switch(struct work_struct *work)
+static void io_file_data_ref_zero(struct percpu_ref *ref)
{
- struct fixed_file_data *data;
+ struct fixed_file_ref_node *ref_node;
+
+ ref_node = container_of(ref, struct fixed_file_ref_node, refs);
- data = container_of(work, struct fixed_file_data, ref_work);
- io_ring_file_ref_flush(data);
- percpu_ref_switch_to_percpu(&data->refs);
+ queue_work(system_wq, &ref_node->work);
}
-static void io_file_data_ref_zero(struct percpu_ref *ref)
+static struct fixed_file_ref_node *alloc_fixed_file_ref_node(
+ struct io_ring_ctx *ctx)
{
- struct fixed_file_data *data;
+ struct fixed_file_ref_node *ref_node;
- data = container_of(ref, struct fixed_file_data, refs);
+ ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
+ if (!ref_node)
+ return ERR_PTR(-ENOMEM);
+
+ if (percpu_ref_init(&ref_node->refs, io_file_data_ref_zero,
+ 0, GFP_KERNEL)) {
+ kfree(ref_node);
+ return ERR_PTR(-ENOMEM);
+ }
+ INIT_LIST_HEAD(&ref_node->node);
+ INIT_LIST_HEAD(&ref_node->file_list);
+ INIT_WORK(&ref_node->work, io_file_put_work);
+ ref_node->file_data = ctx->file_data;
+ return ref_node;
- /*
- * We can't safely switch from inside this context, punt to wq. If
- * the table ref is going away, the table is being unregistered.
- * Don't queue up the async work for that case, the caller will
- * handle it.
- */
- if (!percpu_ref_is_dying(&data->refs))
- queue_work(system_wq, &data->ref_work);
+}
+
+static void destroy_fixed_file_ref_node(struct fixed_file_ref_node *ref_node)
+{
+ percpu_ref_exit(&ref_node->refs);
+ kfree(ref_node);
}
static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
@@ -6435,6 +6485,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
struct file *file;
int fd, ret = 0;
unsigned i;
+ struct fixed_file_ref_node *ref_node;
+ unsigned long flags;
if (ctx->file_data)
return -EBUSY;
@@ -6448,6 +6500,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
return -ENOMEM;
ctx->file_data->ctx = ctx;
init_completion(&ctx->file_data->done);
+ INIT_LIST_HEAD(&ctx->file_data->ref_list);
+ spin_lock_init(&ctx->file_data->lock);
nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
ctx->file_data->table = kcalloc(nr_tables,
@@ -6459,15 +6513,13 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
return -ENOMEM;
}
- if (percpu_ref_init(&ctx->file_data->refs, io_file_data_ref_zero,
+ if (percpu_ref_init(&ctx->file_data->refs, io_file_ref_kill,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
kfree(ctx->file_data->table);
kfree(ctx->file_data);
ctx->file_data = NULL;
return -ENOMEM;
}
- ctx->file_data->put_llist.first = NULL;
- INIT_WORK(&ctx->file_data->ref_work, io_ring_file_ref_switch);
if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
percpu_ref_exit(&ctx->file_data->refs);
@@ -6530,9 +6582,22 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
}
ret = io_sqe_files_scm(ctx);
- if (ret)
+ if (ret) {
io_sqe_files_unregister(ctx);
+ return ret;
+ }
+
+ ref_node = alloc_fixed_file_ref_node(ctx);
+ if (IS_ERR(ref_node)) {
+ io_sqe_files_unregister(ctx);
+ return PTR_ERR(ref_node);
+ }
+ ctx->file_data->cur_refs = &ref_node->refs;
+ spin_lock_irqsave(&ctx->file_data->lock, flags);
+ list_add(&ref_node->node, &ctx->file_data->ref_list);
+ spin_unlock_irqrestore(&ctx->file_data->lock, flags);
+ percpu_ref_get(&ctx->file_data->refs);
return ret;
}
@@ -6579,30 +6644,21 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
#endif
}
-static void io_atomic_switch(struct percpu_ref *ref)
-{
- struct fixed_file_data *data;
-
- /*
- * Juggle reference to ensure we hit zero, if needed, so we can
- * switch back to percpu mode
- */
- data = container_of(ref, struct fixed_file_data, refs);
- percpu_ref_put(&data->refs);
- percpu_ref_get(&data->refs);
-}
-
static int io_queue_file_removal(struct fixed_file_data *data,
- struct file *file)
+ struct file *file)
{
struct io_file_put *pfile;
+ struct percpu_ref *refs = data->cur_refs;
+ struct fixed_file_ref_node *ref_node;
pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
if (!pfile)
return -ENOMEM;
+ ref_node = container_of(refs, struct fixed_file_ref_node, refs);
pfile->file = file;
- llist_add(&pfile->llist, &data->put_llist);
+ list_add(&pfile->list, &ref_node->file_list);
+
return 0;
}
@@ -6611,17 +6667,23 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
unsigned nr_args)
{
struct fixed_file_data *data = ctx->file_data;
- bool ref_switch = false;
+ struct fixed_file_ref_node *ref_node;
struct file *file;
__s32 __user *fds;
int fd, i, err;
__u32 done;
+ unsigned long flags;
+ bool needs_switch = false;
if (check_add_overflow(up->offset, nr_args, &done))
return -EOVERFLOW;
if (done > ctx->nr_user_files)
return -EINVAL;
+ ref_node = alloc_fixed_file_ref_node(ctx);
+ if (IS_ERR(ref_node))
+ return PTR_ERR(ref_node);
+
done = 0;
fds = u64_to_user_ptr(up->fds);
while (nr_args) {
@@ -6642,7 +6704,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
if (err)
break;
table->files[index] = NULL;
- ref_switch = true;
+ needs_switch = true;
}
if (fd != -1) {
file = fget(fd);
@@ -6673,11 +6735,19 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
up->offset++;
}
- if (ref_switch)
- percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
+ if (needs_switch) {
+ percpu_ref_kill(data->cur_refs);
+ spin_lock_irqsave(&data->lock, flags);
+ list_add(&ref_node->node, &data->ref_list);
+ data->cur_refs = &ref_node->refs;
+ spin_unlock_irqrestore(&data->lock, flags);
+ percpu_ref_get(&ctx->file_data->refs);
+ } else
+ destroy_fixed_file_ref_node(ref_node);
return done ? done : err;
}
+
static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args)
{
@@ -7203,6 +7273,18 @@ static int io_remove_personalities(int id, void *p, void *data)
return 0;
}
+static void io_ring_exit_work(struct work_struct *work)
+{
+ struct io_ring_ctx *ctx;
+
+ ctx = container_of(work, struct io_ring_ctx, exit_work);
+ if (ctx->rings)
+ io_cqring_overflow_flush(ctx, true);
+
+ wait_for_completion(&ctx->completions[0]);
+ io_ring_ctx_free(ctx);
+}
+
static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
mutex_lock(&ctx->uring_lock);
@@ -7230,8 +7312,8 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
if (ctx->rings)
io_cqring_overflow_flush(ctx, true);
idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
- wait_for_completion(&ctx->completions[0]);
- io_ring_ctx_free(ctx);
+ INIT_WORK(&ctx->exit_work, io_ring_exit_work);
+ queue_work(system_wq, &ctx->exit_work);
}
static int io_uring_release(struct inode *inode, struct file *file)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index f080f542911b..89e21961d1ad 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -302,6 +302,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
if (!ctx->bio || !is_contig || bio_full(ctx->bio, plen)) {
gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
+ gfp_t orig_gfp = gfp;
int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;
if (ctx->bio)
@@ -310,6 +311,13 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
if (ctx->is_readahead) /* same as readahead_gfp_mask */
gfp |= __GFP_NORETRY | __GFP_NOWARN;
ctx->bio = bio_alloc(gfp, min(BIO_MAX_PAGES, nr_vecs));
+ /*
+ * If the bio_alloc fails, try it again for a single page to
+ * avoid having to deal with partial page reads. This emulates
+ * what do_mpage_readpage does.
+ */
+ if (!ctx->bio)
+ ctx->bio = bio_alloc(orig_gfp, 1);
ctx->bio->bi_opf = REQ_OP_READ;
if (ctx->is_readahead)
ctx->bio->bi_opf |= REQ_RAHEAD;
@@ -975,13 +983,6 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
return iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap);
}
-static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
- struct iomap *iomap)
-{
- return __dax_zero_page_range(iomap->bdev, iomap->dax_dev,
- iomap_sector(iomap, pos & PAGE_MASK), offset, bytes);
-}
-
static loff_t
iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
void *data, struct iomap *iomap, struct iomap *srcmap)
@@ -1001,7 +1002,7 @@ iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
bytes = min_t(loff_t, PAGE_SIZE - offset, count);
if (IS_DAX(inode))
- status = iomap_dax_zero(pos, offset, bytes, iomap);
+ status = dax_iomap_zero(pos, offset, bytes, iomap);
else
status = iomap_zero(inode, pos, offset, bytes, iomap,
srcmap);
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 690221747b47..d1a0e2c8b1b4 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -476,7 +476,7 @@ static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
err = ext_tree_remove(bl, true, 0, LLONG_MAX);
WARN_ON(err);
- kfree(bl);
+ kfree_rcu(bl, bl_layout.plh_rcu);
}
static struct pnfs_layout_hdr *__bl_alloc_layout_hdr(struct inode *inode,
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 549350259840..6a2033131c06 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -127,7 +127,9 @@ extern __be32 nfs4_callback_sequence(void *argp, void *resp,
#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX 9
#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12
#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15
-#define RCA4_TYPE_MASK_ALL 0xf31f
+#define PNFS_FF_RCA4_TYPE_MASK_READ 16
+#define PNFS_FF_RCA4_TYPE_MASK_RW 17
+#define RCA4_TYPE_MASK_ALL 0x3f31f
struct cb_recallanyargs {
uint32_t craa_objs_to_keep;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index cd4c6bc81cae..e61dbc9b86ae 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -121,31 +121,31 @@ out:
*/
static struct inode *nfs_layout_find_inode_by_stateid(struct nfs_client *clp,
const nfs4_stateid *stateid)
+ __must_hold(RCU)
{
struct nfs_server *server;
struct inode *inode;
struct pnfs_layout_hdr *lo;
+ rcu_read_lock();
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
- list_for_each_entry(lo, &server->layouts, plh_layouts) {
+ list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) {
+ if (!pnfs_layout_is_valid(lo))
+ continue;
if (stateid != NULL &&
!nfs4_stateid_match_other(stateid, &lo->plh_stateid))
continue;
+ if (!nfs_sb_active(server->super))
+ continue;
inode = igrab(lo->plh_inode);
- if (!inode)
- return ERR_PTR(-EAGAIN);
- if (!nfs_sb_active(inode->i_sb)) {
- rcu_read_unlock();
- spin_unlock(&clp->cl_lock);
- iput(inode);
- spin_lock(&clp->cl_lock);
- rcu_read_lock();
- return ERR_PTR(-EAGAIN);
- }
- return inode;
+ rcu_read_unlock();
+ if (inode)
+ return inode;
+ nfs_sb_deactive(server->super);
+ return ERR_PTR(-EAGAIN);
}
}
-
+ rcu_read_unlock();
return ERR_PTR(-ENOENT);
}
@@ -163,28 +163,25 @@ static struct inode *nfs_layout_find_inode_by_fh(struct nfs_client *clp,
struct inode *inode;
struct pnfs_layout_hdr *lo;
+ rcu_read_lock();
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
- list_for_each_entry(lo, &server->layouts, plh_layouts) {
+ list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) {
nfsi = NFS_I(lo->plh_inode);
if (nfs_compare_fh(fh, &nfsi->fh))
continue;
if (nfsi->layout != lo)
continue;
+ if (!nfs_sb_active(server->super))
+ continue;
inode = igrab(lo->plh_inode);
- if (!inode)
- return ERR_PTR(-EAGAIN);
- if (!nfs_sb_active(inode->i_sb)) {
- rcu_read_unlock();
- spin_unlock(&clp->cl_lock);
- iput(inode);
- spin_lock(&clp->cl_lock);
- rcu_read_lock();
- return ERR_PTR(-EAGAIN);
- }
- return inode;
+ rcu_read_unlock();
+ if (inode)
+ return inode;
+ nfs_sb_deactive(server->super);
+ return ERR_PTR(-EAGAIN);
}
}
-
+ rcu_read_unlock();
return ERR_PTR(-ENOENT);
}
@@ -194,14 +191,9 @@ static struct inode *nfs_layout_find_inode(struct nfs_client *clp,
{
struct inode *inode;
- spin_lock(&clp->cl_lock);
- rcu_read_lock();
inode = nfs_layout_find_inode_by_stateid(clp, stateid);
if (inode == ERR_PTR(-ENOENT))
inode = nfs_layout_find_inode_by_fh(clp, fh);
- rcu_read_unlock();
- spin_unlock(&clp->cl_lock);
-
return inode;
}
@@ -280,7 +272,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
goto unlock;
}
- pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
+ pnfs_set_layout_stateid(lo, &args->cbl_stateid, NULL, true);
switch (pnfs_mark_matching_lsegs_return(lo, &free_me_list,
&args->cbl_range,
be32_to_cpu(args->cbl_stateid.seqid))) {
@@ -605,6 +597,7 @@ __be32 nfs4_callback_recallany(void *argp, void *resp,
struct cb_recallanyargs *args = argp;
__be32 status;
fmode_t flags = 0;
+ bool schedule_manager = false;
status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
if (!cps->clp) /* set in cb_sequence */
@@ -627,6 +620,18 @@ __be32 nfs4_callback_recallany(void *argp, void *resp,
if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_FILE_LAYOUT))
pnfs_recall_all_layouts(cps->clp);
+
+ if (args->craa_type_mask & BIT(PNFS_FF_RCA4_TYPE_MASK_READ)) {
+ set_bit(NFS4CLNT_RECALL_ANY_LAYOUT_READ, &cps->clp->cl_state);
+ schedule_manager = true;
+ }
+ if (args->craa_type_mask & BIT(PNFS_FF_RCA4_TYPE_MASK_RW)) {
+ set_bit(NFS4CLNT_RECALL_ANY_LAYOUT_RW, &cps->clp->cl_state);
+ schedule_manager = true;
+ }
+ if (schedule_manager)
+ nfs4_schedule_state_manager(cps->clp);
+
out:
dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
return status;
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 1865322de142..816e1427f17e 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -378,6 +378,18 @@ nfs_inode_detach_delegation(struct inode *inode)
}
static void
+nfs_update_delegation_cred(struct nfs_delegation *delegation,
+ const struct cred *cred)
+{
+ const struct cred *old;
+
+ if (cred_fscmp(delegation->cred, cred) != 0) {
+ old = xchg(&delegation->cred, get_cred(cred));
+ put_cred(old);
+ }
+}
+
+static void
nfs_update_inplace_delegation(struct nfs_delegation *delegation,
const struct nfs_delegation *update)
{
@@ -385,8 +397,14 @@ nfs_update_inplace_delegation(struct nfs_delegation *delegation,
delegation->stateid.seqid = update->stateid.seqid;
smp_wmb();
delegation->type = update->type;
- if (test_and_clear_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
+ delegation->pagemod_limit = update->pagemod_limit;
+ if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) {
+ delegation->change_attr = update->change_attr;
+ nfs_update_delegation_cred(delegation, update->cred);
+ /* smp_mb__before_atomic() is implicit due to xchg() */
+ clear_bit(NFS_DELEGATION_REVOKED, &delegation->flags);
atomic_long_inc(&nfs_active_delegations);
+ }
}
}
@@ -545,21 +563,11 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation)
return ret;
}
-/**
- * nfs_client_return_marked_delegations - return previously marked delegations
- * @clp: nfs_client to process
- *
- * Note that this function is designed to be called by the state
- * manager thread. For this reason, it cannot flush the dirty data,
- * since that could deadlock in case of a state recovery error.
- *
- * Returns zero on success, or a negative errno value.
- */
-int nfs_client_return_marked_delegations(struct nfs_client *clp)
+static int nfs_server_return_marked_delegations(struct nfs_server *server,
+ void __always_unused *data)
{
struct nfs_delegation *delegation;
struct nfs_delegation *prev;
- struct nfs_server *server;
struct inode *inode;
struct inode *place_holder = NULL;
struct nfs_delegation *place_holder_deleg = NULL;
@@ -569,78 +577,79 @@ restart:
/*
* To avoid quadratic looping we hold a reference
* to an inode place_holder. Each time we restart, we
- * list nfs_servers from the server of that inode, and
- * delegation in the server from the delegations of that
- * inode.
+ * list delegation in the server from the delegations
+ * of that inode.
* prev is an RCU-protected pointer to a delegation which
* wasn't marked for return and might be a good choice for
* the next place_holder.
*/
- rcu_read_lock();
prev = NULL;
+ delegation = NULL;
+ rcu_read_lock();
if (place_holder)
- server = NFS_SERVER(place_holder);
- else
- server = list_entry_rcu(clp->cl_superblocks.next,
- struct nfs_server, client_link);
- list_for_each_entry_from_rcu(server, &clp->cl_superblocks, client_link) {
- delegation = NULL;
- if (place_holder && server == NFS_SERVER(place_holder))
- delegation = rcu_dereference(NFS_I(place_holder)->delegation);
- if (!delegation || delegation != place_holder_deleg)
- delegation = list_entry_rcu(server->delegations.next,
- struct nfs_delegation, super_list);
- list_for_each_entry_from_rcu(delegation, &server->delegations, super_list) {
- struct inode *to_put = NULL;
-
- if (!nfs_delegation_need_return(delegation)) {
+ delegation = rcu_dereference(NFS_I(place_holder)->delegation);
+ if (!delegation || delegation != place_holder_deleg)
+ delegation = list_entry_rcu(server->delegations.next,
+ struct nfs_delegation, super_list);
+ list_for_each_entry_from_rcu(delegation, &server->delegations, super_list) {
+ struct inode *to_put = NULL;
+
+ if (test_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags))
+ continue;
+ if (!nfs_delegation_need_return(delegation)) {
+ if (nfs4_is_valid_delegation(delegation, 0))
prev = delegation;
- continue;
- }
- if (!nfs_sb_active(server->super))
- break; /* continue in outer loop */
-
- if (prev) {
- struct inode *tmp;
-
- tmp = nfs_delegation_grab_inode(prev);
- if (tmp) {
- to_put = place_holder;
- place_holder = tmp;
- place_holder_deleg = prev;
- }
- }
+ continue;
+ }
- inode = nfs_delegation_grab_inode(delegation);
- if (inode == NULL) {
- rcu_read_unlock();
- if (to_put)
- iput(to_put);
- nfs_sb_deactive(server->super);
- goto restart;
+ if (prev) {
+ struct inode *tmp = nfs_delegation_grab_inode(prev);
+ if (tmp) {
+ to_put = place_holder;
+ place_holder = tmp;
+ place_holder_deleg = prev;
}
- delegation = nfs_start_delegation_return_locked(NFS_I(inode));
+ }
+
+ inode = nfs_delegation_grab_inode(delegation);
+ if (inode == NULL) {
rcu_read_unlock();
+ iput(to_put);
+ goto restart;
+ }
+ delegation = nfs_start_delegation_return_locked(NFS_I(inode));
+ rcu_read_unlock();
- if (to_put)
- iput(to_put);
+ iput(to_put);
- err = nfs_end_delegation_return(inode, delegation, 0);
- iput(inode);
- nfs_sb_deactive(server->super);
- cond_resched();
- if (!err)
- goto restart;
- set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
- if (place_holder)
- iput(place_holder);
- return err;
- }
+ err = nfs_end_delegation_return(inode, delegation, 0);
+ iput(inode);
+ cond_resched();
+ if (!err)
+ goto restart;
+ set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);
+ goto out;
}
rcu_read_unlock();
- if (place_holder)
- iput(place_holder);
- return 0;
+out:
+ iput(place_holder);
+ return err;
+}
+
+/**
+ * nfs_client_return_marked_delegations - return previously marked delegations
+ * @clp: nfs_client to process
+ *
+ * Note that this function is designed to be called by the state
+ * manager thread. For this reason, it cannot flush the dirty data,
+ * since that could deadlock in case of a state recovery error.
+ *
+ * Returns zero on success, or a negative errno value.
+ */
+int nfs_client_return_marked_delegations(struct nfs_client *clp)
+{
+ return nfs_client_for_each_server(clp,
+ nfs_server_return_marked_delegations, NULL);
}
/**
@@ -1083,53 +1092,51 @@ void nfs_delegation_mark_reclaim(struct nfs_client *clp)
rcu_read_unlock();
}
-/**
- * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done
- * @clp: nfs_client to process
- *
- */
-void nfs_delegation_reap_unclaimed(struct nfs_client *clp)
+static int nfs_server_reap_unclaimed_delegations(struct nfs_server *server,
+ void __always_unused *data)
{
struct nfs_delegation *delegation;
- struct nfs_server *server;
struct inode *inode;
-
restart:
rcu_read_lock();
- list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
- list_for_each_entry_rcu(delegation, &server->delegations,
- super_list) {
- if (test_bit(NFS_DELEGATION_INODE_FREEING,
- &delegation->flags) ||
- test_bit(NFS_DELEGATION_RETURNING,
- &delegation->flags) ||
- test_bit(NFS_DELEGATION_NEED_RECLAIM,
- &delegation->flags) == 0)
- continue;
- if (!nfs_sb_active(server->super))
- break; /* continue in outer loop */
- inode = nfs_delegation_grab_inode(delegation);
- if (inode == NULL) {
- rcu_read_unlock();
- nfs_sb_deactive(server->super);
- goto restart;
- }
- delegation = nfs_start_delegation_return_locked(NFS_I(inode));
- rcu_read_unlock();
- if (delegation != NULL) {
- if (nfs_detach_delegation(NFS_I(inode), delegation,
- server) != NULL)
- nfs_free_delegation(delegation);
- /* Match nfs_start_delegation_return_locked */
- nfs_put_delegation(delegation);
- }
- iput(inode);
- nfs_sb_deactive(server->super);
- cond_resched();
- goto restart;
+restart_locked:
+ list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
+ if (test_bit(NFS_DELEGATION_INODE_FREEING,
+ &delegation->flags) ||
+ test_bit(NFS_DELEGATION_RETURNING,
+ &delegation->flags) ||
+ test_bit(NFS_DELEGATION_NEED_RECLAIM,
+ &delegation->flags) == 0)
+ continue;
+ inode = nfs_delegation_grab_inode(delegation);
+ if (inode == NULL)
+ goto restart_locked;
+ delegation = nfs_start_delegation_return_locked(NFS_I(inode));
+ rcu_read_unlock();
+ if (delegation != NULL) {
+ if (nfs_detach_delegation(NFS_I(inode), delegation,
+ server) != NULL)
+ nfs_free_delegation(delegation);
+ /* Match nfs_start_delegation_return_locked */
+ nfs_put_delegation(delegation);
}
+ iput(inode);
+ cond_resched();
+ goto restart;
}
rcu_read_unlock();
+ return 0;
+}
+
+/**
+ * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done
+ * @clp: nfs_client to process
+ *
+ */
+void nfs_delegation_reap_unclaimed(struct nfs_client *clp)
+{
+ nfs_client_for_each_server(clp, nfs_server_reap_unclaimed_delegations,
+ NULL);
}
static inline bool nfs4_server_rebooted(const struct nfs_client *clp)
@@ -1215,62 +1222,61 @@ nfs_delegation_test_free_expired(struct inode *inode,
nfs_remove_bad_delegation(inode, stateid);
}
-/**
- * nfs_reap_expired_delegations - reap expired delegations
- * @clp: nfs_client to process
- *
- * Iterates through all the delegations associated with this server and
- * checks if they have may have been revoked. This function is usually
- * expected to be called in cases where the server may have lost its
- * lease.
- */
-void nfs_reap_expired_delegations(struct nfs_client *clp)
+static int nfs_server_reap_expired_delegations(struct nfs_server *server,
+ void __always_unused *data)
{
struct nfs_delegation *delegation;
- struct nfs_server *server;
struct inode *inode;
const struct cred *cred;
nfs4_stateid stateid;
-
restart:
rcu_read_lock();
- list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
- list_for_each_entry_rcu(delegation, &server->delegations,
- super_list) {
- if (test_bit(NFS_DELEGATION_INODE_FREEING,
- &delegation->flags) ||
- test_bit(NFS_DELEGATION_RETURNING,
- &delegation->flags) ||
- test_bit(NFS_DELEGATION_TEST_EXPIRED,
- &delegation->flags) == 0)
- continue;
- if (!nfs_sb_active(server->super))
- break; /* continue in outer loop */
- inode = nfs_delegation_grab_inode(delegation);
- if (inode == NULL) {
- rcu_read_unlock();
- nfs_sb_deactive(server->super);
- goto restart;
- }
- cred = get_cred_rcu(delegation->cred);
- nfs4_stateid_copy(&stateid, &delegation->stateid);
- clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags);
- rcu_read_unlock();
- nfs_delegation_test_free_expired(inode, &stateid, cred);
- put_cred(cred);
- if (nfs4_server_rebooted(clp)) {
- nfs_inode_mark_test_expired_delegation(server,inode);
- iput(inode);
- nfs_sb_deactive(server->super);
- return;
- }
+restart_locked:
+ list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
+ if (test_bit(NFS_DELEGATION_INODE_FREEING,
+ &delegation->flags) ||
+ test_bit(NFS_DELEGATION_RETURNING,
+ &delegation->flags) ||
+ test_bit(NFS_DELEGATION_TEST_EXPIRED,
+ &delegation->flags) == 0)
+ continue;
+ inode = nfs_delegation_grab_inode(delegation);
+ if (inode == NULL)
+ goto restart_locked;
+ spin_lock(&delegation->lock);
+ cred = get_cred_rcu(delegation->cred);
+ nfs4_stateid_copy(&stateid, &delegation->stateid);
+ spin_unlock(&delegation->lock);
+ clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags);
+ rcu_read_unlock();
+ nfs_delegation_test_free_expired(inode, &stateid, cred);
+ put_cred(cred);
+ if (!nfs4_server_rebooted(server->nfs_client)) {
iput(inode);
- nfs_sb_deactive(server->super);
cond_resched();
goto restart;
}
+ nfs_inode_mark_test_expired_delegation(server,inode);
+ iput(inode);
+ return -EAGAIN;
}
rcu_read_unlock();
+ return 0;
+}
+
+/**
+ * nfs_reap_expired_delegations - reap expired delegations
+ * @clp: nfs_client to process
+ *
+ * Iterates through all the delegations associated with this server and
+ * checks if they have may have been revoked. This function is usually
+ * expected to be called in cases where the server may have lost its
+ * lease.
+ */
+void nfs_reap_expired_delegations(struct nfs_client *clp)
+{
+ nfs_client_for_each_server(clp, nfs_server_reap_expired_delegations,
+ NULL);
}
void nfs_inode_find_delegation_state_and_recover(struct inode *inode,
@@ -1359,11 +1365,14 @@ bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags,
{
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_delegation *delegation;
- bool ret;
+ bool ret = false;
flags &= FMODE_READ|FMODE_WRITE;
rcu_read_lock();
delegation = rcu_dereference(nfsi->delegation);
+ if (!delegation)
+ goto out;
+ spin_lock(&delegation->lock);
ret = nfs4_is_valid_delegation(delegation, flags);
if (ret) {
nfs4_stateid_copy(dst, &delegation->stateid);
@@ -1371,6 +1380,8 @@ bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags,
if (cred)
*cred = get_cred(delegation->cred);
}
+ spin_unlock(&delegation->lock);
+out:
rcu_read_unlock();
return ret;
}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index d4b839b6cf89..5a331da5f55a 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -141,10 +141,9 @@ struct nfs_cache_array {
int size;
int eof_index;
u64 last_cookie;
- struct nfs_cache_array_entry array[0];
+ struct nfs_cache_array_entry array[];
};
-typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, bool);
typedef struct {
struct file *file;
struct page *page;
@@ -153,7 +152,7 @@ typedef struct {
u64 *dir_cookie;
u64 last_cookie;
loff_t current_index;
- decode_dirent_t decode;
+ loff_t prev_index;
unsigned long dir_verifier;
unsigned long timestamp;
@@ -240,6 +239,25 @@ out:
return ret;
}
+static inline
+int is_32bit_api(void)
+{
+#ifdef CONFIG_COMPAT
+ return in_compat_syscall();
+#else
+ return (BITS_PER_LONG == 32);
+#endif
+}
+
+static
+bool nfs_readdir_use_cookie(const struct file *filp)
+{
+ if ((filp->f_mode & FMODE_32BITHASH) ||
+ (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+ return false;
+ return true;
+}
+
static
int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
{
@@ -289,7 +307,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
!nfs_readdir_inode_mapping_valid(nfsi)) {
ctx->duped = 0;
ctx->attr_gencount = nfsi->attr_gencount;
- } else if (new_pos < desc->ctx->pos) {
+ } else if (new_pos < desc->prev_index) {
if (ctx->duped > 0
&& ctx->dup_cookie == *desc->dir_cookie) {
if (printk_ratelimit()) {
@@ -305,7 +323,11 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
ctx->dup_cookie = *desc->dir_cookie;
ctx->duped = -1;
}
- desc->ctx->pos = new_pos;
+ if (nfs_readdir_use_cookie(desc->file))
+ desc->ctx->pos = *desc->dir_cookie;
+ else
+ desc->ctx->pos = new_pos;
+ desc->prev_index = new_pos;
desc->cache_entry_index = i;
return 0;
}
@@ -376,9 +398,10 @@ error:
static int xdr_decode(nfs_readdir_descriptor_t *desc,
struct nfs_entry *entry, struct xdr_stream *xdr)
{
+ struct inode *inode = file_inode(desc->file);
int error;
- error = desc->decode(xdr, entry, desc->plus);
+ error = NFS_PROTO(inode)->decode_dirent(xdr, entry, desc->plus);
if (error)
return error;
entry->fattr->time_start = desc->timestamp;
@@ -756,6 +779,7 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
if (desc->page_index == 0) {
desc->current_index = 0;
+ desc->prev_index = 0;
desc->last_cookie = 0;
}
do {
@@ -786,11 +810,14 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc)
desc->eof = true;
break;
}
- desc->ctx->pos++;
if (i < (array->size-1))
*desc->dir_cookie = array->array[i+1].cookie;
else
*desc->dir_cookie = array->last_cookie;
+ if (nfs_readdir_use_cookie(file))
+ desc->ctx->pos = *desc->dir_cookie;
+ else
+ desc->ctx->pos++;
if (ctx->duped != 0)
ctx->duped = 1;
}
@@ -860,9 +887,14 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
{
struct dentry *dentry = file_dentry(file);
struct inode *inode = d_inode(dentry);
- nfs_readdir_descriptor_t my_desc,
- *desc = &my_desc;
struct nfs_open_dir_context *dir_ctx = file->private_data;
+ nfs_readdir_descriptor_t my_desc = {
+ .file = file,
+ .ctx = ctx,
+ .dir_cookie = &dir_ctx->dir_cookie,
+ .plus = nfs_use_readdirplus(inode, ctx),
+ },
+ *desc = &my_desc;
int res = 0;
dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n",
@@ -875,14 +907,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
* to either find the entry with the appropriate number or
* revalidate the cookie.
*/
- memset(desc, 0, sizeof(*desc));
-
- desc->file = file;
- desc->ctx = ctx;
- desc->dir_cookie = &dir_ctx->dir_cookie;
- desc->decode = NFS_PROTO(inode)->decode_dirent;
- desc->plus = nfs_use_readdirplus(inode, ctx);
-
if (ctx->pos == 0 || nfs_attribute_cache_expired(inode))
res = nfs_revalidate_mapping(inode, file->f_mapping);
if (res < 0)
@@ -954,7 +978,10 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
}
if (offset != filp->f_pos) {
filp->f_pos = offset;
- dir_ctx->dir_cookie = 0;
+ if (nfs_readdir_use_cookie(filp))
+ dir_ctx->dir_cookie = offset;
+ else
+ dir_ctx->dir_cookie = 0;
dir_ctx->duped = 0;
}
inode_unlock(inode);
@@ -2282,7 +2309,7 @@ static DEFINE_SPINLOCK(nfs_access_lru_lock);
static LIST_HEAD(nfs_access_lru_list);
static atomic_long_t nfs_access_nr_entries;
-static unsigned long nfs_access_max_cachesize = ULONG_MAX;
+static unsigned long nfs_access_max_cachesize = 4*1024*1024;
module_param(nfs_access_max_cachesize, ulong, 0644);
MODULE_PARM_DESC(nfs_access_max_cachesize, "NFS access maximum total cache length");
@@ -2642,9 +2669,10 @@ static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask)
status = NFS_PROTO(inode)->access(inode, &cache);
if (status != 0) {
if (status == -ESTALE) {
- nfs_zap_caches(inode);
if (!S_ISDIR(inode->i_mode))
- set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
+ nfs_set_inode_stale(inode);
+ else
+ nfs_zap_caches(inode);
}
goto out;
}
@@ -2732,14 +2760,7 @@ force_lookup:
if (!NFS_PROTO(inode)->access)
goto out_notsup;
- /* Always try fast lookups first */
- rcu_read_lock();
- res = nfs_do_access(inode, cred, mask|MAY_NOT_BLOCK);
- rcu_read_unlock();
- if (res == -ECHILD && !(mask & MAY_NOT_BLOCK)) {
- /* Fast lookup failed, try the slow way */
- res = nfs_do_access(inode, cred, mask);
- }
+ res = nfs_do_access(inode, cred, mask);
out:
if (!res && (mask & MAY_EXEC))
res = nfs_execute_ok(inode, mask);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index b768a0b42e82..a57e7c72c7f4 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -94,7 +94,7 @@ struct nfs_direct_req {
#define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */
/* for read */
#define NFS_ODIRECT_SHOULD_DIRTY (3) /* dirty user-space page after read */
- struct nfs_writeverf verf; /* unstable write verifier */
+#define NFS_ODIRECT_DONE INT_MAX /* write verification failed */
};
static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops;
@@ -151,106 +151,6 @@ nfs_direct_count_bytes(struct nfs_direct_req *dreq,
dreq->count = dreq_len;
}
-/*
- * nfs_direct_select_verf - select the right verifier
- * @dreq - direct request possibly spanning multiple servers
- * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs
- * @commit_idx - commit bucket index for the DS
- *
- * returns the correct verifier to use given the role of the server
- */
-static struct nfs_writeverf *
-nfs_direct_select_verf(struct nfs_direct_req *dreq,
- struct nfs_client *ds_clp,
- int commit_idx)
-{
- struct nfs_writeverf *verfp = &dreq->verf;
-
-#ifdef CONFIG_NFS_V4_1
- /*
- * pNFS is in use, use the DS verf except commit_through_mds is set
- * for layout segment where nbuckets is zero.
- */
- if (ds_clp && dreq->ds_cinfo.nbuckets > 0) {
- if (commit_idx >= 0 && commit_idx < dreq->ds_cinfo.nbuckets)
- verfp = &dreq->ds_cinfo.buckets[commit_idx].direct_verf;
- else
- WARN_ON_ONCE(1);
- }
-#endif
- return verfp;
-}
-
-
-/*
- * nfs_direct_set_hdr_verf - set the write/commit verifier
- * @dreq - direct request possibly spanning multiple servers
- * @hdr - pageio header to validate against previously seen verfs
- *
- * Set the server's (MDS or DS) "seen" verifier
- */
-static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq,
- struct nfs_pgio_header *hdr)
-{
- struct nfs_writeverf *verfp;
-
- verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx);
- WARN_ON_ONCE(verfp->committed >= 0);
- memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
- WARN_ON_ONCE(verfp->committed < 0);
-}
-
-static int nfs_direct_cmp_verf(const struct nfs_writeverf *v1,
- const struct nfs_writeverf *v2)
-{
- return nfs_write_verifier_cmp(&v1->verifier, &v2->verifier);
-}
-
-/*
- * nfs_direct_cmp_hdr_verf - compare verifier for pgio header
- * @dreq - direct request possibly spanning multiple servers
- * @hdr - pageio header to validate against previously seen verf
- *
- * set the server's "seen" verf if not initialized.
- * returns result of comparison between @hdr->verf and the "seen"
- * verf of the server used by @hdr (DS or MDS)
- */
-static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
- struct nfs_pgio_header *hdr)
-{
- struct nfs_writeverf *verfp;
-
- verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx);
- if (verfp->committed < 0) {
- nfs_direct_set_hdr_verf(dreq, hdr);
- return 0;
- }
- return nfs_direct_cmp_verf(verfp, &hdr->verf);
-}
-
-/*
- * nfs_direct_cmp_commit_data_verf - compare verifier for commit data
- * @dreq - direct request possibly spanning multiple servers
- * @data - commit data to validate against previously seen verf
- *
- * returns result of comparison between @data->verf and the verf of
- * the server used by @data (DS or MDS)
- */
-static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
- struct nfs_commit_data *data)
-{
- struct nfs_writeverf *verfp;
-
- verfp = nfs_direct_select_verf(dreq, data->ds_clp,
- data->ds_commit_index);
-
- /* verifier not set so always fail */
- if (verfp->committed < 0 || data->res.verf->committed <= NFS_UNSTABLE)
- return 1;
-
- return nfs_direct_cmp_verf(verfp, data->res.verf);
-}
-
/**
* nfs_direct_IO - NFS address space operation for direct I/O
* @iocb: target I/O control block
@@ -305,7 +205,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
kref_get(&dreq->kref);
init_completion(&dreq->completion);
INIT_LIST_HEAD(&dreq->mds_cinfo.list);
- dreq->verf.committed = NFS_INVALID_STABLE_HOW; /* not set yet */
+ pnfs_init_ds_commit_info(&dreq->ds_cinfo);
INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
spin_lock_init(&dreq->lock);
@@ -316,7 +216,7 @@ static void nfs_direct_req_free(struct kref *kref)
{
struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
- nfs_free_pnfs_ds_cinfo(&dreq->ds_cinfo);
+ pnfs_release_ds_info(&dreq->ds_cinfo, dreq->inode);
if (dreq->l_ctx != NULL)
nfs_put_lock_context(dreq->l_ctx);
if (dreq->ctx != NULL)
@@ -571,6 +471,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
l_ctx = nfs_get_lock_context(dreq->ctx);
if (IS_ERR(l_ctx)) {
result = PTR_ERR(l_ctx);
+ nfs_direct_req_release(dreq);
goto out_release;
}
dreq->l_ctx = l_ctx;
@@ -605,15 +506,30 @@ out:
}
static void
+nfs_direct_join_group(struct list_head *list, struct inode *inode)
+{
+ struct nfs_page *req, *next;
+
+ list_for_each_entry(req, list, wb_list) {
+ if (req->wb_head != req || req->wb_this_page == req)
+ continue;
+ for (next = req->wb_this_page;
+ next != req->wb_head;
+ next = next->wb_this_page) {
+ nfs_list_remove_request(next);
+ nfs_release_request(next);
+ }
+ nfs_join_page_group(req, inode);
+ }
+}
+
+static void
nfs_direct_write_scan_commit_list(struct inode *inode,
struct list_head *list,
struct nfs_commit_info *cinfo)
{
mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
-#ifdef CONFIG_NFS_V4_1
- if (cinfo->ds != NULL && cinfo->ds->nwritten != 0)
- NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
-#endif
+ pnfs_recover_commit_reqs(list, cinfo);
nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0);
mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
}
@@ -629,11 +545,12 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
nfs_init_cinfo_from_dreq(&cinfo, dreq);
nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
+ nfs_direct_join_group(&reqs, dreq->inode);
+
dreq->count = 0;
dreq->max_count = 0;
list_for_each_entry(req, &reqs, wb_list)
dreq->max_count += req->wb_bytes;
- dreq->verf.committed = NFS_INVALID_STABLE_HOW;
nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo);
get_dreq(dreq);
@@ -670,27 +587,35 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
static void nfs_direct_commit_complete(struct nfs_commit_data *data)
{
+ const struct nfs_writeverf *verf = data->res.verf;
struct nfs_direct_req *dreq = data->dreq;
struct nfs_commit_info cinfo;
struct nfs_page *req;
int status = data->task.tk_status;
+ if (status < 0) {
+ /* Errors in commit are fatal */
+ dreq->error = status;
+ dreq->max_count = 0;
+ dreq->count = 0;
+ dreq->flags = NFS_ODIRECT_DONE;
+ } else if (dreq->flags == NFS_ODIRECT_DONE)
+ status = dreq->error;
+
nfs_init_cinfo_from_dreq(&cinfo, dreq);
- if (status < 0 || nfs_direct_cmp_commit_data_verf(dreq, data))
- dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
while (!list_empty(&data->pages)) {
req = nfs_list_entry(data->pages.next);
nfs_list_remove_request(req);
- if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) {
+ if (status >= 0 && !nfs_write_match_verf(verf, req)) {
+ dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
/*
* Despite the reboot, the write was successful,
* so reset wb_nio.
*/
req->wb_nio = 0;
- /* Note the rewrite will go through mds */
nfs_mark_request_commit(req, NULL, &cinfo, 0);
- } else
+ } else /* Error or match */
nfs_release_request(req);
nfs_unlock_and_release_request(req);
}
@@ -705,7 +630,8 @@ static void nfs_direct_resched_write(struct nfs_commit_info *cinfo,
struct nfs_direct_req *dreq = cinfo->dreq;
spin_lock(&dreq->lock);
- dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ if (dreq->flags != NFS_ODIRECT_DONE)
+ dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
spin_unlock(&dreq->lock);
nfs_mark_request_commit(req, NULL, cinfo, 0);
}
@@ -728,6 +654,23 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
nfs_direct_write_reschedule(dreq);
}
+static void nfs_direct_write_clear_reqs(struct nfs_direct_req *dreq)
+{
+ struct nfs_commit_info cinfo;
+ struct nfs_page *req;
+ LIST_HEAD(reqs);
+
+ nfs_init_cinfo_from_dreq(&cinfo, dreq);
+ nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
+
+ while (!list_empty(&reqs)) {
+ req = nfs_list_entry(reqs.next);
+ nfs_list_remove_request(req);
+ nfs_release_request(req);
+ nfs_unlock_and_release_request(req);
+ }
+}
+
static void nfs_direct_write_schedule_work(struct work_struct *work)
{
struct nfs_direct_req *dreq = container_of(work, struct nfs_direct_req, work);
@@ -742,6 +685,7 @@ static void nfs_direct_write_schedule_work(struct work_struct *work)
nfs_direct_write_reschedule(dreq);
break;
default:
+ nfs_direct_write_clear_reqs(dreq);
nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping);
nfs_direct_complete(dreq);
}
@@ -768,20 +712,15 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
}
nfs_direct_count_bytes(dreq, hdr);
- if (hdr->good_bytes != 0) {
- if (nfs_write_need_commit(hdr)) {
- if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
- request_commit = true;
- else if (dreq->flags == 0) {
- nfs_direct_set_hdr_verf(dreq, hdr);
- request_commit = true;
- dreq->flags = NFS_ODIRECT_DO_COMMIT;
- } else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) {
- request_commit = true;
- if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr))
- dreq->flags =
- NFS_ODIRECT_RESCHED_WRITES;
- }
+ if (hdr->good_bytes != 0 && nfs_write_need_commit(hdr)) {
+ switch (dreq->flags) {
+ case 0:
+ dreq->flags = NFS_ODIRECT_DO_COMMIT;
+ request_commit = true;
+ break;
+ case NFS_ODIRECT_RESCHED_WRITES:
+ case NFS_ODIRECT_DO_COMMIT:
+ request_commit = true;
}
}
spin_unlock(&dreq->lock);
@@ -990,11 +929,13 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
l_ctx = nfs_get_lock_context(dreq->ctx);
if (IS_ERR(l_ctx)) {
result = PTR_ERR(l_ctx);
+ nfs_direct_req_release(dreq);
goto out_release;
}
dreq->l_ctx = l_ctx;
if (!is_sync_kiocb(iocb))
dreq->iocb = iocb;
+ pnfs_init_ds_commit_info_ops(&dreq->ds_cinfo, inode);
nfs_start_io_direct(inode);
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index c9b605f6c9cb..a13e69009f19 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -49,6 +49,7 @@ MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>");
MODULE_DESCRIPTION("The NFSv4 file layout driver");
#define FILELAYOUT_POLL_RETRY_MAX (15*HZ)
+static const struct pnfs_commit_ops filelayout_commit_ops;
static loff_t
filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg,
@@ -750,72 +751,17 @@ filelayout_free_lseg(struct pnfs_layout_segment *lseg)
/* This assumes a single RW lseg */
if (lseg->pls_range.iomode == IOMODE_RW) {
struct nfs4_filelayout *flo;
+ struct inode *inode;
flo = FILELAYOUT_FROM_HDR(lseg->pls_layout);
- flo->commit_info.nbuckets = 0;
- kfree(flo->commit_info.buckets);
- flo->commit_info.buckets = NULL;
+ inode = flo->generic_hdr.plh_inode;
+ spin_lock(&inode->i_lock);
+ pnfs_generic_ds_cinfo_release_lseg(&flo->commit_info, lseg);
+ spin_unlock(&inode->i_lock);
}
_filelayout_free_lseg(fl);
}
-static int
-filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg,
- struct nfs_commit_info *cinfo,
- gfp_t gfp_flags)
-{
- struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
- struct pnfs_commit_bucket *buckets;
- int size, i;
-
- if (fl->commit_through_mds)
- return 0;
-
- size = (fl->stripe_type == STRIPE_SPARSE) ?
- fl->dsaddr->ds_num : fl->dsaddr->stripe_count;
-
- if (cinfo->ds->nbuckets >= size) {
- /* This assumes there is only one IOMODE_RW lseg. What
- * we really want to do is have a layout_hdr level
- * dictionary of <multipath_list4, fh> keys, each
- * associated with a struct list_head, populated by calls
- * to filelayout_write_pagelist().
- * */
- return 0;
- }
-
- buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket),
- gfp_flags);
- if (!buckets)
- return -ENOMEM;
- for (i = 0; i < size; i++) {
- INIT_LIST_HEAD(&buckets[i].written);
- INIT_LIST_HEAD(&buckets[i].committing);
- /* mark direct verifier as unset */
- buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW;
- }
-
- spin_lock(&cinfo->inode->i_lock);
- if (cinfo->ds->nbuckets >= size)
- goto out;
- for (i = 0; i < cinfo->ds->nbuckets; i++) {
- list_splice(&cinfo->ds->buckets[i].written,
- &buckets[i].written);
- list_splice(&cinfo->ds->buckets[i].committing,
- &buckets[i].committing);
- buckets[i].direct_verf.committed =
- cinfo->ds->buckets[i].direct_verf.committed;
- buckets[i].wlseg = cinfo->ds->buckets[i].wlseg;
- buckets[i].clseg = cinfo->ds->buckets[i].clseg;
- }
- swap(cinfo->ds->buckets, buckets);
- cinfo->ds->nbuckets = size;
-out:
- spin_unlock(&cinfo->inode->i_lock);
- kfree(buckets);
- return 0;
-}
-
static struct pnfs_layout_segment *
filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
struct nfs4_layoutget_res *lgr,
@@ -938,9 +884,6 @@ static void
filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
{
- struct nfs_commit_info cinfo;
- int status;
-
pnfs_generic_pg_check_layout(pgio);
if (!pgio->pg_lseg) {
pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode,
@@ -959,17 +902,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
/* If no lseg, fall back to write through mds */
if (pgio->pg_lseg == NULL)
- goto out_mds;
- nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq);
- status = filelayout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS);
- if (status < 0) {
- pnfs_put_lseg(pgio->pg_lseg);
- pgio->pg_lseg = NULL;
- goto out_mds;
- }
- return;
-out_mds:
- nfs_pageio_reset_write_mds(pgio);
+ nfs_pageio_reset_write_mds(pgio);
}
static const struct nfs_pageio_ops filelayout_pg_read_ops = {
@@ -1078,36 +1011,6 @@ out_err:
return -EAGAIN;
}
-/* filelayout_search_commit_reqs - Search lists in @cinfo for the head reqest
- * for @page
- * @cinfo - commit info for current inode
- * @page - page to search for matching head request
- *
- * Returns a the head request if one is found, otherwise returns NULL.
- */
-static struct nfs_page *
-filelayout_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page)
-{
- struct nfs_page *freq, *t;
- struct pnfs_commit_bucket *b;
- int i;
-
- /* Linearly search the commit lists for each bucket until a matching
- * request is found */
- for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
- list_for_each_entry_safe(freq, t, &b->written, wb_list) {
- if (freq->wb_page == page)
- return freq->wb_head;
- }
- list_for_each_entry_safe(freq, t, &b->committing, wb_list) {
- if (freq->wb_page == page)
- return freq->wb_head;
- }
- }
-
- return NULL;
-}
-
static int
filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
int how, struct nfs_commit_info *cinfo)
@@ -1140,13 +1043,17 @@ filelayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
struct nfs4_filelayout *flo;
flo = kzalloc(sizeof(*flo), gfp_flags);
- return flo != NULL ? &flo->generic_hdr : NULL;
+ if (flo == NULL)
+ return NULL;
+ pnfs_init_ds_commit_info(&flo->commit_info);
+ flo->commit_info.ops = &filelayout_commit_ops;
+ return &flo->generic_hdr;
}
static void
filelayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
- kfree(FILELAYOUT_FROM_HDR(lo));
+ kfree_rcu(FILELAYOUT_FROM_HDR(lo), generic_hdr.plh_rcu);
}
static struct pnfs_ds_commit_info *
@@ -1160,6 +1067,46 @@ filelayout_get_ds_info(struct inode *inode)
return &FILELAYOUT_FROM_HDR(layout)->commit_info;
}
+static void
+filelayout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
+ struct pnfs_layout_segment *lseg)
+{
+ struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
+ struct inode *inode = lseg->pls_layout->plh_inode;
+ struct pnfs_commit_array *array, *new;
+ unsigned int size = (fl->stripe_type == STRIPE_SPARSE) ?
+ fl->dsaddr->ds_num : fl->dsaddr->stripe_count;
+
+ new = pnfs_alloc_commit_array(size, GFP_NOIO);
+ if (new) {
+ spin_lock(&inode->i_lock);
+ array = pnfs_add_commit_array(fl_cinfo, new, lseg);
+ spin_unlock(&inode->i_lock);
+ if (array != new)
+ pnfs_free_commit_array(new);
+ }
+}
+
+static void
+filelayout_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
+ struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ pnfs_generic_ds_cinfo_destroy(fl_cinfo);
+ spin_unlock(&inode->i_lock);
+}
+
+static const struct pnfs_commit_ops filelayout_commit_ops = {
+ .setup_ds_info = filelayout_setup_ds_info,
+ .release_ds_info = filelayout_release_ds_info,
+ .mark_request_commit = filelayout_mark_request_commit,
+ .clear_request_commit = pnfs_generic_clear_request_commit,
+ .scan_commit_lists = pnfs_generic_scan_commit_lists,
+ .recover_commit_reqs = pnfs_generic_recover_commit_reqs,
+ .search_commit_reqs = pnfs_generic_search_commit_reqs,
+ .commit_pagelist = filelayout_commit_pagelist,
+};
+
static struct pnfs_layoutdriver_type filelayout_type = {
.id = LAYOUT_NFSV4_1_FILES,
.name = "LAYOUT_NFSV4_1_FILES",
@@ -1173,12 +1120,6 @@ static struct pnfs_layoutdriver_type filelayout_type = {
.pg_read_ops = &filelayout_pg_read_ops,
.pg_write_ops = &filelayout_pg_write_ops,
.get_ds_info = &filelayout_get_ds_info,
- .mark_request_commit = filelayout_mark_request_commit,
- .clear_request_commit = pnfs_generic_clear_request_commit,
- .scan_commit_lists = pnfs_generic_scan_commit_lists,
- .recover_commit_reqs = pnfs_generic_recover_commit_reqs,
- .search_commit_reqs = filelayout_search_commit_reqs,
- .commit_pagelist = filelayout_commit_pagelist,
.read_pagelist = filelayout_read_pagelist,
.write_pagelist = filelayout_write_pagelist,
.alloc_deviceid_node = filelayout_alloc_deviceid_node,
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index bb9148b83166..7d399f72ebbb 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -32,6 +32,7 @@
static unsigned short io_maxretrans;
+static const struct pnfs_commit_ops ff_layout_commit_ops;
static void ff_layout_read_record_layoutstats_done(struct rpc_task *task,
struct nfs_pgio_header *hdr);
static int ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
@@ -48,9 +49,11 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
ffl = kzalloc(sizeof(*ffl), gfp_flags);
if (ffl) {
+ pnfs_init_ds_commit_info(&ffl->commit_info);
INIT_LIST_HEAD(&ffl->error_list);
INIT_LIST_HEAD(&ffl->mirrors);
ffl->last_report_time = ktime_get();
+ ffl->commit_info.ops = &ff_layout_commit_ops;
return &ffl->generic_hdr;
} else
return NULL;
@@ -59,14 +62,14 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
static void
ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
+ struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(lo);
struct nfs4_ff_layout_ds_err *err, *n;
- list_for_each_entry_safe(err, n, &FF_LAYOUT_FROM_HDR(lo)->error_list,
- list) {
+ list_for_each_entry_safe(err, n, &ffl->error_list, list) {
list_del(&err->list);
kfree(err);
}
- kfree(FF_LAYOUT_FROM_HDR(lo));
+ kfree_rcu(ffl, generic_hdr.plh_rcu);
}
static int decode_pnfs_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
@@ -248,36 +251,10 @@ static void ff_layout_put_mirror(struct nfs4_ff_layout_mirror *mirror)
static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
{
- int i;
-
- if (fls->mirror_array) {
- for (i = 0; i < fls->mirror_array_cnt; i++) {
- /* normally mirror_ds is freed in
- * .free_deviceid_node but we still do it here
- * for .alloc_lseg error path */
- ff_layout_put_mirror(fls->mirror_array[i]);
- }
- kfree(fls->mirror_array);
- fls->mirror_array = NULL;
- }
-}
-
-static int ff_layout_check_layout(struct nfs4_layoutget_res *lgr)
-{
- int ret = 0;
+ u32 i;
- dprintk("--> %s\n", __func__);
-
- /* FIXME: remove this check when layout segment support is added */
- if (lgr->range.offset != 0 ||
- lgr->range.length != NFS4_MAX_UINT64) {
- dprintk("%s Only whole file layouts supported. Use MDS i/o\n",
- __func__);
- ret = -EINVAL;
- }
-
- dprintk("--> %s returns %d\n", __func__, ret);
- return ret;
+ for (i = 0; i < fls->mirror_array_cnt; i++)
+ ff_layout_put_mirror(fls->mirror_array[i]);
}
static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
@@ -289,6 +266,23 @@ static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
}
static bool
+ff_lseg_match_mirrors(struct pnfs_layout_segment *l1,
+ struct pnfs_layout_segment *l2)
+{
+ const struct nfs4_ff_layout_segment *fl1 = FF_LAYOUT_LSEG(l1);
+ const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l1);
+ u32 i;
+
+ if (fl1->mirror_array_cnt != fl2->mirror_array_cnt)
+ return false;
+ for (i = 0; i < fl1->mirror_array_cnt; i++) {
+ if (fl1->mirror_array[i] != fl2->mirror_array[i])
+ return false;
+ }
+ return true;
+}
+
+static bool
ff_lseg_range_is_after(const struct pnfs_layout_range *l1,
const struct pnfs_layout_range *l2)
{
@@ -323,6 +317,8 @@ ff_lseg_merge(struct pnfs_layout_segment *new,
new->pls_range.length);
if (new_end < old->pls_range.offset)
return false;
+ if (!ff_lseg_match_mirrors(new, old))
+ return false;
/* Mergeable: copy info from 'old' to 'new' */
if (new_end < old_end)
@@ -400,16 +396,13 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
goto out_err_free;
rc = -ENOMEM;
- fls = kzalloc(sizeof(*fls), gfp_flags);
+ fls = kzalloc(struct_size(fls, mirror_array, mirror_array_cnt),
+ gfp_flags);
if (!fls)
goto out_err_free;
fls->mirror_array_cnt = mirror_array_cnt;
fls->stripe_unit = stripe_unit;
- fls->mirror_array = kcalloc(fls->mirror_array_cnt,
- sizeof(fls->mirror_array[0]), gfp_flags);
- if (fls->mirror_array == NULL)
- goto out_err_free;
for (i = 0; i < fls->mirror_array_cnt; i++) {
struct nfs4_ff_layout_mirror *mirror;
@@ -545,9 +538,6 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
out_sort_mirrors:
ff_layout_sort_mirrors(fls);
- rc = ff_layout_check_layout(lgr);
- if (rc)
- goto out_err_free;
ret = &fls->generic_hdr;
dprintk("<-- %s (success)\n", __func__);
out_free_page:
@@ -560,17 +550,6 @@ out_err_free:
goto out_free_page;
}
-static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout)
-{
- struct pnfs_layout_segment *lseg;
-
- list_for_each_entry(lseg, &layout->plh_segs, pls_list)
- if (lseg->pls_range.iomode == IOMODE_RW)
- return true;
-
- return false;
-}
-
static void
ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
{
@@ -585,23 +564,12 @@ ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
ffl = FF_LAYOUT_FROM_HDR(lseg->pls_layout);
inode = ffl->generic_hdr.plh_inode;
spin_lock(&inode->i_lock);
- if (!ff_layout_has_rw_segments(lseg->pls_layout)) {
- ffl->commit_info.nbuckets = 0;
- kfree(ffl->commit_info.buckets);
- ffl->commit_info.buckets = NULL;
- }
+ pnfs_generic_ds_cinfo_release_lseg(&ffl->commit_info, lseg);
spin_unlock(&inode->i_lock);
}
_ff_layout_free_lseg(fls);
}
-/* Return 1 until we have multiple lsegs support */
-static int
-ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls)
-{
- return 1;
-}
-
static void
nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
{
@@ -746,52 +714,6 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
spin_unlock(&mirror->lock);
}
-static int
-ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
- struct nfs_commit_info *cinfo,
- gfp_t gfp_flags)
-{
- struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
- struct pnfs_commit_bucket *buckets;
- int size;
-
- if (cinfo->ds->nbuckets != 0) {
- /* This assumes there is only one RW lseg per file.
- * To support multiple lseg per file, we need to
- * change struct pnfs_commit_bucket to allow dynamic
- * increasing nbuckets.
- */
- return 0;
- }
-
- size = ff_layout_get_lseg_count(fls) * FF_LAYOUT_MIRROR_COUNT(lseg);
-
- buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket),
- gfp_flags);
- if (!buckets)
- return -ENOMEM;
- else {
- int i;
-
- spin_lock(&cinfo->inode->i_lock);
- if (cinfo->ds->nbuckets != 0)
- kfree(buckets);
- else {
- cinfo->ds->buckets = buckets;
- cinfo->ds->nbuckets = size;
- for (i = 0; i < size; i++) {
- INIT_LIST_HEAD(&buckets[i].written);
- INIT_LIST_HEAD(&buckets[i].committing);
- /* mark direct verifier as unset */
- buckets[i].direct_verf.committed =
- NFS_INVALID_STABLE_HOW;
- }
- }
- spin_unlock(&cinfo->inode->i_lock);
- return 0;
- }
-}
-
static void
ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, int idx)
{
@@ -876,8 +798,8 @@ ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
pnfs_put_lseg(pgio->pg_lseg);
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
nfs_req_openctx(req),
- 0,
- NFS4_MAX_UINT64,
+ req_offset(req),
+ req->wb_bytes,
IOMODE_READ,
strict_iomode,
GFP_KERNEL);
@@ -888,6 +810,14 @@ ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
}
static void
+ff_layout_pg_check_layout(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *req)
+{
+ pnfs_generic_pg_check_layout(pgio);
+ pnfs_generic_pg_check_range(pgio, req);
+}
+
+static void
ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
{
@@ -897,7 +827,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
int ds_idx;
retry:
- pnfs_generic_pg_check_layout(pgio);
+ ff_layout_pg_check_layout(pgio, req);
/* Use full layout for now */
if (!pgio->pg_lseg) {
ff_layout_pg_get_read(pgio, req, false);
@@ -953,18 +883,16 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
{
struct nfs4_ff_layout_mirror *mirror;
struct nfs_pgio_mirror *pgm;
- struct nfs_commit_info cinfo;
struct nfs4_pnfs_ds *ds;
int i;
- int status;
retry:
- pnfs_generic_pg_check_layout(pgio);
+ ff_layout_pg_check_layout(pgio, req);
if (!pgio->pg_lseg) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
nfs_req_openctx(req),
- 0,
- NFS4_MAX_UINT64,
+ req_offset(req),
+ req->wb_bytes,
IOMODE_RW,
false,
GFP_NOFS);
@@ -978,11 +906,6 @@ retry:
if (pgio->pg_lseg == NULL)
goto out_mds;
- nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq);
- status = ff_layout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS);
- if (status < 0)
- goto out_mds;
-
/* Use a direct mapping of ds_idx to pgio mirror_idx */
if (WARN_ON_ONCE(pgio->pg_mirror_count !=
FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg)))
@@ -1297,21 +1220,23 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
}
}
+ mirror = FF_LAYOUT_COMP(lseg, idx);
+ err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
+ mirror, offset, length, status, opnum,
+ GFP_NOIO);
+
switch (status) {
case NFS4ERR_DELAY:
case NFS4ERR_GRACE:
- return;
- default:
break;
+ case NFS4ERR_NXIO:
+ ff_layout_mark_ds_unreachable(lseg, idx);
+ /* Fallthrough */
+ default:
+ pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode,
+ lseg);
}
- mirror = FF_LAYOUT_COMP(lseg, idx);
- err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
- mirror, offset, length, status, opnum,
- GFP_NOIO);
- if (status == NFS4ERR_NXIO)
- ff_layout_mark_ds_unreachable(lseg, idx);
- pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg);
dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status);
}
@@ -2012,6 +1937,33 @@ ff_layout_get_ds_info(struct inode *inode)
}
static void
+ff_layout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
+ struct pnfs_layout_segment *lseg)
+{
+ struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);
+ struct inode *inode = lseg->pls_layout->plh_inode;
+ struct pnfs_commit_array *array, *new;
+
+ new = pnfs_alloc_commit_array(flseg->mirror_array_cnt, GFP_NOIO);
+ if (new) {
+ spin_lock(&inode->i_lock);
+ array = pnfs_add_commit_array(fl_cinfo, new, lseg);
+ spin_unlock(&inode->i_lock);
+ if (array != new)
+ pnfs_free_commit_array(new);
+ }
+}
+
+static void
+ff_layout_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
+ struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ pnfs_generic_ds_cinfo_destroy(fl_cinfo);
+ spin_unlock(&inode->i_lock);
+}
+
+static void
ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d)
{
nfs4_ff_layout_free_deviceid(container_of(d, struct nfs4_ff_layout_ds,
@@ -2496,6 +2448,16 @@ ff_layout_set_layoutdriver(struct nfs_server *server,
return 0;
}
+static const struct pnfs_commit_ops ff_layout_commit_ops = {
+ .setup_ds_info = ff_layout_setup_ds_info,
+ .release_ds_info = ff_layout_release_ds_info,
+ .mark_request_commit = pnfs_layout_mark_request_commit,
+ .clear_request_commit = pnfs_generic_clear_request_commit,
+ .scan_commit_lists = pnfs_generic_scan_commit_lists,
+ .recover_commit_reqs = pnfs_generic_recover_commit_reqs,
+ .commit_pagelist = ff_layout_commit_pagelist,
+};
+
static struct pnfs_layoutdriver_type flexfilelayout_type = {
.id = LAYOUT_FLEX_FILES,
.name = "LAYOUT_FLEX_FILES",
@@ -2512,11 +2474,6 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = {
.pg_write_ops = &ff_layout_pg_write_ops,
.get_ds_info = ff_layout_get_ds_info,
.free_deviceid_node = ff_layout_free_deviceid_node,
- .mark_request_commit = pnfs_layout_mark_request_commit,
- .clear_request_commit = pnfs_generic_clear_request_commit,
- .scan_commit_lists = pnfs_generic_scan_commit_lists,
- .recover_commit_reqs = pnfs_generic_recover_commit_reqs,
- .commit_pagelist = ff_layout_commit_pagelist,
.read_pagelist = ff_layout_read_pagelist,
.write_pagelist = ff_layout_write_pagelist,
.alloc_deviceid_node = ff_layout_alloc_deviceid_node,
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index 2f369966abf7..354a031c69b1 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -99,7 +99,7 @@ struct nfs4_ff_layout_segment {
u64 stripe_unit;
u32 flags;
u32 mirror_array_cnt;
- struct nfs4_ff_layout_mirror **mirror_array;
+ struct nfs4_ff_layout_mirror *mirror_array[];
};
struct nfs4_flexfile_layout {
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index e113fcb4bb4c..ccc88be88d6a 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -190,6 +190,7 @@ static const struct constant_table nfs_vers_tokens[] = {
{ "4.0", Opt_vers_4_0 },
{ "4.1", Opt_vers_4_1 },
{ "4.2", Opt_vers_4_2 },
+ {}
};
enum {
@@ -202,13 +203,14 @@ enum {
nr__Opt_xprt
};
-static const struct constant_table nfs_xprt_protocol_tokens[nr__Opt_xprt] = {
+static const struct constant_table nfs_xprt_protocol_tokens[] = {
{ "rdma", Opt_xprt_rdma },
{ "rdma6", Opt_xprt_rdma6 },
{ "tcp", Opt_xprt_tcp },
{ "tcp6", Opt_xprt_tcp6 },
{ "udp", Opt_xprt_udp },
{ "udp6", Opt_xprt_udp6 },
+ {}
};
enum {
@@ -239,6 +241,7 @@ static const struct constant_table nfs_secflavor_tokens[] = {
{ "spkm3i", Opt_sec_spkmi },
{ "spkm3p", Opt_sec_spkmp },
{ "sys", Opt_sec_sys },
+ {}
};
/*
@@ -1135,7 +1138,7 @@ out_no_address:
return nfs_invalf(fc, "NFS4: mount program didn't pass remote address");
out_invalid_transport_udp:
- return nfs_invalf(fc, "NFSv4: Unsupported transport protocol udp");
+ return nfs_invalf(fc, "NFS: Unsupported transport protocol udp");
}
#endif
@@ -1257,7 +1260,7 @@ out_v4_not_compiled:
nfs_errorf(fc, "NFS: NFSv4 is not compiled into kernel");
return -EPROTONOSUPPORT;
out_invalid_transport_udp:
- return nfs_invalf(fc, "NFSv4: Unsupported transport protocol udp");
+ return nfs_invalf(fc, "NFS: Unsupported transport protocol udp");
out_no_address:
return nfs_invalf(fc, "NFS: mount program didn't pass remote address");
out_mountproto_mismatch:
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 11bf15800ac9..b9d0921cb4fe 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -62,7 +62,6 @@
/* Default is to see 64-bit inode numbers */
static bool enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED;
-static void nfs_invalidate_inode(struct inode *);
static int nfs_update_inode(struct inode *, struct nfs_fattr *);
static struct kmem_cache * nfs_inode_cachep;
@@ -284,10 +283,18 @@ EXPORT_SYMBOL_GPL(nfs_invalidate_atime);
* Invalidate, but do not unhash, the inode.
* NB: must be called with inode->i_lock held!
*/
-static void nfs_invalidate_inode(struct inode *inode)
+static void nfs_set_inode_stale_locked(struct inode *inode)
{
set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
nfs_zap_caches_locked(inode);
+ trace_nfs_set_inode_stale(inode);
+}
+
+void nfs_set_inode_stale(struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ nfs_set_inode_stale_locked(inode);
+ spin_unlock(&inode->i_lock);
}
struct nfs_find_desc {
@@ -959,16 +966,16 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry,
struct file *filp)
{
struct nfs_open_context *ctx;
- const struct cred *cred = get_current_cred();
ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
- if (!ctx) {
- put_cred(cred);
+ if (!ctx)
return ERR_PTR(-ENOMEM);
- }
nfs_sb_active(dentry->d_sb);
ctx->dentry = dget(dentry);
- ctx->cred = cred;
+ if (filp)
+ ctx->cred = get_cred(filp->f_cred);
+ else
+ ctx->cred = get_current_cred();
ctx->ll_cred = NULL;
ctx->state = NULL;
ctx->mode = f_mode;
@@ -1163,9 +1170,10 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
status = 0;
break;
case -ESTALE:
- nfs_zap_caches(inode);
if (!S_ISDIR(inode->i_mode))
- set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
+ nfs_set_inode_stale(inode);
+ else
+ nfs_zap_caches(inode);
}
goto err_out;
}
@@ -2064,7 +2072,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
* lookup validation will know that the inode is bad.
* (But we fall through to invalidate the caches.)
*/
- nfs_invalidate_inode(inode);
+ nfs_set_inode_stale_locked(inode);
return -ESTALE;
}
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index f80c47d5ff27..1f32a9fbfdaf 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -274,12 +274,6 @@ void nfs_free_request(struct nfs_page *req);
struct nfs_pgio_mirror *
nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc);
-static inline bool nfs_pgio_has_mirroring(struct nfs_pageio_descriptor *desc)
-{
- WARN_ON_ONCE(desc->pg_mirror_count < 1);
- return desc->pg_mirror_count > 1;
-}
-
static inline bool nfs_match_open_context(const struct nfs_open_context *ctx1,
const struct nfs_open_context *ctx2)
{
@@ -417,7 +411,9 @@ extern int __init register_nfs_fs(void);
extern void __exit unregister_nfs_fs(void);
extern bool nfs_sb_active(struct super_block *sb);
extern void nfs_sb_deactive(struct super_block *sb);
-
+extern int nfs_client_for_each_server(struct nfs_client *clp,
+ int (*fn)(struct nfs_server *, void *),
+ void *data);
/* io.c */
extern void nfs_start_io_read(struct inode *inode);
extern void nfs_end_io_read(struct inode *inode);
@@ -515,13 +511,25 @@ int nfs_filemap_write_and_wait_range(struct address_space *mapping,
loff_t lstart, loff_t lend);
#ifdef CONFIG_NFS_V4_1
+static inline void
+pnfs_bucket_clear_pnfs_ds_commit_verifiers(struct pnfs_commit_bucket *buckets,
+ unsigned int nbuckets)
+{
+ unsigned int i;
+
+ for (i = 0; i < nbuckets; i++)
+ buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW;
+}
static inline
void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo)
{
- int i;
+ struct pnfs_commit_array *array;
- for (i = 0; i < cinfo->nbuckets; i++)
- cinfo->buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW;
+ rcu_read_lock();
+ list_for_each_entry_rcu(array, &cinfo->commits, cinfo_list)
+ pnfs_bucket_clear_pnfs_ds_commit_verifiers(array->buckets,
+ array->nbuckets);
+ rcu_read_unlock();
}
#else
static inline
@@ -542,6 +550,14 @@ nfs_write_verifier_cmp(const struct nfs_write_verifier *v1,
return memcmp(v1->data, v2->data, sizeof(v1->data));
}
+static inline bool
+nfs_write_match_verf(const struct nfs_writeverf *verf,
+ struct nfs_page *req)
+{
+ return verf->committed > NFS_UNSTABLE &&
+ !nfs_write_verifier_cmp(&req->wb_verf, &verf->verifier);
+}
+
/* unlink.c */
extern struct rpc_task *
nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index f3ece8ed3203..6b063227e34e 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -145,6 +145,7 @@ struct vfsmount *nfs_d_automount(struct path *path)
struct vfsmount *mnt = ERR_PTR(-ENOMEM);
struct nfs_server *server = NFS_SERVER(d_inode(path->dentry));
struct nfs_client *client = server->nfs_client;
+ int timeout = READ_ONCE(nfs_mountpoint_expiry_timeout);
int ret;
if (IS_ROOT(path->dentry))
@@ -190,12 +191,12 @@ struct vfsmount *nfs_d_automount(struct path *path)
if (IS_ERR(mnt))
goto out_fc;
- if (nfs_mountpoint_expiry_timeout < 0)
+ mntget(mnt); /* prevent immediate expiration */
+ if (timeout <= 0)
goto out_fc;
- mntget(mnt); /* prevent immediate expiration */
mnt_set_expiry(mnt, &nfs_automount_list);
- schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
+ schedule_delayed_work(&nfs_automount_task, timeout);
out_fc:
put_fs_context(fc);
@@ -233,10 +234,11 @@ const struct inode_operations nfs_referral_inode_operations = {
static void nfs_expire_automounts(struct work_struct *work)
{
struct list_head *list = &nfs_automount_list;
+ int timeout = READ_ONCE(nfs_mountpoint_expiry_timeout);
mark_mounts_for_expiry(list);
- if (!list_empty(list))
- schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
+ if (!list_empty(list) && timeout > 0)
+ schedule_delayed_work(&nfs_automount_task, timeout);
}
void nfs_release_automount_timer(void)
@@ -247,10 +249,7 @@ void nfs_release_automount_timer(void)
/**
* nfs_do_submount - set up mountpoint when crossing a filesystem boundary
- * @dentry: parent directory
- * @fh: filehandle for new root dentry
- * @fattr: attributes for new root inode
- * @authflavor: security flavor to use when performing the mount
+ * @fc: pointer to struct nfs_fs_context
*
*/
int nfs_do_submount(struct fs_context *fc)
@@ -312,3 +311,53 @@ int nfs_submount(struct fs_context *fc, struct nfs_server *server)
return nfs_do_submount(fc);
}
EXPORT_SYMBOL_GPL(nfs_submount);
+
+static int param_set_nfs_timeout(const char *val, const struct kernel_param *kp)
+{
+ long num;
+ int ret;
+
+ if (!val)
+ return -EINVAL;
+ ret = kstrtol(val, 0, &num);
+ if (ret)
+ return -EINVAL;
+ if (num > 0) {
+ if (num >= INT_MAX / HZ)
+ num = INT_MAX;
+ else
+ num *= HZ;
+ *((int *)kp->arg) = num;
+ if (!list_empty(&nfs_automount_list))
+ mod_delayed_work(system_wq, &nfs_automount_task, num);
+ } else {
+ *((int *)kp->arg) = -1*HZ;
+ cancel_delayed_work(&nfs_automount_task);
+ }
+ return 0;
+}
+
+static int param_get_nfs_timeout(char *buffer, const struct kernel_param *kp)
+{
+ long num = *((int *)kp->arg);
+
+ if (num > 0) {
+ if (num >= INT_MAX - (HZ - 1))
+ num = INT_MAX / HZ;
+ else
+ num = (num + (HZ - 1)) / HZ;
+ } else
+ num = -1;
+ return scnprintf(buffer, PAGE_SIZE, "%li\n", num);
+}
+
+static const struct kernel_param_ops param_ops_nfs_timeout = {
+ .set = param_set_nfs_timeout,
+ .get = param_get_nfs_timeout,
+};
+#define param_check_nfs_timeout(name, p) __param_check(name, p, int);
+
+module_param(nfs_mountpoint_expiry_timeout, nfs_timeout, 0644);
+MODULE_PARM_DESC(nfs_mountpoint_expiry_timeout,
+ "Set the NFS automounted mountpoint timeout value (seconds)."
+ "Values <= 0 turn expiration off.");
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 8be1ba7c62bb..2b7f6dcd2eb8 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -42,7 +42,9 @@ enum nfs4_client_state {
NFS4CLNT_LEASE_MOVED,
NFS4CLNT_DELEGATION_EXPIRED,
NFS4CLNT_RUN_MANAGER,
- NFS4CLNT_DELEGRETURN_RUNNING,
+ NFS4CLNT_RECALL_RUNNING,
+ NFS4CLNT_RECALL_ANY_LAYOUT_READ,
+ NFS4CLNT_RECALL_ANY_LAYOUT_RW,
};
#define NFS4_RENEW_TIMEOUT 0x01
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 1297919e0fce..8e5d6223ddd3 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -252,6 +252,9 @@ static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off,
if (remap_flags & ~REMAP_FILE_ADVISORY)
return -EINVAL;
+ if (IS_SWAPFILE(dst_inode) || IS_SWAPFILE(src_inode))
+ return -ETXTBSY;
+
/* check alignment w.r.t. clone_blksize */
ret = -EINVAL;
if (bs) {
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 84026e7b8a5f..a3ab6e219061 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -354,7 +354,7 @@ static int try_location(struct fs_context *fc,
/**
* nfs_follow_referral - set up mountpoint when hitting a referral on moved error
- * @dentry: parent directory
+ * @fc: pointer to struct nfs_fs_context
* @locations: array of NFSv4 server location information
*
*/
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index cb34e840e4fb..512afb1c7867 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2346,7 +2346,7 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data)
.callback_ops = &nfs4_open_confirm_ops,
.callback_data = data,
.workqueue = nfsiod_workqueue,
- .flags = RPC_TASK_ASYNC,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
};
int status;
@@ -2511,7 +2511,7 @@ static int nfs4_run_open_task(struct nfs4_opendata *data,
.callback_ops = &nfs4_open_ops,
.callback_data = data,
.workqueue = nfsiod_workqueue,
- .flags = RPC_TASK_ASYNC,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
};
int status;
@@ -2790,16 +2790,19 @@ static int nfs41_check_delegation_stateid(struct nfs4_state *state)
return NFS_OK;
}
+ spin_lock(&delegation->lock);
nfs4_stateid_copy(&stateid, &delegation->stateid);
if (!test_and_clear_bit(NFS_DELEGATION_TEST_EXPIRED,
&delegation->flags)) {
+ spin_unlock(&delegation->lock);
rcu_read_unlock();
return NFS_OK;
}
if (delegation->cred)
cred = get_cred(delegation->cred);
+ spin_unlock(&delegation->lock);
rcu_read_unlock();
status = nfs41_test_and_free_expired_stateid(server, &stateid, cred);
trace_nfs4_test_delegation_stateid(state, NULL, status);
@@ -3651,7 +3654,7 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
.rpc_message = &msg,
.callback_ops = &nfs4_close_ops,
.workqueue = nfsiod_workqueue,
- .flags = RPC_TASK_ASYNC,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
};
int status = -ENOMEM;
@@ -5544,7 +5547,7 @@ unwind:
struct nfs4_cached_acl {
int cached;
size_t len;
- char data[0];
+ char data[];
};
static void nfs4_set_cached_acl(struct inode *inode, struct nfs4_cached_acl *acl)
@@ -6253,6 +6256,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
/* Fallthrough */
case -NFS4ERR_BAD_STATEID:
case -NFS4ERR_STALE_STATEID:
+ case -ETIMEDOUT:
task->tk_status = 0;
break;
case -NFS4ERR_OLD_STATEID:
@@ -6343,7 +6347,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred,
.rpc_client = server->client,
.rpc_message = &msg,
.callback_ops = &nfs4_delegreturn_ops,
- .flags = RPC_TASK_ASYNC,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF | RPC_TASK_TIMEOUT,
};
int status = 0;
@@ -6926,7 +6930,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
.rpc_message = &msg,
.callback_ops = &nfs4_lock_ops,
.workqueue = nfsiod_workqueue,
- .flags = RPC_TASK_ASYNC,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
};
int ret;
@@ -9170,7 +9174,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout)
.rpc_message = &msg,
.callback_ops = &nfs4_layoutget_call_ops,
.callback_data = lgp,
- .flags = RPC_TASK_ASYNC,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
};
struct pnfs_layout_segment *lseg = NULL;
struct nfs4_exception exception = {
@@ -9287,6 +9291,7 @@ static void nfs4_layoutreturn_release(void *calldata)
lrp->ld_private.ops->free(&lrp->ld_private);
pnfs_put_layout_hdr(lrp->args.layout);
nfs_iput_and_deactive(lrp->inode);
+ put_cred(lrp->cred);
kfree(calldata);
dprintk("<-- %s\n", __func__);
}
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index f7723d221945..ac93715c05a4 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -2524,6 +2524,21 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp)
}
return 0;
}
+
+static void nfs4_layoutreturn_any_run(struct nfs_client *clp)
+{
+ int iomode = 0;
+
+ if (test_and_clear_bit(NFS4CLNT_RECALL_ANY_LAYOUT_READ, &clp->cl_state))
+ iomode += IOMODE_READ;
+ if (test_and_clear_bit(NFS4CLNT_RECALL_ANY_LAYOUT_RW, &clp->cl_state))
+ iomode += IOMODE_RW;
+ /* Note: IOMODE_READ + IOMODE_RW == IOMODE_ANY */
+ if (iomode) {
+ pnfs_layout_return_unused_byclid(clp, iomode);
+ set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
+ }
+}
#else /* CONFIG_NFS_V4_1 */
static int nfs4_reset_session(struct nfs_client *clp) { return 0; }
@@ -2531,6 +2546,10 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp)
{
return 0;
}
+
+static void nfs4_layoutreturn_any_run(struct nfs_client *clp)
+{
+}
#endif /* CONFIG_NFS_V4_1 */
static void nfs4_state_manager(struct nfs_client *clp)
@@ -2635,12 +2654,13 @@ static void nfs4_state_manager(struct nfs_client *clp)
nfs4_end_drain_session(clp);
nfs4_clear_state_manager_bit(clp);
- if (!test_and_set_bit(NFS4CLNT_DELEGRETURN_RUNNING, &clp->cl_state)) {
+ if (!test_and_set_bit(NFS4CLNT_RECALL_RUNNING, &clp->cl_state)) {
if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) {
nfs_client_return_marked_delegations(clp);
set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
}
- clear_bit(NFS4CLNT_DELEGRETURN_RUNNING, &clp->cl_state);
+ nfs4_layoutreturn_any_run(clp);
+ clear_bit(NFS4CLNT_RECALL_RUNNING, &clp->cl_state);
}
/* Did we race with an attempt to give us more work? */
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 1e97e5e04cb4..543541173a3d 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -584,7 +584,9 @@ TRACE_DEFINE_ENUM(NFS4CLNT_MOVED);
TRACE_DEFINE_ENUM(NFS4CLNT_LEASE_MOVED);
TRACE_DEFINE_ENUM(NFS4CLNT_DELEGATION_EXPIRED);
TRACE_DEFINE_ENUM(NFS4CLNT_RUN_MANAGER);
-TRACE_DEFINE_ENUM(NFS4CLNT_DELEGRETURN_RUNNING);
+TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_RUNNING);
+TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_ANY_LAYOUT_READ);
+TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_ANY_LAYOUT_RW);
#define show_nfs4_clp_state(state) \
__print_flags(state, "|", \
@@ -605,7 +607,9 @@ TRACE_DEFINE_ENUM(NFS4CLNT_DELEGRETURN_RUNNING);
{ NFS4CLNT_LEASE_MOVED, "LEASE_MOVED" }, \
{ NFS4CLNT_DELEGATION_EXPIRED, "DELEGATION_EXPIRED" }, \
{ NFS4CLNT_RUN_MANAGER, "RUN_MANAGER" }, \
- { NFS4CLNT_DELEGRETURN_RUNNING, "DELEGRETURN_RUNNING" })
+ { NFS4CLNT_RECALL_RUNNING, "RECALL_RUNNING" }, \
+ { NFS4CLNT_RECALL_ANY_LAYOUT_READ, "RECALL_ANY_LAYOUT_READ" }, \
+ { NFS4CLNT_RECALL_ANY_LAYOUT_RW, "RECALL_ANY_LAYOUT_RW" })
TRACE_EVENT(nfs4_state_mgr,
TP_PROTO(
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index effaa4247b91..8d3278805602 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -88,7 +88,7 @@
#define NFS_ROOT "/tftpboot/%s"
/* Default NFSROOT mount options. */
-#define NFS_DEF_OPTIONS "vers=2,udp,rsize=4096,wsize=4096"
+#define NFS_DEF_OPTIONS "vers=2,tcp,rsize=4096,wsize=4096"
/* Parameters passed from the kernel command line */
static char nfs_root_parms[NFS_MAXPATHLEN + 1] __initdata = "";
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index a9588d19a5ae..7e7a97ae21ed 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -181,6 +181,7 @@ DECLARE_EVENT_CLASS(nfs_inode_event_done,
int error \
), \
TP_ARGS(inode, error))
+DEFINE_NFS_INODE_EVENT(nfs_set_inode_stale);
DEFINE_NFS_INODE_EVENT(nfs_refresh_inode_enter);
DEFINE_NFS_INODE_EVENT_DONE(nfs_refresh_inode_exit);
DEFINE_NFS_INODE_EVENT(nfs_revalidate_inode_enter);
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 20b3717cd7ca..f61f96603df7 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -33,9 +33,7 @@ static const struct rpc_call_ops nfs_pgio_common_ops;
struct nfs_pgio_mirror *
nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc)
{
- return nfs_pgio_has_mirroring(desc) ?
- &desc->pg_mirrors[desc->pg_mirror_idx] :
- &desc->pg_mirrors[0];
+ return &desc->pg_mirrors[desc->pg_mirror_idx];
}
EXPORT_SYMBOL_GPL(nfs_pgio_current_mirror);
@@ -133,47 +131,166 @@ nfs_async_iocounter_wait(struct rpc_task *task, struct nfs_lock_context *l_ctx)
EXPORT_SYMBOL_GPL(nfs_async_iocounter_wait);
/*
- * nfs_page_group_lock - lock the head of the page group
- * @req - request in group that is to be locked
+ * nfs_page_lock_head_request - page lock the head of the page group
+ * @req: any member of the page group
+ */
+struct nfs_page *
+nfs_page_group_lock_head(struct nfs_page *req)
+{
+ struct nfs_page *head = req->wb_head;
+
+ while (!nfs_lock_request(head)) {
+ int ret = nfs_wait_on_request(head);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ }
+ if (head != req)
+ kref_get(&head->wb_kref);
+ return head;
+}
+
+/*
+ * nfs_unroll_locks - unlock all newly locked reqs and wait on @req
+ * @head: head request of page group, must be holding head lock
+ * @req: request that couldn't lock and needs to wait on the req bit lock
*
- * this lock must be held when traversing or modifying the page
- * group list
+ * This is a helper function for nfs_lock_and_join_requests
+ * returns 0 on success, < 0 on error.
+ */
+static void
+nfs_unroll_locks(struct nfs_page *head, struct nfs_page *req)
+{
+ struct nfs_page *tmp;
+
+ /* relinquish all the locks successfully grabbed this run */
+ for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page) {
+ if (!kref_read(&tmp->wb_kref))
+ continue;
+ nfs_unlock_and_release_request(tmp);
+ }
+}
+
+/*
+ * nfs_page_group_lock_subreq - try to lock a subrequest
+ * @head: head request of page group
+ * @subreq: request to lock
*
- * return 0 on success, < 0 on error
+ * This is a helper function for nfs_lock_and_join_requests which
+ * must be called with the head request and page group both locked.
+ * On error, it returns with the page group unlocked.
*/
-int
-nfs_page_group_lock(struct nfs_page *req)
+static int
+nfs_page_group_lock_subreq(struct nfs_page *head, struct nfs_page *subreq)
{
- struct nfs_page *head = req->wb_head;
+ int ret;
+
+ if (!kref_get_unless_zero(&subreq->wb_kref))
+ return 0;
+ while (!nfs_lock_request(subreq)) {
+ nfs_page_group_unlock(head);
+ ret = nfs_wait_on_request(subreq);
+ if (!ret)
+ ret = nfs_page_group_lock(head);
+ if (ret < 0) {
+ nfs_unroll_locks(head, subreq);
+ nfs_release_request(subreq);
+ return ret;
+ }
+ }
+ return 0;
+}
+
+/*
+ * nfs_page_group_lock_subrequests - try to lock the subrequests
+ * @head: head request of page group
+ *
+ * This is a helper function for nfs_lock_and_join_requests which
+ * must be called with the head request locked.
+ */
+int nfs_page_group_lock_subrequests(struct nfs_page *head)
+{
+ struct nfs_page *subreq;
+ int ret;
- WARN_ON_ONCE(head != head->wb_head);
+ ret = nfs_page_group_lock(head);
+ if (ret < 0)
+ return ret;
+ /* lock each request in the page group */
+ for (subreq = head->wb_this_page; subreq != head;
+ subreq = subreq->wb_this_page) {
+ ret = nfs_page_group_lock_subreq(head, subreq);
+ if (ret < 0)
+ return ret;
+ }
+ nfs_page_group_unlock(head);
+ return 0;
+}
- if (!test_and_set_bit(PG_HEADLOCK, &head->wb_flags))
+/*
+ * nfs_page_set_headlock - set the request PG_HEADLOCK
+ * @req: request that is to be locked
+ *
+ * this lock must be held when modifying req->wb_head
+ *
+ * return 0 on success, < 0 on error
+ */
+int
+nfs_page_set_headlock(struct nfs_page *req)
+{
+ if (!test_and_set_bit(PG_HEADLOCK, &req->wb_flags))
return 0;
- set_bit(PG_CONTENDED1, &head->wb_flags);
+ set_bit(PG_CONTENDED1, &req->wb_flags);
smp_mb__after_atomic();
- return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
+ return wait_on_bit_lock(&req->wb_flags, PG_HEADLOCK,
TASK_UNINTERRUPTIBLE);
}
/*
- * nfs_page_group_unlock - unlock the head of the page group
- * @req - request in group that is to be unlocked
+ * nfs_page_clear_headlock - clear the request PG_HEADLOCK
+ * @req: request that is to be locked
*/
void
-nfs_page_group_unlock(struct nfs_page *req)
+nfs_page_clear_headlock(struct nfs_page *req)
{
- struct nfs_page *head = req->wb_head;
-
- WARN_ON_ONCE(head != head->wb_head);
-
smp_mb__before_atomic();
- clear_bit(PG_HEADLOCK, &head->wb_flags);
+ clear_bit(PG_HEADLOCK, &req->wb_flags);
smp_mb__after_atomic();
- if (!test_bit(PG_CONTENDED1, &head->wb_flags))
+ if (!test_bit(PG_CONTENDED1, &req->wb_flags))
return;
- wake_up_bit(&head->wb_flags, PG_HEADLOCK);
+ wake_up_bit(&req->wb_flags, PG_HEADLOCK);
+}
+
+/*
+ * nfs_page_group_lock - lock the head of the page group
+ * @req: request in group that is to be locked
+ *
+ * this lock must be held when traversing or modifying the page
+ * group list
+ *
+ * return 0 on success, < 0 on error
+ */
+int
+nfs_page_group_lock(struct nfs_page *req)
+{
+ int ret;
+
+ ret = nfs_page_set_headlock(req);
+ if (ret || req->wb_head == req)
+ return ret;
+ return nfs_page_set_headlock(req->wb_head);
+}
+
+/*
+ * nfs_page_group_unlock - unlock the head of the page group
+ * @req: request in group that is to be unlocked
+ */
+void
+nfs_page_group_unlock(struct nfs_page *req)
+{
+ if (req != req->wb_head)
+ nfs_page_clear_headlock(req->wb_head);
+ nfs_page_clear_headlock(req);
}
/*
@@ -359,15 +476,23 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
}
static struct nfs_page *
-nfs_create_subreq(struct nfs_page *req, struct nfs_page *last,
- unsigned int pgbase, unsigned int offset,
+nfs_create_subreq(struct nfs_page *req,
+ unsigned int pgbase,
+ unsigned int offset,
unsigned int count)
{
+ struct nfs_page *last;
struct nfs_page *ret;
ret = __nfs_create_request(req->wb_lock_context, req->wb_page,
pgbase, offset, count);
if (!IS_ERR(ret)) {
+ /* find the last request */
+ for (last = req->wb_head;
+ last->wb_this_page != req->wb_head;
+ last = last->wb_this_page)
+ ;
+
nfs_lock_request(ret);
ret->wb_index = req->wb_index;
nfs_page_group_init(ret, last);
@@ -627,9 +752,8 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
.callback_ops = call_ops,
.callback_data = hdr,
.workqueue = nfsiod_workqueue,
- .flags = RPC_TASK_ASYNC | flags,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF | flags,
};
- int ret = 0;
hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how);
@@ -641,18 +765,10 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
(unsigned long long)hdr->args.offset);
task = rpc_run_task(&task_setup_data);
- if (IS_ERR(task)) {
- ret = PTR_ERR(task);
- goto out;
- }
- if (how & FLUSH_SYNC) {
- ret = rpc_wait_for_completion_task(task);
- if (ret == 0)
- ret = task->tk_status;
- }
+ if (IS_ERR(task))
+ return PTR_ERR(task);
rpc_put_task(task);
-out:
- return ret;
+ return 0;
}
EXPORT_SYMBOL_GPL(nfs_initiate_pgio);
@@ -886,15 +1002,6 @@ static void nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio,
pgio->pg_mirror_count = mirror_count;
}
-/*
- * nfs_pageio_stop_mirroring - stop using mirroring (set mirror count to 1)
- */
-void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio)
-{
- pgio->pg_mirror_count = 1;
- pgio->pg_mirror_idx = 0;
-}
-
static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio)
{
pgio->pg_mirror_count = 1;
@@ -911,7 +1018,7 @@ static bool nfs_match_lock_context(const struct nfs_lock_context *l1,
}
/**
- * nfs_can_coalesce_requests - test two requests for compatibility
+ * nfs_coalesce_size - test two requests for compatibility
* @prev: pointer to nfs_page
* @req: pointer to nfs_page
* @pgio: pointer to nfs_pagio_descriptor
@@ -920,41 +1027,36 @@ static bool nfs_match_lock_context(const struct nfs_lock_context *l1,
* page data area they describe is contiguous, and that their RPC
* credentials, NFSv4 open state, and lockowners are the same.
*
- * Return 'true' if this is the case, else return 'false'.
+ * Returns size of the request that can be coalesced
*/
-static bool nfs_can_coalesce_requests(struct nfs_page *prev,
+static unsigned int nfs_coalesce_size(struct nfs_page *prev,
struct nfs_page *req,
struct nfs_pageio_descriptor *pgio)
{
- size_t size;
struct file_lock_context *flctx;
if (prev) {
if (!nfs_match_open_context(nfs_req_openctx(req), nfs_req_openctx(prev)))
- return false;
+ return 0;
flctx = d_inode(nfs_req_openctx(req)->dentry)->i_flctx;
if (flctx != NULL &&
!(list_empty_careful(&flctx->flc_posix) &&
list_empty_careful(&flctx->flc_flock)) &&
!nfs_match_lock_context(req->wb_lock_context,
prev->wb_lock_context))
- return false;
+ return 0;
if (req_offset(req) != req_offset(prev) + prev->wb_bytes)
- return false;
+ return 0;
if (req->wb_page == prev->wb_page) {
if (req->wb_pgbase != prev->wb_pgbase + prev->wb_bytes)
- return false;
+ return 0;
} else {
if (req->wb_pgbase != 0 ||
prev->wb_pgbase + prev->wb_bytes != PAGE_SIZE)
- return false;
+ return 0;
}
}
- size = pgio->pg_ops->pg_test(pgio, prev, req);
- WARN_ON_ONCE(size > req->wb_bytes);
- if (size && size < req->wb_bytes)
- req->wb_bytes = size;
- return size > 0;
+ return pgio->pg_ops->pg_test(pgio, prev, req);
}
/**
@@ -962,15 +1064,16 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
* @desc: destination io descriptor
* @req: request
*
- * Returns true if the request 'req' was successfully coalesced into the
- * existing list of pages 'desc'.
+ * If the request 'req' was successfully coalesced into the existing list
+ * of pages 'desc', it returns the size of req.
*/
-static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
- struct nfs_page *req)
+static unsigned int
+nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
+ struct nfs_page *req)
{
struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
-
struct nfs_page *prev = NULL;
+ unsigned int size;
if (mirror->pg_count != 0) {
prev = nfs_list_entry(mirror->pg_list.prev);
@@ -990,11 +1093,12 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
return 0;
}
- if (!nfs_can_coalesce_requests(prev, req, desc))
- return 0;
+ size = nfs_coalesce_size(prev, req, desc);
+ if (size < req->wb_bytes)
+ return size;
nfs_list_move_request(req, &mirror->pg_list);
mirror->pg_count += req->wb_bytes;
- return 1;
+ return req->wb_bytes;
}
/*
@@ -1034,7 +1138,8 @@ nfs_pageio_cleanup_request(struct nfs_pageio_descriptor *desc,
* @req: request
*
* This may split a request into subrequests which are all part of the
- * same page group.
+ * same page group. If so, it will submit @req as the last one, to ensure
+ * the pointer to @req is still valid in case of failure.
*
* Returns true if the request 'req' was successfully coalesced into the
* existing list of pages 'desc'.
@@ -1043,51 +1148,50 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
struct nfs_page *req)
{
struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
-
struct nfs_page *subreq;
- unsigned int bytes_left = 0;
- unsigned int offset, pgbase;
+ unsigned int size, subreq_size;
nfs_page_group_lock(req);
subreq = req;
- bytes_left = subreq->wb_bytes;
- offset = subreq->wb_offset;
- pgbase = subreq->wb_pgbase;
-
- do {
- if (!nfs_pageio_do_add_request(desc, subreq)) {
- /* make sure pg_test call(s) did nothing */
- WARN_ON_ONCE(subreq->wb_bytes != bytes_left);
- WARN_ON_ONCE(subreq->wb_offset != offset);
- WARN_ON_ONCE(subreq->wb_pgbase != pgbase);
-
+ subreq_size = subreq->wb_bytes;
+ for(;;) {
+ size = nfs_pageio_do_add_request(desc, subreq);
+ if (size == subreq_size) {
+ /* We successfully submitted a request */
+ if (subreq == req)
+ break;
+ req->wb_pgbase += size;
+ req->wb_bytes -= size;
+ req->wb_offset += size;
+ subreq_size = req->wb_bytes;
+ subreq = req;
+ continue;
+ }
+ if (WARN_ON_ONCE(subreq != req)) {
+ nfs_page_group_unlock(req);
+ nfs_pageio_cleanup_request(desc, subreq);
+ subreq = req;
+ subreq_size = req->wb_bytes;
+ nfs_page_group_lock(req);
+ }
+ if (!size) {
+ /* Can't coalesce any more, so do I/O */
nfs_page_group_unlock(req);
desc->pg_moreio = 1;
nfs_pageio_doio(desc);
if (desc->pg_error < 0 || mirror->pg_recoalesce)
- goto out_cleanup_subreq;
+ return 0;
/* retry add_request for this subreq */
nfs_page_group_lock(req);
continue;
}
-
- /* check for buggy pg_test call(s) */
- WARN_ON_ONCE(subreq->wb_bytes + subreq->wb_pgbase > PAGE_SIZE);
- WARN_ON_ONCE(subreq->wb_bytes > bytes_left);
- WARN_ON_ONCE(subreq->wb_bytes == 0);
-
- bytes_left -= subreq->wb_bytes;
- offset += subreq->wb_bytes;
- pgbase += subreq->wb_bytes;
-
- if (bytes_left) {
- subreq = nfs_create_subreq(req, subreq, pgbase,
- offset, bytes_left);
- if (IS_ERR(subreq))
- goto err_ptr;
- }
- } while (bytes_left > 0);
+ subreq = nfs_create_subreq(req, req->wb_pgbase,
+ req->wb_offset, size);
+ if (IS_ERR(subreq))
+ goto err_ptr;
+ subreq_size = size;
+ }
nfs_page_group_unlock(req);
return 1;
@@ -1095,10 +1199,6 @@ err_ptr:
desc->pg_error = PTR_ERR(subreq);
nfs_page_group_unlock(req);
return 0;
-out_cleanup_subreq:
- if (req != subreq)
- nfs_pageio_cleanup_request(desc, subreq);
- return 0;
}
static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
@@ -1167,7 +1267,7 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
{
u32 midx;
unsigned int pgbase, offset, bytes;
- struct nfs_page *dupreq, *lastreq;
+ struct nfs_page *dupreq;
pgbase = req->wb_pgbase;
offset = req->wb_offset;
@@ -1177,38 +1277,32 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
if (desc->pg_error < 0)
goto out_failed;
- for (midx = 0; midx < desc->pg_mirror_count; midx++) {
- if (midx) {
- nfs_page_group_lock(req);
+ /* Create the mirror instances first, and fire them off */
+ for (midx = 1; midx < desc->pg_mirror_count; midx++) {
+ nfs_page_group_lock(req);
- /* find the last request */
- for (lastreq = req->wb_head;
- lastreq->wb_this_page != req->wb_head;
- lastreq = lastreq->wb_this_page)
- ;
+ dupreq = nfs_create_subreq(req,
+ pgbase, offset, bytes);
- dupreq = nfs_create_subreq(req, lastreq,
- pgbase, offset, bytes);
-
- nfs_page_group_unlock(req);
- if (IS_ERR(dupreq)) {
- desc->pg_error = PTR_ERR(dupreq);
- goto out_failed;
- }
- } else
- dupreq = req;
+ nfs_page_group_unlock(req);
+ if (IS_ERR(dupreq)) {
+ desc->pg_error = PTR_ERR(dupreq);
+ goto out_failed;
+ }
- if (nfs_pgio_has_mirroring(desc))
- desc->pg_mirror_idx = midx;
+ desc->pg_mirror_idx = midx;
if (!nfs_pageio_add_request_mirror(desc, dupreq))
goto out_cleanup_subreq;
}
+ desc->pg_mirror_idx = 0;
+ if (!nfs_pageio_add_request_mirror(desc, req))
+ goto out_failed;
+
return 1;
out_cleanup_subreq:
- if (req != dupreq)
- nfs_pageio_cleanup_request(desc, dupreq);
+ nfs_pageio_cleanup_request(desc, dupreq);
out_failed:
nfs_pageio_error_cleanup(desc);
return 0;
@@ -1226,8 +1320,7 @@ static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc,
struct nfs_pgio_mirror *mirror = &desc->pg_mirrors[mirror_idx];
u32 restore_idx = desc->pg_mirror_idx;
- if (nfs_pgio_has_mirroring(desc))
- desc->pg_mirror_idx = mirror_idx;
+ desc->pg_mirror_idx = mirror_idx;
for (;;) {
nfs_pageio_doio(desc);
if (desc->pg_error < 0 || !mirror->pg_recoalesce)
@@ -1320,6 +1413,14 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
}
}
+/*
+ * nfs_pageio_stop_mirroring - stop using mirroring (set mirror count to 1)
+ */
+void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio)
+{
+ nfs_pageio_complete(pgio);
+}
+
int __init nfs_init_nfspagecache(void)
{
nfs_page_cachep = kmem_cache_create("nfs_page",
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 542ea8dfd1bc..f2dc35c22964 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -268,11 +268,11 @@ pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
struct nfs_server *server = NFS_SERVER(lo->plh_inode);
struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;
- if (!list_empty(&lo->plh_layouts)) {
+ if (test_and_clear_bit(NFS_LAYOUT_HASHED, &lo->plh_flags)) {
struct nfs_client *clp = server->nfs_client;
spin_lock(&clp->cl_lock);
- list_del_init(&lo->plh_layouts);
+ list_del_rcu(&lo->plh_layouts);
spin_unlock(&clp->cl_lock);
}
put_cred(lo->plh_lc_cred);
@@ -309,6 +309,16 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
}
}
+static struct inode *
+pnfs_grab_inode_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+ struct inode *inode = igrab(lo->plh_inode);
+ if (inode)
+ return inode;
+ set_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags);
+ return NULL;
+}
+
static void
pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
u32 seq)
@@ -496,6 +506,7 @@ pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg,
{
INIT_LIST_HEAD(&lseg->pls_list);
INIT_LIST_HEAD(&lseg->pls_lc_list);
+ INIT_LIST_HEAD(&lseg->pls_commits);
refcount_set(&lseg->pls_refcount, 1);
set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
lseg->pls_layout = lo;
@@ -782,9 +793,10 @@ pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp,
/* If the sb is being destroyed, just bail */
if (!nfs_sb_active(server->super))
break;
- inode = igrab(lo->plh_inode);
+ inode = pnfs_grab_inode_layout_hdr(lo);
if (inode != NULL) {
- list_del_init(&lo->plh_layouts);
+ if (test_and_clear_bit(NFS_LAYOUT_HASHED, &lo->plh_flags))
+ list_del_rcu(&lo->plh_layouts);
if (pnfs_layout_add_bulk_destroy_list(inode,
layout_list))
continue;
@@ -794,7 +806,6 @@ pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp,
} else {
rcu_read_unlock();
spin_unlock(&clp->cl_lock);
- set_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags);
}
nfs_sb_deactive(server->super);
spin_lock(&clp->cl_lock);
@@ -903,10 +914,21 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
pnfs_destroy_layouts_byclid(clp, false);
}
+static void
+pnfs_set_layout_cred(struct pnfs_layout_hdr *lo, const struct cred *cred)
+{
+ const struct cred *old;
+
+ if (cred && cred_fscmp(lo->plh_lc_cred, cred) != 0) {
+ old = xchg(&lo->plh_lc_cred, get_cred(cred));
+ put_cred(old);
+ }
+}
+
/* update lo->plh_stateid with new if is more recent */
void
pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
- bool update_barrier)
+ const struct cred *cred, bool update_barrier)
{
u32 oldseq, newseq, new_barrier = 0;
@@ -914,6 +936,7 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
newseq = be32_to_cpu(new->seqid);
if (!pnfs_layout_is_valid(lo)) {
+ pnfs_set_layout_cred(lo, cred);
nfs4_stateid_copy(&lo->plh_stateid, new);
lo->plh_barrier = newseq;
pnfs_clear_layoutreturn_info(lo);
@@ -1061,7 +1084,7 @@ pnfs_alloc_init_layoutget_args(struct inode *ino,
lgp->args.ctx = get_nfs_open_context(ctx);
nfs4_stateid_copy(&lgp->args.stateid, stateid);
lgp->gfp_flags = gfp_flags;
- lgp->cred = get_cred(ctx->cred);
+ lgp->cred = ctx->cred;
return lgp;
}
@@ -1072,7 +1095,6 @@ void pnfs_layoutget_free(struct nfs4_layoutget *lgp)
nfs4_free_pages(lgp->args.layout.pages, max_pages);
if (lgp->args.inode)
pnfs_put_layout_hdr(NFS_I(lgp->args.inode)->layout);
- put_cred(lgp->cred);
put_nfs_open_context(lgp->args.ctx);
kfree(lgp);
}
@@ -1109,7 +1131,7 @@ void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo,
pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq);
pnfs_free_returned_lsegs(lo, &freeme, range, seq);
- pnfs_set_layout_stateid(lo, stateid, true);
+ pnfs_set_layout_stateid(lo, stateid, NULL, true);
} else
pnfs_mark_layout_stateid_invalid(lo, &freeme);
out_unlock:
@@ -1122,6 +1144,7 @@ out_unlock:
static bool
pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo,
nfs4_stateid *stateid,
+ const struct cred **cred,
enum pnfs_iomode *iomode)
{
/* Serialise LAYOUTGET/LAYOUTRETURN */
@@ -1132,18 +1155,17 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo,
set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
pnfs_get_layout_hdr(lo);
if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) {
- if (stateid != NULL) {
- nfs4_stateid_copy(stateid, &lo->plh_stateid);
- if (lo->plh_return_seq != 0)
- stateid->seqid = cpu_to_be32(lo->plh_return_seq);
- }
+ nfs4_stateid_copy(stateid, &lo->plh_stateid);
+ *cred = get_cred(lo->plh_lc_cred);
+ if (lo->plh_return_seq != 0)
+ stateid->seqid = cpu_to_be32(lo->plh_return_seq);
if (iomode != NULL)
*iomode = lo->plh_return_iomode;
pnfs_clear_layoutreturn_info(lo);
return true;
}
- if (stateid != NULL)
- nfs4_stateid_copy(stateid, &lo->plh_stateid);
+ nfs4_stateid_copy(stateid, &lo->plh_stateid);
+ *cred = get_cred(lo->plh_lc_cred);
if (iomode != NULL)
*iomode = IOMODE_ANY;
return true;
@@ -1167,20 +1189,26 @@ pnfs_init_layoutreturn_args(struct nfs4_layoutreturn_args *args,
}
static int
-pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid,
- enum pnfs_iomode iomode, bool sync)
+pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo,
+ const nfs4_stateid *stateid,
+ const struct cred **pcred,
+ enum pnfs_iomode iomode,
+ bool sync)
{
struct inode *ino = lo->plh_inode;
struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
struct nfs4_layoutreturn *lrp;
+ const struct cred *cred = *pcred;
int status = 0;
+ *pcred = NULL;
lrp = kzalloc(sizeof(*lrp), GFP_NOFS);
if (unlikely(lrp == NULL)) {
status = -ENOMEM;
spin_lock(&ino->i_lock);
pnfs_clear_layoutreturn_waitbit(lo);
spin_unlock(&ino->i_lock);
+ put_cred(cred);
pnfs_put_layout_hdr(lo);
goto out;
}
@@ -1188,7 +1216,7 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid,
pnfs_init_layoutreturn_args(&lrp->args, lo, stateid, iomode);
lrp->args.ld_private = &lrp->ld_private;
lrp->clp = NFS_SERVER(ino)->nfs_client;
- lrp->cred = lo->plh_lc_cred;
+ lrp->cred = cred;
if (ld->prepare_layoutreturn)
ld->prepare_layoutreturn(&lrp->args);
@@ -1233,15 +1261,16 @@ static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
return;
spin_lock(&inode->i_lock);
if (pnfs_layout_need_return(lo)) {
+ const struct cred *cred;
nfs4_stateid stateid;
enum pnfs_iomode iomode;
bool send;
- send = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
+ send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode);
spin_unlock(&inode->i_lock);
if (send) {
/* Send an async layoutreturn so we dont deadlock */
- pnfs_send_layoutreturn(lo, &stateid, iomode, false);
+ pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false);
}
} else
spin_unlock(&inode->i_lock);
@@ -1261,6 +1290,7 @@ _pnfs_return_layout(struct inode *ino)
struct pnfs_layout_hdr *lo = NULL;
struct nfs_inode *nfsi = NFS_I(ino);
LIST_HEAD(tmp_list);
+ const struct cred *cred;
nfs4_stateid stateid;
int status = 0;
bool send, valid_layout;
@@ -1305,10 +1335,10 @@ _pnfs_return_layout(struct inode *ino)
goto out_put_layout_hdr;
}
- send = pnfs_prepare_layoutreturn(lo, &stateid, NULL);
+ send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, NULL);
spin_unlock(&ino->i_lock);
if (send)
- status = pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true);
+ status = pnfs_send_layoutreturn(lo, &stateid, &cred, IOMODE_ANY, true);
out_put_layout_hdr:
pnfs_free_lseg_list(&tmp_list);
pnfs_put_layout_hdr(lo);
@@ -1354,6 +1384,7 @@ bool pnfs_roc(struct inode *ino,
struct nfs4_state *state;
struct pnfs_layout_hdr *lo;
struct pnfs_layout_segment *lseg, *next;
+ const struct cred *lc_cred;
nfs4_stateid stateid;
enum pnfs_iomode iomode = 0;
bool layoutreturn = false, roc = false;
@@ -1423,16 +1454,20 @@ retry:
* 2. we don't send layoutreturn
*/
/* lo ref dropped in pnfs_roc_release() */
- layoutreturn = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
+ layoutreturn = pnfs_prepare_layoutreturn(lo, &stateid, &lc_cred, &iomode);
/* If the creds don't match, we can't compound the layoutreturn */
- if (!layoutreturn || cred_fscmp(cred, lo->plh_lc_cred) != 0)
+ if (!layoutreturn)
goto out_noroc;
+ if (cred_fscmp(cred, lc_cred) != 0)
+ goto out_noroc_put_cred;
roc = layoutreturn;
pnfs_init_layoutreturn_args(args, lo, &stateid, iomode);
res->lrs_present = 0;
layoutreturn = false;
+out_noroc_put_cred:
+ put_cred(lc_cred);
out_noroc:
spin_unlock(&ino->i_lock);
rcu_read_unlock();
@@ -1445,7 +1480,7 @@ out_noroc:
return true;
}
if (layoutreturn)
- pnfs_send_layoutreturn(lo, &stateid, iomode, true);
+ pnfs_send_layoutreturn(lo, &stateid, &lc_cred, iomode, true);
pnfs_put_layout_hdr(lo);
return false;
}
@@ -1859,15 +1894,14 @@ static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo)
static void _add_to_server_list(struct pnfs_layout_hdr *lo,
struct nfs_server *server)
{
- if (list_empty(&lo->plh_layouts)) {
+ if (!test_and_set_bit(NFS_LAYOUT_HASHED, &lo->plh_flags)) {
struct nfs_client *clp = server->nfs_client;
/* The lo must be on the clp list if there is any
* chance of a CB_LAYOUTRECALL(FILE) coming in.
*/
spin_lock(&clp->cl_lock);
- if (list_empty(&lo->plh_layouts))
- list_add_tail(&lo->plh_layouts, &server->layouts);
+ list_add_tail_rcu(&lo->plh_layouts, &server->layouts);
spin_unlock(&clp->cl_lock);
}
}
@@ -2323,14 +2357,14 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
if (!pnfs_layout_is_valid(lo)) {
/* We have a completely new layout */
- pnfs_set_layout_stateid(lo, &res->stateid, true);
+ pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, true);
} else if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
/* existing state ID, make sure the sequence number matches. */
if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
dprintk("%s forget reply due to sequence\n", __func__);
goto out_forget;
}
- pnfs_set_layout_stateid(lo, &res->stateid, false);
+ pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, false);
} else {
/*
* We got an entirely new state ID. Mark all segments for the
@@ -2423,43 +2457,159 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
return -ENOENT;
}
-void pnfs_error_mark_layout_for_return(struct inode *inode,
- struct pnfs_layout_segment *lseg)
+static void
+pnfs_mark_layout_for_return(struct inode *inode,
+ const struct pnfs_layout_range *range)
{
- struct pnfs_layout_hdr *lo = NFS_I(inode)->layout;
- struct pnfs_layout_range range = {
- .iomode = lseg->pls_range.iomode,
- .offset = 0,
- .length = NFS4_MAX_UINT64,
- };
+ struct pnfs_layout_hdr *lo;
bool return_now = false;
spin_lock(&inode->i_lock);
+ lo = NFS_I(inode)->layout;
if (!pnfs_layout_is_valid(lo)) {
spin_unlock(&inode->i_lock);
return;
}
- pnfs_set_plh_return_info(lo, range.iomode, 0);
+ pnfs_set_plh_return_info(lo, range->iomode, 0);
/*
* mark all matching lsegs so that we are sure to have no live
* segments at hand when sending layoutreturn. See pnfs_put_lseg()
* for how it works.
*/
- if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, &range, 0) != -EBUSY) {
+ if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, range, 0) != -EBUSY) {
+ const struct cred *cred;
nfs4_stateid stateid;
enum pnfs_iomode iomode;
- return_now = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
+ return_now = pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode);
spin_unlock(&inode->i_lock);
if (return_now)
- pnfs_send_layoutreturn(lo, &stateid, iomode, false);
+ pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false);
} else {
spin_unlock(&inode->i_lock);
nfs_commit_inode(inode, 0);
}
}
+
+void pnfs_error_mark_layout_for_return(struct inode *inode,
+ struct pnfs_layout_segment *lseg)
+{
+ struct pnfs_layout_range range = {
+ .iomode = lseg->pls_range.iomode,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };
+
+ pnfs_mark_layout_for_return(inode, &range);
+}
EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return);
+static bool
+pnfs_layout_can_be_returned(struct pnfs_layout_hdr *lo)
+{
+ return pnfs_layout_is_valid(lo) &&
+ !test_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags) &&
+ !test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
+}
+
+static struct pnfs_layout_segment *
+pnfs_find_first_lseg(struct pnfs_layout_hdr *lo,
+ const struct pnfs_layout_range *range,
+ enum pnfs_iomode iomode)
+{
+ struct pnfs_layout_segment *lseg;
+
+ list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
+ if (!test_bit(NFS_LSEG_VALID, &lseg->pls_flags))
+ continue;
+ if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
+ continue;
+ if (lseg->pls_range.iomode != iomode && iomode != IOMODE_ANY)
+ continue;
+ if (pnfs_lseg_range_intersecting(&lseg->pls_range, range))
+ return lseg;
+ }
+ return NULL;
+}
+
+/* Find open file states whose mode matches that of the range */
+static bool
+pnfs_should_return_unused_layout(struct pnfs_layout_hdr *lo,
+ const struct pnfs_layout_range *range)
+{
+ struct list_head *head;
+ struct nfs_open_context *ctx;
+ fmode_t mode = 0;
+
+ if (!pnfs_layout_can_be_returned(lo) ||
+ !pnfs_find_first_lseg(lo, range, range->iomode))
+ return false;
+
+ head = &NFS_I(lo->plh_inode)->open_files;
+ list_for_each_entry_rcu(ctx, head, list) {
+ if (ctx->state)
+ mode |= ctx->state->state & (FMODE_READ|FMODE_WRITE);
+ }
+
+ switch (range->iomode) {
+ default:
+ break;
+ case IOMODE_READ:
+ mode &= ~FMODE_WRITE;
+ break;
+ case IOMODE_RW:
+ if (pnfs_find_first_lseg(lo, range, IOMODE_READ))
+ mode &= ~FMODE_READ;
+ }
+ return mode == 0;
+}
+
+static int
+pnfs_layout_return_unused_byserver(struct nfs_server *server, void *data)
+{
+ const struct pnfs_layout_range *range = data;
+ struct pnfs_layout_hdr *lo;
+ struct inode *inode;
+restart:
+ rcu_read_lock();
+ list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) {
+ if (!pnfs_layout_can_be_returned(lo) ||
+ test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+ continue;
+ inode = lo->plh_inode;
+ spin_lock(&inode->i_lock);
+ if (!pnfs_should_return_unused_layout(lo, range)) {
+ spin_unlock(&inode->i_lock);
+ continue;
+ }
+ spin_unlock(&inode->i_lock);
+ inode = pnfs_grab_inode_layout_hdr(lo);
+ if (!inode)
+ continue;
+ rcu_read_unlock();
+ pnfs_mark_layout_for_return(inode, range);
+ iput(inode);
+ cond_resched();
+ goto restart;
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+void
+pnfs_layout_return_unused_byclid(struct nfs_client *clp,
+ enum pnfs_iomode iomode)
+{
+ struct pnfs_layout_range range = {
+ .iomode = iomode,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };
+
+ nfs_client_for_each_server(clp, pnfs_layout_return_unused_byserver,
+ &range);
+}
+
void
pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio)
{
@@ -2475,7 +2625,7 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_layout);
* Check for any intersection between the request and the pgio->pg_lseg,
* and if none, put this pgio->pg_lseg away.
*/
-static void
+void
pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
{
if (pgio->pg_lseg && !pnfs_lseg_request_intersecting(pgio->pg_lseg, req)) {
@@ -2483,6 +2633,7 @@ pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page
pgio->pg_lseg = NULL;
}
}
+EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_range);
void
pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
@@ -3000,10 +3151,10 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
end_pos = nfsi->layout->plh_lwb;
nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid);
+ data->cred = get_cred(nfsi->layout->plh_lc_cred);
spin_unlock(&inode->i_lock);
data->args.inode = inode;
- data->cred = get_cred(nfsi->layout->plh_lc_cred);
nfs_fattr_init(&data->fattr);
data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
data->res.fattr = &data->fattr;
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 0fafdadc9c8d..8e0ada581b92 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -66,6 +66,7 @@ struct nfs4_pnfs_ds {
struct pnfs_layout_segment {
struct list_head pls_list;
struct list_head pls_lc_list;
+ struct list_head pls_commits;
struct pnfs_layout_range pls_range;
refcount_t pls_refcount;
u32 pls_seq;
@@ -105,6 +106,7 @@ enum {
NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */
NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */
NFS_LAYOUT_INODE_FREEING, /* The inode is being freed */
+ NFS_LAYOUT_HASHED, /* The layout visible */
};
enum layoutdriver_policy_flags {
@@ -148,22 +150,6 @@ struct pnfs_layoutdriver_type {
const struct nfs_pageio_ops *pg_write_ops;
struct pnfs_ds_commit_info *(*get_ds_info) (struct inode *inode);
- void (*mark_request_commit) (struct nfs_page *req,
- struct pnfs_layout_segment *lseg,
- struct nfs_commit_info *cinfo,
- u32 ds_commit_idx);
- void (*clear_request_commit) (struct nfs_page *req,
- struct nfs_commit_info *cinfo);
- int (*scan_commit_lists) (struct nfs_commit_info *cinfo,
- int max);
- void (*recover_commit_reqs) (struct list_head *list,
- struct nfs_commit_info *cinfo);
- struct nfs_page * (*search_commit_reqs)(struct nfs_commit_info *cinfo,
- struct page *page);
- int (*commit_pagelist)(struct inode *inode,
- struct list_head *mds_pages,
- int how,
- struct nfs_commit_info *cinfo);
int (*sync)(struct inode *inode, bool datasync);
@@ -186,6 +172,29 @@ struct pnfs_layoutdriver_type {
int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args);
};
+struct pnfs_commit_ops {
+ void (*setup_ds_info)(struct pnfs_ds_commit_info *,
+ struct pnfs_layout_segment *);
+ void (*release_ds_info)(struct pnfs_ds_commit_info *,
+ struct inode *inode);
+ int (*commit_pagelist)(struct inode *inode,
+ struct list_head *mds_pages,
+ int how,
+ struct nfs_commit_info *cinfo);
+ void (*mark_request_commit) (struct nfs_page *req,
+ struct pnfs_layout_segment *lseg,
+ struct nfs_commit_info *cinfo,
+ u32 ds_commit_idx);
+ void (*clear_request_commit) (struct nfs_page *req,
+ struct nfs_commit_info *cinfo);
+ int (*scan_commit_lists) (struct nfs_commit_info *cinfo,
+ int max);
+ void (*recover_commit_reqs) (struct list_head *list,
+ struct nfs_commit_info *cinfo);
+ struct nfs_page * (*search_commit_reqs)(struct nfs_commit_info *cinfo,
+ struct page *page);
+};
+
struct pnfs_layout_hdr {
refcount_t plh_refcount;
atomic_t plh_outstanding; /* number of RPCs out */
@@ -203,6 +212,7 @@ struct pnfs_layout_hdr {
loff_t plh_lwb; /* last write byte for layoutcommit */
const struct cred *plh_lc_cred; /* layoutcommit cred */
struct inode *plh_inode;
+ struct rcu_head plh_rcu;
};
struct pnfs_device {
@@ -242,6 +252,7 @@ void pnfs_put_lseg(struct pnfs_layout_segment *lseg);
void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, struct nfs_fsinfo *);
void unset_pnfs_layoutdriver(struct nfs_server *);
void pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio);
+void pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req);
void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *);
int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
@@ -267,6 +278,7 @@ bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst,
void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo);
void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
const nfs4_stateid *new,
+ const struct cred *cred,
bool update_barrier);
int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
@@ -326,6 +338,9 @@ int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
void pnfs_error_mark_layout_for_return(struct inode *inode,
struct pnfs_layout_segment *lseg);
+void pnfs_layout_return_unused_byclid(struct nfs_client *clp,
+ enum pnfs_iomode iomode);
+
/* nfs4_deviceid_flags */
enum {
NFS_DEVICEID_INVALID = 0, /* set when MDS clientid recalled */
@@ -360,6 +375,16 @@ bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node);
void nfs4_deviceid_purge_client(const struct nfs_client *);
/* pnfs_nfs.c */
+struct pnfs_commit_array *pnfs_alloc_commit_array(size_t n, gfp_t gfp_flags);
+void pnfs_free_commit_array(struct pnfs_commit_array *p);
+struct pnfs_commit_array *pnfs_add_commit_array(struct pnfs_ds_commit_info *,
+ struct pnfs_commit_array *,
+ struct pnfs_layout_segment *);
+
+void pnfs_generic_ds_cinfo_release_lseg(struct pnfs_ds_commit_info *fl_cinfo,
+ struct pnfs_layout_segment *lseg);
+void pnfs_generic_ds_cinfo_destroy(struct pnfs_ds_commit_info *fl_cinfo);
+
void pnfs_generic_clear_request_commit(struct nfs_page *req,
struct nfs_commit_info *cinfo);
void pnfs_generic_commit_release(void *calldata);
@@ -367,6 +392,8 @@ void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data *data);
void pnfs_generic_rw_release(void *data);
void pnfs_generic_recover_commit_reqs(struct list_head *dst,
struct nfs_commit_info *cinfo);
+struct nfs_page *pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo,
+ struct page *page);
int pnfs_generic_commit_pagelist(struct inode *inode,
struct list_head *mds_pages,
int how,
@@ -438,9 +465,11 @@ static inline int
pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how,
struct nfs_commit_info *cinfo)
{
- if (cinfo->ds == NULL || cinfo->ds->ncommitting == 0)
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+
+ if (fl_cinfo == NULL || fl_cinfo->ncommitting == 0)
return PNFS_NOT_ATTEMPTED;
- return NFS_SERVER(inode)->pnfs_curr_ld->commit_pagelist(inode, mds_pages, how, cinfo);
+ return fl_cinfo->ops->commit_pagelist(inode, mds_pages, how, cinfo);
}
static inline struct pnfs_ds_commit_info *
@@ -454,6 +483,28 @@ pnfs_get_ds_info(struct inode *inode)
}
static inline void
+pnfs_init_ds_commit_info_ops(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode)
+{
+ struct pnfs_ds_commit_info *inode_cinfo = pnfs_get_ds_info(inode);
+ if (inode_cinfo != NULL)
+ fl_cinfo->ops = inode_cinfo->ops;
+}
+
+static inline void
+pnfs_init_ds_commit_info(struct pnfs_ds_commit_info *fl_cinfo)
+{
+ INIT_LIST_HEAD(&fl_cinfo->commits);
+ fl_cinfo->ops = NULL;
+}
+
+static inline void
+pnfs_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode)
+{
+ if (fl_cinfo->ops != NULL && fl_cinfo->ops->release_ds_info != NULL)
+ fl_cinfo->ops->release_ds_info(fl_cinfo, inode);
+}
+
+static inline void
pnfs_generic_mark_devid_invalid(struct nfs4_deviceid_node *node)
{
set_bit(NFS_DEVICEID_INVALID, &node->flags);
@@ -463,24 +514,22 @@ static inline bool
pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
struct nfs_commit_info *cinfo, u32 ds_commit_idx)
{
- struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
- struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
- if (lseg == NULL || ld->mark_request_commit == NULL)
+ if (!lseg || !fl_cinfo->ops->mark_request_commit)
return false;
- ld->mark_request_commit(req, lseg, cinfo, ds_commit_idx);
+ fl_cinfo->ops->mark_request_commit(req, lseg, cinfo, ds_commit_idx);
return true;
}
static inline bool
pnfs_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo)
{
- struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
- struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
- if (ld == NULL || ld->clear_request_commit == NULL)
+ if (!fl_cinfo || !fl_cinfo->ops || !fl_cinfo->ops->clear_request_commit)
return false;
- ld->clear_request_commit(req, cinfo);
+ fl_cinfo->ops->clear_request_commit(req, cinfo);
return true;
}
@@ -488,21 +537,31 @@ static inline int
pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo,
int max)
{
- if (cinfo->ds == NULL || cinfo->ds->nwritten == 0)
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+
+ if (!fl_cinfo || fl_cinfo->nwritten == 0)
return 0;
- else
- return NFS_SERVER(inode)->pnfs_curr_ld->scan_commit_lists(cinfo, max);
+ return fl_cinfo->ops->scan_commit_lists(cinfo, max);
+}
+
+static inline void
+pnfs_recover_commit_reqs(struct list_head *head, struct nfs_commit_info *cinfo)
+{
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+
+ if (fl_cinfo && fl_cinfo->nwritten != 0)
+ fl_cinfo->ops->recover_commit_reqs(head, cinfo);
}
static inline struct nfs_page *
pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
struct page *page)
{
- struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
- if (ld == NULL || ld->search_commit_reqs == NULL)
+ if (!fl_cinfo->ops || !fl_cinfo->ops->search_commit_reqs)
return NULL;
- return ld->search_commit_reqs(cinfo, page);
+ return fl_cinfo->ops->search_commit_reqs(cinfo, page);
}
/* Should the pNFS client commit and return the layout upon a setattr */
@@ -750,6 +809,21 @@ pnfs_get_ds_info(struct inode *inode)
return NULL;
}
+static inline void
+pnfs_init_ds_commit_info_ops(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode)
+{
+}
+
+static inline void
+pnfs_init_ds_commit_info(struct pnfs_ds_commit_info *fl_cinfo)
+{
+}
+
+static inline void
+pnfs_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode)
+{
+}
+
static inline bool
pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
struct nfs_commit_info *cinfo, u32 ds_commit_idx)
@@ -770,6 +844,11 @@ pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo,
return 0;
}
+static inline void
+pnfs_recover_commit_reqs(struct list_head *head, struct nfs_commit_info *cinfo)
+{
+}
+
static inline struct nfs_page *
pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
struct page *page)
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 8b37e7f8e789..e7ddbce48321 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -59,6 +59,17 @@ void pnfs_generic_commit_release(void *calldata)
}
EXPORT_SYMBOL_GPL(pnfs_generic_commit_release);
+static struct pnfs_layout_segment *
+pnfs_free_bucket_lseg(struct pnfs_commit_bucket *bucket)
+{
+ if (list_empty(&bucket->committing) && list_empty(&bucket->written)) {
+ struct pnfs_layout_segment *freeme = bucket->lseg;
+ bucket->lseg = NULL;
+ return freeme;
+ }
+ return NULL;
+}
+
/* The generic layer is about to remove the req from the commit list.
* If this will make the bucket empty, it will need to put the lseg reference.
* Note this must be called holding nfsi->commit_mutex
@@ -78,8 +89,7 @@ pnfs_generic_clear_request_commit(struct nfs_page *req,
bucket = list_first_entry(&req->wb_list,
struct pnfs_commit_bucket,
written);
- freeme = bucket->wlseg;
- bucket->wlseg = NULL;
+ freeme = pnfs_free_bucket_lseg(bucket);
}
out:
nfs_request_remove_commit_list(req, cinfo);
@@ -87,10 +97,154 @@ out:
}
EXPORT_SYMBOL_GPL(pnfs_generic_clear_request_commit);
+struct pnfs_commit_array *
+pnfs_alloc_commit_array(size_t n, gfp_t gfp_flags)
+{
+ struct pnfs_commit_array *p;
+ struct pnfs_commit_bucket *b;
+
+ p = kmalloc(struct_size(p, buckets, n), gfp_flags);
+ if (!p)
+ return NULL;
+ p->nbuckets = n;
+ INIT_LIST_HEAD(&p->cinfo_list);
+ INIT_LIST_HEAD(&p->lseg_list);
+ p->lseg = NULL;
+ for (b = &p->buckets[0]; n != 0; b++, n--) {
+ INIT_LIST_HEAD(&b->written);
+ INIT_LIST_HEAD(&b->committing);
+ b->lseg = NULL;
+ b->direct_verf.committed = NFS_INVALID_STABLE_HOW;
+ }
+ return p;
+}
+EXPORT_SYMBOL_GPL(pnfs_alloc_commit_array);
+
+void
+pnfs_free_commit_array(struct pnfs_commit_array *p)
+{
+ kfree_rcu(p, rcu);
+}
+EXPORT_SYMBOL_GPL(pnfs_free_commit_array);
+
+static struct pnfs_commit_array *
+pnfs_find_commit_array_by_lseg(struct pnfs_ds_commit_info *fl_cinfo,
+ struct pnfs_layout_segment *lseg)
+{
+ struct pnfs_commit_array *array;
+
+ list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) {
+ if (array->lseg == lseg)
+ return array;
+ }
+ return NULL;
+}
+
+struct pnfs_commit_array *
+pnfs_add_commit_array(struct pnfs_ds_commit_info *fl_cinfo,
+ struct pnfs_commit_array *new,
+ struct pnfs_layout_segment *lseg)
+{
+ struct pnfs_commit_array *array;
+
+ array = pnfs_find_commit_array_by_lseg(fl_cinfo, lseg);
+ if (array)
+ return array;
+ new->lseg = lseg;
+ refcount_set(&new->refcount, 1);
+ list_add_rcu(&new->cinfo_list, &fl_cinfo->commits);
+ list_add(&new->lseg_list, &lseg->pls_commits);
+ return new;
+}
+EXPORT_SYMBOL_GPL(pnfs_add_commit_array);
+
+static struct pnfs_commit_array *
+pnfs_lookup_commit_array(struct pnfs_ds_commit_info *fl_cinfo,
+ struct pnfs_layout_segment *lseg)
+{
+ struct pnfs_commit_array *array;
+
+ rcu_read_lock();
+ array = pnfs_find_commit_array_by_lseg(fl_cinfo, lseg);
+ if (!array) {
+ rcu_read_unlock();
+ fl_cinfo->ops->setup_ds_info(fl_cinfo, lseg);
+ rcu_read_lock();
+ array = pnfs_find_commit_array_by_lseg(fl_cinfo, lseg);
+ }
+ rcu_read_unlock();
+ return array;
+}
+
+static void
+pnfs_release_commit_array_locked(struct pnfs_commit_array *array)
+{
+ list_del_rcu(&array->cinfo_list);
+ list_del(&array->lseg_list);
+ pnfs_free_commit_array(array);
+}
+
+static void
+pnfs_put_commit_array_locked(struct pnfs_commit_array *array)
+{
+ if (refcount_dec_and_test(&array->refcount))
+ pnfs_release_commit_array_locked(array);
+}
+
+static void
+pnfs_put_commit_array(struct pnfs_commit_array *array, struct inode *inode)
+{
+ if (refcount_dec_and_lock(&array->refcount, &inode->i_lock)) {
+ pnfs_release_commit_array_locked(array);
+ spin_unlock(&inode->i_lock);
+ }
+}
+
+static struct pnfs_commit_array *
+pnfs_get_commit_array(struct pnfs_commit_array *array)
+{
+ if (refcount_inc_not_zero(&array->refcount))
+ return array;
+ return NULL;
+}
+
+static void
+pnfs_remove_and_free_commit_array(struct pnfs_commit_array *array)
+{
+ array->lseg = NULL;
+ list_del_init(&array->lseg_list);
+ pnfs_put_commit_array_locked(array);
+}
+
+void
+pnfs_generic_ds_cinfo_release_lseg(struct pnfs_ds_commit_info *fl_cinfo,
+ struct pnfs_layout_segment *lseg)
+{
+ struct pnfs_commit_array *array, *tmp;
+
+ list_for_each_entry_safe(array, tmp, &lseg->pls_commits, lseg_list)
+ pnfs_remove_and_free_commit_array(array);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_ds_cinfo_release_lseg);
+
+void
+pnfs_generic_ds_cinfo_destroy(struct pnfs_ds_commit_info *fl_cinfo)
+{
+ struct pnfs_commit_array *array, *tmp;
+
+ list_for_each_entry_safe(array, tmp, &fl_cinfo->commits, cinfo_list)
+ pnfs_remove_and_free_commit_array(array);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_ds_cinfo_destroy);
+
+/*
+ * Locks the nfs_page requests for commit and moves them to
+ * @bucket->committing.
+ */
static int
-pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
- struct nfs_commit_info *cinfo,
- int max)
+pnfs_bucket_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
+ struct nfs_commit_info *cinfo,
+ int max)
{
struct list_head *src = &bucket->written;
struct list_head *dst = &bucket->committing;
@@ -101,158 +255,254 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
if (ret) {
cinfo->ds->nwritten -= ret;
cinfo->ds->ncommitting += ret;
- if (bucket->clseg == NULL)
- bucket->clseg = pnfs_get_lseg(bucket->wlseg);
- if (list_empty(src)) {
- pnfs_put_lseg(bucket->wlseg);
- bucket->wlseg = NULL;
- }
}
return ret;
}
+static int pnfs_bucket_scan_array(struct nfs_commit_info *cinfo,
+ struct pnfs_commit_bucket *buckets,
+ unsigned int nbuckets,
+ int max)
+{
+ unsigned int i;
+ int rv = 0, cnt;
+
+ for (i = 0; i < nbuckets && max != 0; i++) {
+ cnt = pnfs_bucket_scan_ds_commit_list(&buckets[i], cinfo, max);
+ rv += cnt;
+ max -= cnt;
+ }
+ return rv;
+}
+
/* Move reqs from written to committing lists, returning count
* of number moved.
*/
-int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo,
- int max)
+int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max)
{
- int i, rv = 0, cnt;
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+ struct pnfs_commit_array *array;
+ int rv = 0, cnt;
- lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex);
- for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) {
- cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i],
- cinfo, max);
- max -= cnt;
+ rcu_read_lock();
+ list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) {
+ if (!array->lseg || !pnfs_get_commit_array(array))
+ continue;
+ rcu_read_unlock();
+ cnt = pnfs_bucket_scan_array(cinfo, array->buckets,
+ array->nbuckets, max);
+ rcu_read_lock();
+ pnfs_put_commit_array(array, cinfo->inode);
rv += cnt;
+ max -= cnt;
+ if (!max)
+ break;
}
+ rcu_read_unlock();
return rv;
}
EXPORT_SYMBOL_GPL(pnfs_generic_scan_commit_lists);
-/* Pull everything off the committing lists and dump into @dst. */
-void pnfs_generic_recover_commit_reqs(struct list_head *dst,
- struct nfs_commit_info *cinfo)
+static unsigned int
+pnfs_bucket_recover_commit_reqs(struct list_head *dst,
+ struct pnfs_commit_bucket *buckets,
+ unsigned int nbuckets,
+ struct nfs_commit_info *cinfo)
{
struct pnfs_commit_bucket *b;
struct pnfs_layout_segment *freeme;
- int nwritten;
- int i;
+ unsigned int nwritten, ret = 0;
+ unsigned int i;
- lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex);
restart:
- for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
+ for (i = 0, b = buckets; i < nbuckets; i++, b++) {
nwritten = nfs_scan_commit_list(&b->written, dst, cinfo, 0);
if (!nwritten)
continue;
- cinfo->ds->nwritten -= nwritten;
- if (list_empty(&b->written)) {
- freeme = b->wlseg;
- b->wlseg = NULL;
+ ret += nwritten;
+ freeme = pnfs_free_bucket_lseg(b);
+ if (freeme) {
pnfs_put_lseg(freeme);
goto restart;
}
}
+ return ret;
+}
+
+/* Pull everything off the committing lists and dump into @dst. */
+void pnfs_generic_recover_commit_reqs(struct list_head *dst,
+ struct nfs_commit_info *cinfo)
+{
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+ struct pnfs_commit_array *array;
+ unsigned int nwritten;
+
+ lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex);
+ rcu_read_lock();
+ list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) {
+ if (!array->lseg || !pnfs_get_commit_array(array))
+ continue;
+ rcu_read_unlock();
+ nwritten = pnfs_bucket_recover_commit_reqs(dst,
+ array->buckets,
+ array->nbuckets,
+ cinfo);
+ rcu_read_lock();
+ pnfs_put_commit_array(array, cinfo->inode);
+ fl_cinfo->nwritten -= nwritten;
+ }
+ rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(pnfs_generic_recover_commit_reqs);
-static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx)
+static struct nfs_page *
+pnfs_bucket_search_commit_reqs(struct pnfs_commit_bucket *buckets,
+ unsigned int nbuckets, struct page *page)
+{
+ struct nfs_page *req;
+ struct pnfs_commit_bucket *b;
+ unsigned int i;
+
+ /* Linearly search the commit lists for each bucket until a matching
+ * request is found */
+ for (i = 0, b = buckets; i < nbuckets; i++, b++) {
+ list_for_each_entry(req, &b->written, wb_list) {
+ if (req->wb_page == page)
+ return req->wb_head;
+ }
+ list_for_each_entry(req, &b->committing, wb_list) {
+ if (req->wb_page == page)
+ return req->wb_head;
+ }
+ }
+ return NULL;
+}
+
+/* pnfs_generic_search_commit_reqs - Search lists in @cinfo for the head reqest
+ * for @page
+ * @cinfo - commit info for current inode
+ * @page - page to search for matching head request
+ *
+ * Returns a the head request if one is found, otherwise returns NULL.
+ */
+struct nfs_page *
+pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page)
{
struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+ struct pnfs_commit_array *array;
+ struct nfs_page *req;
+
+ list_for_each_entry(array, &fl_cinfo->commits, cinfo_list) {
+ req = pnfs_bucket_search_commit_reqs(array->buckets,
+ array->nbuckets, page);
+ if (req)
+ return req;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_search_commit_reqs);
+
+static struct pnfs_layout_segment *
+pnfs_bucket_get_committing(struct list_head *head,
+ struct pnfs_commit_bucket *bucket,
+ struct nfs_commit_info *cinfo)
+{
+ struct list_head *pos;
+
+ list_for_each(pos, &bucket->committing)
+ cinfo->ds->ncommitting--;
+ list_splice_init(&bucket->committing, head);
+ return pnfs_free_bucket_lseg(bucket);
+}
+
+static struct nfs_commit_data *
+pnfs_bucket_fetch_commitdata(struct pnfs_commit_bucket *bucket,
+ struct nfs_commit_info *cinfo)
+{
+ struct nfs_commit_data *data = nfs_commitdata_alloc(false);
+
+ if (!data)
+ return NULL;
+ data->lseg = pnfs_bucket_get_committing(&data->pages, bucket, cinfo);
+ if (!data->lseg)
+ data->lseg = pnfs_get_lseg(bucket->lseg);
+ return data;
+}
+
+static void pnfs_generic_retry_commit(struct pnfs_commit_bucket *buckets,
+ unsigned int nbuckets,
+ struct nfs_commit_info *cinfo,
+ unsigned int idx)
+{
struct pnfs_commit_bucket *bucket;
struct pnfs_layout_segment *freeme;
- struct list_head *pos;
LIST_HEAD(pages);
- int i;
- mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
- for (i = idx; i < fl_cinfo->nbuckets; i++) {
- bucket = &fl_cinfo->buckets[i];
+ for (bucket = buckets; idx < nbuckets; bucket++, idx++) {
if (list_empty(&bucket->committing))
continue;
- freeme = bucket->clseg;
- bucket->clseg = NULL;
- list_for_each(pos, &bucket->committing)
- cinfo->ds->ncommitting--;
- list_splice_init(&bucket->committing, &pages);
+ mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
+ freeme = pnfs_bucket_get_committing(&pages, bucket, cinfo);
mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
- nfs_retry_commit(&pages, freeme, cinfo, i);
+ nfs_retry_commit(&pages, freeme, cinfo, idx);
pnfs_put_lseg(freeme);
- mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
}
- mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
}
static unsigned int
-pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo,
- struct list_head *list)
+pnfs_bucket_alloc_ds_commits(struct list_head *list,
+ struct pnfs_commit_bucket *buckets,
+ unsigned int nbuckets,
+ struct nfs_commit_info *cinfo)
{
- struct pnfs_ds_commit_info *fl_cinfo;
struct pnfs_commit_bucket *bucket;
struct nfs_commit_data *data;
- int i;
+ unsigned int i;
unsigned int nreq = 0;
- fl_cinfo = cinfo->ds;
- bucket = fl_cinfo->buckets;
- for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) {
+ for (i = 0, bucket = buckets; i < nbuckets; i++, bucket++) {
if (list_empty(&bucket->committing))
continue;
- data = nfs_commitdata_alloc(false);
- if (!data)
- break;
- data->ds_commit_index = i;
- list_add(&data->pages, list);
- nreq++;
+ mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
+ if (!list_empty(&bucket->committing)) {
+ data = pnfs_bucket_fetch_commitdata(bucket, cinfo);
+ if (!data)
+ goto out_error;
+ data->ds_commit_index = i;
+ list_add_tail(&data->list, list);
+ atomic_inc(&cinfo->mds->rpcs_out);
+ nreq++;
+ }
+ mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
}
-
+ return nreq;
+out_error:
+ mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
/* Clean up on error */
- pnfs_generic_retry_commit(cinfo, i);
+ pnfs_generic_retry_commit(buckets, nbuckets, cinfo, i);
return nreq;
}
-static inline
-void pnfs_fetch_commit_bucket_list(struct list_head *pages,
- struct nfs_commit_data *data,
- struct nfs_commit_info *cinfo)
+static unsigned int
+pnfs_alloc_ds_commits_list(struct list_head *list,
+ struct pnfs_ds_commit_info *fl_cinfo,
+ struct nfs_commit_info *cinfo)
{
- struct pnfs_commit_bucket *bucket;
- struct list_head *pos;
-
- bucket = &cinfo->ds->buckets[data->ds_commit_index];
- mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
- list_for_each(pos, &bucket->committing)
- cinfo->ds->ncommitting--;
- list_splice_init(&bucket->committing, pages);
- data->lseg = bucket->clseg;
- bucket->clseg = NULL;
- mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
-
-}
+ struct pnfs_commit_array *array;
+ unsigned int ret = 0;
-/* Helper function for pnfs_generic_commit_pagelist to catch an empty
- * page list. This can happen when two commits race.
- *
- * This must be called instead of nfs_init_commit - call one or the other, but
- * not both!
- */
-static bool
-pnfs_generic_commit_cancel_empty_pagelist(struct list_head *pages,
- struct nfs_commit_data *data,
- struct nfs_commit_info *cinfo)
-{
- if (list_empty(pages)) {
- if (atomic_dec_and_test(&cinfo->mds->rpcs_out))
- wake_up_var(&cinfo->mds->rpcs_out);
- /* don't call nfs_commitdata_release - it tries to put
- * the open_context which is not acquired until nfs_init_commit
- * which has not been called on @data */
- WARN_ON_ONCE(data->context);
- nfs_commit_free(data);
- return true;
+ rcu_read_lock();
+ list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) {
+ if (!array->lseg || !pnfs_get_commit_array(array))
+ continue;
+ rcu_read_unlock();
+ ret += pnfs_bucket_alloc_ds_commits(list, array->buckets,
+ array->nbuckets, cinfo);
+ rcu_read_lock();
+ pnfs_put_commit_array(array, cinfo->inode);
}
-
- return false;
+ rcu_read_unlock();
+ return ret;
}
/* This follows nfs_commit_list pretty closely */
@@ -262,6 +512,7 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
int (*initiate_commit)(struct nfs_commit_data *data,
int how))
{
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
struct nfs_commit_data *data, *tmp;
LIST_HEAD(list);
unsigned int nreq = 0;
@@ -269,40 +520,25 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
if (!list_empty(mds_pages)) {
data = nfs_commitdata_alloc(true);
data->ds_commit_index = -1;
- list_add(&data->pages, &list);
+ list_splice_init(mds_pages, &data->pages);
+ list_add_tail(&data->list, &list);
+ atomic_inc(&cinfo->mds->rpcs_out);
nreq++;
}
- nreq += pnfs_generic_alloc_ds_commits(cinfo, &list);
-
+ nreq += pnfs_alloc_ds_commits_list(&list, fl_cinfo, cinfo);
if (nreq == 0)
goto out;
- atomic_add(nreq, &cinfo->mds->rpcs_out);
-
- list_for_each_entry_safe(data, tmp, &list, pages) {
- list_del_init(&data->pages);
+ list_for_each_entry_safe(data, tmp, &list, list) {
+ list_del(&data->list);
if (data->ds_commit_index < 0) {
- /* another commit raced with us */
- if (pnfs_generic_commit_cancel_empty_pagelist(mds_pages,
- data, cinfo))
- continue;
-
- nfs_init_commit(data, mds_pages, NULL, cinfo);
+ nfs_init_commit(data, NULL, NULL, cinfo);
nfs_initiate_commit(NFS_CLIENT(inode), data,
NFS_PROTO(data->inode),
data->mds_ops, how, 0);
} else {
- LIST_HEAD(pages);
-
- pnfs_fetch_commit_bucket_list(&pages, data, cinfo);
-
- /* another commit raced with us */
- if (pnfs_generic_commit_cancel_empty_pagelist(&pages,
- data, cinfo))
- continue;
-
- nfs_init_commit(data, &pages, data->lseg, cinfo);
+ nfs_init_commit(data, NULL, data->lseg, cinfo);
initiate_commit(data, how);
}
}
@@ -930,32 +1166,33 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
u32 ds_commit_idx)
{
struct list_head *list;
- struct pnfs_commit_bucket *buckets;
+ struct pnfs_commit_array *array;
+ struct pnfs_commit_bucket *bucket;
mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
- buckets = cinfo->ds->buckets;
- list = &buckets[ds_commit_idx].written;
- if (list_empty(list)) {
- if (!pnfs_is_valid_lseg(lseg)) {
- mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
- cinfo->completion_ops->resched_write(cinfo, req);
- return;
- }
- /* Non-empty buckets hold a reference on the lseg. That ref
- * is normally transferred to the COMMIT call and released
- * there. It could also be released if the last req is pulled
- * off due to a rewrite, in which case it will be done in
- * pnfs_common_clear_request_commit
- */
- WARN_ON_ONCE(buckets[ds_commit_idx].wlseg != NULL);
- buckets[ds_commit_idx].wlseg = pnfs_get_lseg(lseg);
- }
+ array = pnfs_lookup_commit_array(cinfo->ds, lseg);
+ if (!array || !pnfs_is_valid_lseg(lseg))
+ goto out_resched;
+ bucket = &array->buckets[ds_commit_idx];
+ list = &bucket->written;
+ /* Non-empty buckets hold a reference on the lseg. That ref
+ * is normally transferred to the COMMIT call and released
+ * there. It could also be released if the last req is pulled
+ * off due to a rewrite, in which case it will be done in
+ * pnfs_common_clear_request_commit
+ */
+ if (!bucket->lseg)
+ bucket->lseg = pnfs_get_lseg(lseg);
set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
cinfo->ds->nwritten++;
nfs_request_add_commit_list_locked(req, list, cinfo);
mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
nfs_mark_page_unstable(req->wb_page, cinfo);
+ return;
+out_resched:
+ mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
+ cinfo->completion_ops->resched_write(cinfo, req);
}
EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 34bb9add2302..13b22e898116 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -250,7 +250,7 @@ static int nfs_readpage_done(struct rpc_task *task,
trace_nfs_readpage_done(task, hdr);
if (task->tk_status == -ESTALE) {
- set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
+ nfs_set_inode_stale(inode);
nfs_mark_for_revalidate(inode);
}
return 0;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index bb14bede6da5..59ef3b13ccca 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -176,6 +176,41 @@ void nfs_sb_deactive(struct super_block *sb)
}
EXPORT_SYMBOL_GPL(nfs_sb_deactive);
+static int __nfs_list_for_each_server(struct list_head *head,
+ int (*fn)(struct nfs_server *, void *),
+ void *data)
+{
+ struct nfs_server *server, *last = NULL;
+ int ret = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, head, client_link) {
+ if (!nfs_sb_active(server->super))
+ continue;
+ rcu_read_unlock();
+ if (last)
+ nfs_sb_deactive(last->super);
+ last = server;
+ ret = fn(server, data);
+ if (ret)
+ goto out;
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+out:
+ if (last)
+ nfs_sb_deactive(last->super);
+ return ret;
+}
+
+int nfs_client_for_each_server(struct nfs_client *clp,
+ int (*fn)(struct nfs_server *, void *),
+ void *data)
+{
+ return __nfs_list_for_each_server(&clp->cl_superblocks, fn, data);
+}
+EXPORT_SYMBOL_GPL(nfs_client_for_each_server);
+
/*
* Deliver file system statistics to userspace
*/
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 0effeee28352..b27ebdccef70 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -98,7 +98,7 @@ static void nfs_do_call_unlink(struct inode *inode, struct nfs_unlinkdata *data)
.callback_ops = &nfs_unlink_ops,
.callback_data = data,
.workqueue = nfsiod_workqueue,
- .flags = RPC_TASK_ASYNC,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
};
struct rpc_task *task;
struct inode *dir = d_inode(data->dentry->d_parent);
@@ -341,7 +341,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
.callback_ops = &nfs_rename_ops,
.workqueue = nfsiod_workqueue,
.rpc_client = NFS_CLIENT(old_dir),
- .flags = RPC_TASK_ASYNC,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
};
data = kzalloc(sizeof(*data), GFP_KERNEL);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index c478b772cc49..df4b87c30ac9 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -149,6 +149,31 @@ static void nfs_io_completion_put(struct nfs_io_completion *ioc)
kref_put(&ioc->refcount, nfs_io_completion_release);
}
+static void
+nfs_page_set_inode_ref(struct nfs_page *req, struct inode *inode)
+{
+ if (!test_and_set_bit(PG_INODE_REF, &req->wb_flags)) {
+ kref_get(&req->wb_kref);
+ atomic_long_inc(&NFS_I(inode)->nrequests);
+ }
+}
+
+static int
+nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode)
+{
+ int ret;
+
+ if (!test_bit(PG_REMOVE, &req->wb_flags))
+ return 0;
+ ret = nfs_page_group_lock(req);
+ if (ret)
+ return ret;
+ if (test_and_clear_bit(PG_REMOVE, &req->wb_flags))
+ nfs_page_set_inode_ref(req, inode);
+ nfs_page_group_unlock(req);
+ return 0;
+}
+
static struct nfs_page *
nfs_page_private_request(struct page *page)
{
@@ -218,6 +243,36 @@ static struct nfs_page *nfs_page_find_head_request(struct page *page)
return req;
}
+static struct nfs_page *nfs_find_and_lock_page_request(struct page *page)
+{
+ struct inode *inode = page_file_mapping(page)->host;
+ struct nfs_page *req, *head;
+ int ret;
+
+ for (;;) {
+ req = nfs_page_find_head_request(page);
+ if (!req)
+ return req;
+ head = nfs_page_group_lock_head(req);
+ if (head != req)
+ nfs_release_request(req);
+ if (IS_ERR(head))
+ return head;
+ ret = nfs_cancel_remove_inode(head, inode);
+ if (ret < 0) {
+ nfs_unlock_and_release_request(head);
+ return ERR_PTR(ret);
+ }
+ /* Ensure that nobody removed the request before we locked it */
+ if (head == nfs_page_private_request(page))
+ break;
+ if (PageSwapCache(page))
+ break;
+ nfs_unlock_and_release_request(head);
+ }
+ return head;
+}
+
/* Adjust the file length if we're writing beyond the end */
static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count)
{
@@ -380,34 +435,6 @@ static void nfs_end_page_writeback(struct nfs_page *req)
}
/*
- * nfs_unroll_locks_and_wait - unlock all newly locked reqs and wait on @req
- *
- * this is a helper function for nfs_lock_and_join_requests
- *
- * @inode - inode associated with request page group, must be holding inode lock
- * @head - head request of page group, must be holding head lock
- * @req - request that couldn't lock and needs to wait on the req bit lock
- *
- * NOTE: this must be called holding page_group bit lock
- * which will be released before returning.
- *
- * returns 0 on success, < 0 on error.
- */
-static void
-nfs_unroll_locks(struct inode *inode, struct nfs_page *head,
- struct nfs_page *req)
-{
- struct nfs_page *tmp;
-
- /* relinquish all the locks successfully grabbed this run */
- for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page) {
- if (!kref_read(&tmp->wb_kref))
- continue;
- nfs_unlock_and_release_request(tmp);
- }
-}
-
-/*
* nfs_destroy_unlinked_subrequests - destroy recently unlinked subrequests
*
* @destroy_list - request list (using wb_this_page) terminated by @old_head
@@ -428,22 +455,29 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
destroy_list = (subreq->wb_this_page == old_head) ?
NULL : subreq->wb_this_page;
+ /* Note: lock subreq in order to change subreq->wb_head */
+ nfs_page_set_headlock(subreq);
WARN_ON_ONCE(old_head != subreq->wb_head);
/* make sure old group is not used */
subreq->wb_this_page = subreq;
+ subreq->wb_head = subreq;
clear_bit(PG_REMOVE, &subreq->wb_flags);
/* Note: races with nfs_page_group_destroy() */
if (!kref_read(&subreq->wb_kref)) {
/* Check if we raced with nfs_page_group_destroy() */
- if (test_and_clear_bit(PG_TEARDOWN, &subreq->wb_flags))
+ if (test_and_clear_bit(PG_TEARDOWN, &subreq->wb_flags)) {
+ nfs_page_clear_headlock(subreq);
nfs_free_request(subreq);
+ } else
+ nfs_page_clear_headlock(subreq);
continue;
}
+ nfs_page_clear_headlock(subreq);
- subreq->wb_head = subreq;
+ nfs_release_request(old_head);
if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) {
nfs_release_request(subreq);
@@ -457,105 +491,43 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
}
/*
- * nfs_lock_and_join_requests - join all subreqs to the head req and return
- * a locked reference, cancelling any pending
- * operations for this page.
- *
- * @page - the page used to lookup the "page group" of nfs_page structures
+ * nfs_join_page_group - destroy subrequests of the head req
+ * @head: the page used to lookup the "page group" of nfs_page structures
+ * @inode: Inode to which the request belongs.
*
* This function joins all sub requests to the head request by first
* locking all requests in the group, cancelling any pending operations
* and finally updating the head request to cover the whole range covered by
* the (former) group. All subrequests are removed from any write or commit
* lists, unlinked from the group and destroyed.
- *
- * Returns a locked, referenced pointer to the head request - which after
- * this call is guaranteed to be the only request associated with the page.
- * Returns NULL if no requests are found for @page, or a ERR_PTR if an
- * error was encountered.
*/
-static struct nfs_page *
-nfs_lock_and_join_requests(struct page *page)
+void
+nfs_join_page_group(struct nfs_page *head, struct inode *inode)
{
- struct inode *inode = page_file_mapping(page)->host;
- struct nfs_page *head, *subreq;
+ struct nfs_page *subreq;
struct nfs_page *destroy_list = NULL;
- unsigned int total_bytes;
- int ret;
+ unsigned int pgbase, off, bytes;
-try_again:
- /*
- * A reference is taken only on the head request which acts as a
- * reference to the whole page group - the group will not be destroyed
- * until the head reference is released.
- */
- head = nfs_page_find_head_request(page);
- if (!head)
- return NULL;
-
- /* lock the page head first in order to avoid an ABBA inefficiency */
- if (!nfs_lock_request(head)) {
- ret = nfs_wait_on_request(head);
- nfs_release_request(head);
- if (ret < 0)
- return ERR_PTR(ret);
- goto try_again;
- }
-
- /* Ensure that nobody removed the request before we locked it */
- if (head != nfs_page_private_request(page) && !PageSwapCache(page)) {
- nfs_unlock_and_release_request(head);
- goto try_again;
- }
-
- ret = nfs_page_group_lock(head);
- if (ret < 0)
- goto release_request;
-
- /* lock each request in the page group */
- total_bytes = head->wb_bytes;
+ pgbase = head->wb_pgbase;
+ bytes = head->wb_bytes;
+ off = head->wb_offset;
for (subreq = head->wb_this_page; subreq != head;
subreq = subreq->wb_this_page) {
-
- if (!kref_get_unless_zero(&subreq->wb_kref)) {
- if (subreq->wb_offset == head->wb_offset + total_bytes)
- total_bytes += subreq->wb_bytes;
- continue;
- }
-
- while (!nfs_lock_request(subreq)) {
- /*
- * Unlock page to allow nfs_page_group_sync_on_bit()
- * to succeed
- */
- nfs_page_group_unlock(head);
- ret = nfs_wait_on_request(subreq);
- if (!ret)
- ret = nfs_page_group_lock(head);
- if (ret < 0) {
- nfs_unroll_locks(inode, head, subreq);
- nfs_release_request(subreq);
- goto release_request;
- }
- }
- /*
- * Subrequests are always contiguous, non overlapping
- * and in order - but may be repeated (mirrored writes).
- */
- if (subreq->wb_offset == (head->wb_offset + total_bytes)) {
- /* keep track of how many bytes this group covers */
- total_bytes += subreq->wb_bytes;
- } else if (WARN_ON_ONCE(subreq->wb_offset < head->wb_offset ||
- ((subreq->wb_offset + subreq->wb_bytes) >
- (head->wb_offset + total_bytes)))) {
- nfs_page_group_unlock(head);
- nfs_unroll_locks(inode, head, subreq);
- nfs_unlock_and_release_request(subreq);
- ret = -EIO;
- goto release_request;
+ /* Subrequests should always form a contiguous range */
+ if (pgbase > subreq->wb_pgbase) {
+ off -= pgbase - subreq->wb_pgbase;
+ bytes += pgbase - subreq->wb_pgbase;
+ pgbase = subreq->wb_pgbase;
}
+ bytes = max(subreq->wb_pgbase + subreq->wb_bytes
+ - pgbase, bytes);
}
+ /* Set the head request's range to cover the former page group */
+ head->wb_pgbase = pgbase;
+ head->wb_bytes = bytes;
+ head->wb_offset = off;
+
/* Now that all requests are locked, make sure they aren't on any list.
* Commit list removal accounting is done after locks are dropped */
subreq = head;
@@ -569,36 +541,52 @@ try_again:
/* destroy list will be terminated by head */
destroy_list = head->wb_this_page;
head->wb_this_page = head;
-
- /* change head request to cover whole range that
- * the former page group covered */
- head->wb_bytes = total_bytes;
}
- /* Postpone destruction of this request */
- if (test_and_clear_bit(PG_REMOVE, &head->wb_flags)) {
- set_bit(PG_INODE_REF, &head->wb_flags);
- kref_get(&head->wb_kref);
- atomic_long_inc(&NFS_I(inode)->nrequests);
- }
+ nfs_destroy_unlinked_subrequests(destroy_list, head, inode);
+}
- nfs_page_group_unlock(head);
+/*
+ * nfs_lock_and_join_requests - join all subreqs to the head req
+ * @page: the page used to lookup the "page group" of nfs_page structures
+ *
+ * This function joins all sub requests to the head request by first
+ * locking all requests in the group, cancelling any pending operations
+ * and finally updating the head request to cover the whole range covered by
+ * the (former) group. All subrequests are removed from any write or commit
+ * lists, unlinked from the group and destroyed.
+ *
+ * Returns a locked, referenced pointer to the head request - which after
+ * this call is guaranteed to be the only request associated with the page.
+ * Returns NULL if no requests are found for @page, or a ERR_PTR if an
+ * error was encountered.
+ */
+static struct nfs_page *
+nfs_lock_and_join_requests(struct page *page)
+{
+ struct inode *inode = page_file_mapping(page)->host;
+ struct nfs_page *head;
+ int ret;
- nfs_destroy_unlinked_subrequests(destroy_list, head, inode);
+ /*
+ * A reference is taken only on the head request which acts as a
+ * reference to the whole page group - the group will not be destroyed
+ * until the head reference is released.
+ */
+ head = nfs_find_and_lock_page_request(page);
+ if (IS_ERR_OR_NULL(head))
+ return head;
- /* Did we lose a race with nfs_inode_remove_request()? */
- if (!(PagePrivate(page) || PageSwapCache(page))) {
+ /* lock each request in the page group */
+ ret = nfs_page_group_lock_subrequests(head);
+ if (ret < 0) {
nfs_unlock_and_release_request(head);
- return NULL;
+ return ERR_PTR(ret);
}
- /* still holds ref on head from nfs_page_find_head_request
- * and still has lock on head from lock loop */
- return head;
+ nfs_join_page_group(head, inode);
-release_request:
- nfs_unlock_and_release_request(head);
- return ERR_PTR(ret);
+ return head;
}
static void nfs_write_error(struct nfs_page *req, int error)
@@ -1707,7 +1695,7 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
.callback_ops = call_ops,
.callback_data = data,
.workqueue = nfsiod_workqueue,
- .flags = RPC_TASK_ASYNC | flags,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF | flags,
.priority = priority,
};
/* Set up the initial task struct. */
@@ -1746,14 +1734,19 @@ void nfs_init_commit(struct nfs_commit_data *data,
struct pnfs_layout_segment *lseg,
struct nfs_commit_info *cinfo)
{
- struct nfs_page *first = nfs_list_entry(head->next);
- struct nfs_open_context *ctx = nfs_req_openctx(first);
- struct inode *inode = d_inode(ctx->dentry);
+ struct nfs_page *first;
+ struct nfs_open_context *ctx;
+ struct inode *inode;
/* Set up the RPC argument and reply structs
* NB: take care not to mess about with data->commit et al. */
- list_splice_init(head, &data->pages);
+ if (head)
+ list_splice_init(head, &data->pages);
+
+ first = nfs_list_entry(data->pages.next);
+ ctx = nfs_req_openctx(first);
+ inode = d_inode(ctx->dentry);
data->inode = inode;
data->cred = ctx->cred;
@@ -1869,8 +1862,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
/* Okay, COMMIT succeeded, apparently. Check the verifier
* returned by the server against all stored verfs. */
- if (verf->committed > NFS_UNSTABLE &&
- !nfs_write_verifier_cmp(&req->wb_verf, &verf->verifier)) {
+ if (nfs_write_match_verf(verf, req)) {
/* We have a match */
if (req->wb_page)
nfs_inode_remove_request(req);
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 65b3abbcce4e..2f834add165b 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -7402,6 +7402,10 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
struct ocfs2_inline_data *idata = &di->id2.i_data;
+ /* No need to punch hole beyond i_size. */
+ if (start >= i_size_read(inode))
+ return 0;
+
if (end > i_size_read(inode))
end = i_size_read(inode);
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index c740159d9ad1..af375e049aae 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -346,23 +346,8 @@ static ssize_t orangefs_file_read_iter(struct kiocb *iocb,
struct iov_iter *iter)
{
int ret;
- struct orangefs_read_options *ro;
-
orangefs_stats.reads++;
- /*
- * Remember how they set "count" in read(2) or pread(2) or whatever -
- * users can use count as a knob to control orangefs io size and later
- * we can try to help them fill as many pages as possible in readpage.
- */
- if (!iocb->ki_filp->private_data) {
- iocb->ki_filp->private_data = kmalloc(sizeof *ro, GFP_KERNEL);
- if (!iocb->ki_filp->private_data)
- return(ENOMEM);
- ro = iocb->ki_filp->private_data;
- ro->blksiz = iter->count;
- }
-
down_read(&file_inode(iocb->ki_filp)->i_rwsem);
ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp));
if (ret)
@@ -650,12 +635,6 @@ static int orangefs_lock(struct file *filp, int cmd, struct file_lock *fl)
return rc;
}
-static int orangefs_file_open(struct inode * inode, struct file *file)
-{
- file->private_data = NULL;
- return generic_file_open(inode, file);
-}
-
static int orangefs_flush(struct file *file, fl_owner_t id)
{
/*
@@ -666,19 +645,8 @@ static int orangefs_flush(struct file *file, fl_owner_t id)
* on an explicit fsync call. This duplicates historical OrangeFS
* behavior.
*/
- struct inode *inode = file->f_mapping->host;
int r;
- kfree(file->private_data);
- file->private_data = NULL;
-
- if (inode->i_state & I_DIRTY_TIME) {
- spin_lock(&inode->i_lock);
- inode->i_state &= ~I_DIRTY_TIME;
- spin_unlock(&inode->i_lock);
- mark_inode_dirty_sync(inode);
- }
-
r = filemap_write_and_wait_range(file->f_mapping, 0, LLONG_MAX);
if (r > 0)
return 0;
@@ -694,7 +662,7 @@ const struct file_operations orangefs_file_operations = {
.lock = orangefs_lock,
.unlocked_ioctl = orangefs_ioctl,
.mmap = orangefs_file_mmap,
- .open = orangefs_file_open,
+ .open = generic_file_open,
.flush = orangefs_flush,
.release = orangefs_file_release,
.fsync = orangefs_fsync,
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 961c0fd8675a..12ae630fbed7 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -259,46 +259,19 @@ static int orangefs_readpage(struct file *file, struct page *page)
pgoff_t index; /* which page */
struct page *next_page;
char *kaddr;
- struct orangefs_read_options *ro = file->private_data;
loff_t read_size;
- loff_t roundedup;
int buffer_index = -1; /* orangefs shared memory slot */
int slot_index; /* index into slot */
int remaining;
/*
- * If they set some miniscule size for "count" in read(2)
- * (for example) then let's try to read a page, or the whole file
- * if it is smaller than a page. Once "count" goes over a page
- * then lets round up to the highest page size multiple that is
- * less than or equal to "count" and do that much orangefs IO and
- * try to fill as many pages as we can from it.
- *
- * "count" should be represented in ro->blksiz.
- *
- * inode->i_size = file size.
+ * Get up to this many bytes from Orangefs at a time and try
+ * to fill them into the page cache at once. Tests with dd made
+ * this seem like a reasonable static number, if there was
+ * interest perhaps this number could be made setable through
+ * sysfs...
*/
- if (ro) {
- if (ro->blksiz < PAGE_SIZE) {
- if (inode->i_size < PAGE_SIZE)
- read_size = inode->i_size;
- else
- read_size = PAGE_SIZE;
- } else {
- roundedup = ((PAGE_SIZE - 1) & ro->blksiz) ?
- ((ro->blksiz + PAGE_SIZE) & ~(PAGE_SIZE -1)) :
- ro->blksiz;
- if (roundedup > inode->i_size)
- read_size = inode->i_size;
- else
- read_size = roundedup;
-
- }
- } else {
- read_size = PAGE_SIZE;
- }
- if (!read_size)
- read_size = PAGE_SIZE;
+ read_size = 524288;
if (PageDirty(page))
orangefs_launder_page(page);
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index ed67f39fa7ce..e12aeb9623d6 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -239,10 +239,6 @@ struct orangefs_write_range {
kgid_t gid;
};
-struct orangefs_read_options {
- ssize_t blksiz;
-};
-
extern struct orangefs_stats orangefs_stats;
/*
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 9fc47c2e078d..9709cf22cab3 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -36,6 +36,13 @@ static int ovl_ccup_get(char *buf, const struct kernel_param *param)
module_param_call(check_copy_up, ovl_ccup_set, ovl_ccup_get, NULL, 0644);
MODULE_PARM_DESC(check_copy_up, "Obsolete; does nothing");
+static bool ovl_must_copy_xattr(const char *name)
+{
+ return !strcmp(name, XATTR_POSIX_ACL_ACCESS) ||
+ !strcmp(name, XATTR_POSIX_ACL_DEFAULT) ||
+ !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
+}
+
int ovl_copy_xattr(struct dentry *old, struct dentry *new)
{
ssize_t list_size, size, value_size = 0;
@@ -107,8 +114,13 @@ retry:
continue; /* Discard */
}
error = vfs_setxattr(new, name, value, size, 0);
- if (error)
- break;
+ if (error) {
+ if (error != -EOPNOTSUPP || ovl_must_copy_xattr(name))
+ break;
+
+ /* Ignore failure to copy unknown xattrs */
+ error = 0;
+ }
}
kfree(value);
out:
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 8e57d5372b8f..279009dee366 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -42,7 +42,7 @@ int ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
return err;
}
-static struct dentry *ovl_lookup_temp(struct dentry *workdir)
+struct dentry *ovl_lookup_temp(struct dentry *workdir)
{
struct dentry *temp;
char name[20];
@@ -243,6 +243,9 @@ static int ovl_instantiate(struct dentry *dentry, struct inode *inode,
ovl_dir_modified(dentry->d_parent, false);
ovl_dentry_set_upper_alias(dentry);
+ ovl_dentry_update_reval(dentry, newdentry,
+ DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE);
+
if (!hardlink) {
/*
* ovl_obtain_alias() can be called after ovl_create_real()
@@ -819,6 +822,28 @@ static bool ovl_pure_upper(struct dentry *dentry)
!ovl_test_flag(OVL_WHITEOUTS, d_inode(dentry));
}
+static void ovl_drop_nlink(struct dentry *dentry)
+{
+ struct inode *inode = d_inode(dentry);
+ struct dentry *alias;
+
+ /* Try to find another, hashed alias */
+ spin_lock(&inode->i_lock);
+ hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
+ if (alias != dentry && !d_unhashed(alias))
+ break;
+ }
+ spin_unlock(&inode->i_lock);
+
+ /*
+ * Changes to underlying layers may cause i_nlink to lose sync with
+ * reality. In this case prevent the link count from going to zero
+ * prematurely.
+ */
+ if (inode->i_nlink > !!alias)
+ drop_nlink(inode);
+}
+
static int ovl_do_remove(struct dentry *dentry, bool is_dir)
{
int err;
@@ -856,7 +881,7 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir)
if (is_dir)
clear_nlink(dentry->d_inode);
else
- drop_nlink(dentry->d_inode);
+ ovl_drop_nlink(dentry);
}
ovl_nlink_end(dentry);
@@ -1201,7 +1226,7 @@ static int ovl_rename(struct inode *olddir, struct dentry *old,
if (new_is_dir)
clear_nlink(d_inode(new));
else
- drop_nlink(d_inode(new));
+ ovl_drop_nlink(new);
}
ovl_dir_modified(old->d_parent, ovl_type_origin(old) ||
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index 6f54d70cef27..475c61f53f0f 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -308,29 +308,35 @@ static struct dentry *ovl_obtain_alias(struct super_block *sb,
ovl_set_flag(OVL_UPPERDATA, inode);
dentry = d_find_any_alias(inode);
- if (!dentry) {
- dentry = d_alloc_anon(inode->i_sb);
- if (!dentry)
- goto nomem;
- oe = ovl_alloc_entry(lower ? 1 : 0);
- if (!oe)
- goto nomem;
-
- if (lower) {
- oe->lowerstack->dentry = dget(lower);
- oe->lowerstack->layer = lowerpath->layer;
- }
- dentry->d_fsdata = oe;
- if (upper_alias)
- ovl_dentry_set_upper_alias(dentry);
+ if (dentry)
+ goto out_iput;
+
+ dentry = d_alloc_anon(inode->i_sb);
+ if (unlikely(!dentry))
+ goto nomem;
+ oe = ovl_alloc_entry(lower ? 1 : 0);
+ if (!oe)
+ goto nomem;
+
+ if (lower) {
+ oe->lowerstack->dentry = dget(lower);
+ oe->lowerstack->layer = lowerpath->layer;
}
+ dentry->d_fsdata = oe;
+ if (upper_alias)
+ ovl_dentry_set_upper_alias(dentry);
+
+ ovl_dentry_update_reval(dentry, upper,
+ DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE);
return d_instantiate_anon(dentry, inode);
nomem:
- iput(inode);
dput(dentry);
- return ERR_PTR(-ENOMEM);
+ dentry = ERR_PTR(-ENOMEM);
+out_iput:
+ iput(inode);
+ return dentry;
}
/* Get the upper or lower dentry in stach whose on layer @idx */
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 79e8994e3bc1..b0d42ece4d7c 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -79,6 +79,7 @@ static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid)
{
bool samefs = ovl_same_fs(dentry->d_sb);
unsigned int xinobits = ovl_xino_bits(dentry->d_sb);
+ unsigned int xinoshift = 64 - xinobits;
if (samefs) {
/*
@@ -89,22 +90,22 @@ static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid)
stat->dev = dentry->d_sb->s_dev;
return 0;
} else if (xinobits) {
- unsigned int shift = 64 - xinobits;
/*
* All inode numbers of underlying fs should not be using the
* high xinobits, so we use high xinobits to partition the
* overlay st_ino address space. The high bits holds the fsid
- * (upper fsid is 0). This way overlay inode numbers are unique
- * and all inodes use overlay st_dev. Inode numbers are also
- * persistent for a given layer configuration.
+ * (upper fsid is 0). The lowest xinobit is reserved for mapping
+ * the non-peresistent inode numbers range in case of overflow.
+ * This way all overlay inode numbers are unique and use the
+ * overlay st_dev.
*/
- if (stat->ino >> shift) {
- pr_warn_ratelimited("inode number too big (%pd2, ino=%llu, xinobits=%d)\n",
- dentry, stat->ino, xinobits);
- } else {
- stat->ino |= ((u64)fsid) << shift;
+ if (likely(!(stat->ino >> xinoshift))) {
+ stat->ino |= ((u64)fsid) << (xinoshift + 1);
stat->dev = dentry->d_sb->s_dev;
return 0;
+ } else if (ovl_xino_warn(dentry->d_sb)) {
+ pr_warn_ratelimited("inode number too big (%pd2, ino=%llu, xinobits=%d)\n",
+ dentry, stat->ino, xinobits);
}
}
@@ -504,7 +505,7 @@ static const struct address_space_operations ovl_aops = {
/*
* It is possible to stack overlayfs instance on top of another
- * overlayfs instance as lower layer. We need to annonate the
+ * overlayfs instance as lower layer. We need to annotate the
* stackable i_mutex locks according to stack level of the super
* block instance. An overlayfs instance can never be in stack
* depth 0 (there is always a real fs below it). An overlayfs
@@ -561,27 +562,73 @@ static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode)
#endif
}
-static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev,
- unsigned long ino, int fsid)
+static void ovl_next_ino(struct inode *inode)
+{
+ struct ovl_fs *ofs = inode->i_sb->s_fs_info;
+
+ inode->i_ino = atomic_long_inc_return(&ofs->last_ino);
+ if (unlikely(!inode->i_ino))
+ inode->i_ino = atomic_long_inc_return(&ofs->last_ino);
+}
+
+static void ovl_map_ino(struct inode *inode, unsigned long ino, int fsid)
{
int xinobits = ovl_xino_bits(inode->i_sb);
+ unsigned int xinoshift = 64 - xinobits;
/*
* When d_ino is consistent with st_ino (samefs or i_ino has enough
* bits to encode layer), set the same value used for st_ino to i_ino,
* so inode number exposed via /proc/locks and a like will be
* consistent with d_ino and st_ino values. An i_ino value inconsistent
- * with d_ino also causes nfsd readdirplus to fail. When called from
- * ovl_new_inode(), ino arg is 0, so i_ino will be updated to real
- * upper inode i_ino on ovl_inode_init() or ovl_inode_update().
+ * with d_ino also causes nfsd readdirplus to fail.
*/
- if (ovl_same_dev(inode->i_sb)) {
- inode->i_ino = ino;
- if (xinobits && fsid && !(ino >> (64 - xinobits)))
- inode->i_ino |= (unsigned long)fsid << (64 - xinobits);
- } else {
- inode->i_ino = get_next_ino();
+ inode->i_ino = ino;
+ if (ovl_same_fs(inode->i_sb)) {
+ return;
+ } else if (xinobits && likely(!(ino >> xinoshift))) {
+ inode->i_ino |= (unsigned long)fsid << (xinoshift + 1);
+ return;
+ }
+
+ /*
+ * For directory inodes on non-samefs with xino disabled or xino
+ * overflow, we allocate a non-persistent inode number, to be used for
+ * resolving st_ino collisions in ovl_map_dev_ino().
+ *
+ * To avoid ino collision with legitimate xino values from upper
+ * layer (fsid 0), use the lowest xinobit to map the non
+ * persistent inode numbers to the unified st_ino address space.
+ */
+ if (S_ISDIR(inode->i_mode)) {
+ ovl_next_ino(inode);
+ if (xinobits) {
+ inode->i_ino &= ~0UL >> xinobits;
+ inode->i_ino |= 1UL << xinoshift;
+ }
}
+}
+
+void ovl_inode_init(struct inode *inode, struct ovl_inode_params *oip,
+ unsigned long ino, int fsid)
+{
+ struct inode *realinode;
+
+ if (oip->upperdentry)
+ OVL_I(inode)->__upperdentry = oip->upperdentry;
+ if (oip->lowerpath && oip->lowerpath->dentry)
+ OVL_I(inode)->lower = igrab(d_inode(oip->lowerpath->dentry));
+ if (oip->lowerdata)
+ OVL_I(inode)->lowerdata = igrab(d_inode(oip->lowerdata));
+
+ realinode = ovl_inode_real(inode);
+ ovl_copyattr(realinode, inode);
+ ovl_copyflags(realinode, inode);
+ ovl_map_ino(inode, ino, fsid);
+}
+
+static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev)
+{
inode->i_mode = mode;
inode->i_flags |= S_NOCMTIME;
#ifdef CONFIG_FS_POSIX_ACL
@@ -719,7 +766,7 @@ struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev)
inode = new_inode(sb);
if (inode)
- ovl_fill_inode(inode, mode, rdev, 0, 0);
+ ovl_fill_inode(inode, mode, rdev);
return inode;
}
@@ -891,7 +938,7 @@ struct inode *ovl_get_inode(struct super_block *sb,
struct dentry *lowerdentry = lowerpath ? lowerpath->dentry : NULL;
bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry,
oip->index);
- int fsid = bylower ? oip->lowerpath->layer->fsid : 0;
+ int fsid = bylower ? lowerpath->layer->fsid : 0;
bool is_dir, metacopy = false;
unsigned long ino = 0;
int err = oip->newinode ? -EEXIST : -ENOMEM;
@@ -941,9 +988,11 @@ struct inode *ovl_get_inode(struct super_block *sb,
err = -ENOMEM;
goto out_err;
}
+ ino = realinode->i_ino;
+ fsid = lowerpath->layer->fsid;
}
- ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev, ino, fsid);
- ovl_inode_init(inode, upperdentry, lowerdentry, oip->lowerdata);
+ ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev);
+ ovl_inode_init(inode, oip, ino, fsid);
if (upperdentry && ovl_is_impuredir(upperdentry))
ovl_set_flag(OVL_IMPURE, inode);
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index ed9e129fae04..0db23baf98e7 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -845,7 +845,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
if (err)
goto out;
- if (upperdentry && unlikely(ovl_dentry_remote(upperdentry))) {
+ if (upperdentry && upperdentry->d_flags & DCACHE_OP_REAL) {
dput(upperdentry);
err = -EREMOTE;
goto out;
@@ -1076,6 +1076,9 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
goto out_free_oe;
}
+ ovl_dentry_update_reval(dentry, upperdentry,
+ DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE);
+
revert_creds(old_cred);
if (origin_path) {
dput(origin_path->dentry);
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 3d3f2b8bdae5..e6f3670146ed 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -48,6 +48,12 @@ enum ovl_entry_flag {
OVL_E_CONNECTED,
};
+enum {
+ OVL_XINO_OFF,
+ OVL_XINO_AUTO,
+ OVL_XINO_ON,
+};
+
/*
* The tuple (fh,uuid) is a universal unique identifier for a copy up origin,
* where:
@@ -87,7 +93,7 @@ struct ovl_fb {
u8 flags; /* OVL_FH_FLAG_* */
u8 type; /* fid_type of fid */
uuid_t uuid; /* uuid of filesystem */
- u32 fid[0]; /* file identifier should be 32bit aligned in-memory */
+ u32 fid[]; /* file identifier should be 32bit aligned in-memory */
} __packed;
/* In-memory and on-wire format for overlay file handle */
@@ -230,6 +236,8 @@ bool ovl_index_all(struct super_block *sb);
bool ovl_verify_lower(struct super_block *sb);
struct ovl_entry *ovl_alloc_entry(unsigned int numlower);
bool ovl_dentry_remote(struct dentry *dentry);
+void ovl_dentry_update_reval(struct dentry *dentry, struct dentry *upperdentry,
+ unsigned int mask);
bool ovl_dentry_weird(struct dentry *dentry);
enum ovl_path_type ovl_path_type(struct dentry *dentry);
void ovl_path_upper(struct dentry *dentry, struct path *path);
@@ -264,8 +272,6 @@ void ovl_set_upperdata(struct inode *inode);
bool ovl_redirect_dir(struct super_block *sb);
const char *ovl_dentry_get_redirect(struct dentry *dentry);
void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect);
-void ovl_inode_init(struct inode *inode, struct dentry *upperdentry,
- struct dentry *lowerdentry, struct dentry *lowerdata);
void ovl_inode_update(struct inode *inode, struct dentry *upperdentry);
void ovl_dir_modified(struct dentry *dentry, bool impurity);
u64 ovl_dentry_version_get(struct dentry *dentry);
@@ -301,6 +307,16 @@ static inline bool ovl_is_impuredir(struct dentry *dentry)
return ovl_check_dir_xattr(dentry, OVL_XATTR_IMPURE);
}
+/*
+ * With xino=auto, we do best effort to keep all inodes on same st_dev and
+ * d_ino consistent with st_ino.
+ * With xino=on, we do the same effort but we warn if we failed.
+ */
+static inline bool ovl_xino_warn(struct super_block *sb)
+{
+ return OVL_FS(sb)->config.xino == OVL_XINO_ON;
+}
+
/* All layers on same fs? */
static inline bool ovl_same_fs(struct super_block *sb)
{
@@ -410,6 +426,8 @@ struct ovl_inode_params {
char *redirect;
struct dentry *lowerdata;
};
+void ovl_inode_init(struct inode *inode, struct ovl_inode_params *oip,
+ unsigned long ino, int fsid);
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev);
struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real,
bool is_upper);
@@ -451,6 +469,7 @@ struct ovl_cattr {
struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry,
struct ovl_cattr *attr);
int ovl_cleanup(struct inode *dir, struct dentry *dentry);
+struct dentry *ovl_lookup_temp(struct dentry *workdir);
struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr);
/* file.c */
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index 89015ea822e7..5762d802fe01 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -75,6 +75,8 @@ struct ovl_fs {
struct inode *indexdir_trap;
/* -1: disabled, 0: same fs, 1..32: number of unused ino bits */
int xino_mode;
+ /* For allocation of non-persistent inode numbers */
+ atomic_long_t last_ino;
};
static inline struct ovl_fs *OVL_FS(struct super_block *sb)
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 40ac9ce2465a..e452ff7d583d 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -438,15 +438,23 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
/* Map inode number to lower fs unique range */
static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid,
- const char *name, int namelen)
+ const char *name, int namelen, bool warn)
{
- if (ino >> (64 - xinobits)) {
- pr_warn_ratelimited("d_ino too big (%.*s, ino=%llu, xinobits=%d)\n",
- namelen, name, ino, xinobits);
+ unsigned int xinoshift = 64 - xinobits;
+
+ if (unlikely(ino >> xinoshift)) {
+ if (warn) {
+ pr_warn_ratelimited("d_ino too big (%.*s, ino=%llu, xinobits=%d)\n",
+ namelen, name, ino, xinobits);
+ }
return ino;
}
- return ino | ((u64)fsid) << (64 - xinobits);
+ /*
+ * The lowest xinobit is reserved for mapping the non-peresistent inode
+ * numbers range, but this range is only exposed via st_ino, not here.
+ */
+ return ino | ((u64)fsid) << (xinoshift + 1);
}
/*
@@ -515,7 +523,8 @@ get:
} else if (xinobits && !OVL_TYPE_UPPER(type)) {
ino = ovl_remap_lower_ino(ino, xinobits,
ovl_layer_lower(this)->fsid,
- p->name, p->len);
+ p->name, p->len,
+ ovl_xino_warn(dir->d_sb));
}
out:
@@ -645,6 +654,7 @@ struct ovl_readdir_translate {
u64 parent_ino;
int fsid;
int xinobits;
+ bool xinowarn;
};
static int ovl_fill_real(struct dir_context *ctx, const char *name,
@@ -665,7 +675,7 @@ static int ovl_fill_real(struct dir_context *ctx, const char *name,
ino = p->ino;
} else if (rdt->xinobits) {
ino = ovl_remap_lower_ino(ino, rdt->xinobits, rdt->fsid,
- name, namelen);
+ name, namelen, rdt->xinowarn);
}
return orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type);
@@ -696,6 +706,7 @@ static int ovl_iterate_real(struct file *file, struct dir_context *ctx)
.ctx.actor = ovl_fill_real,
.orig_ctx = ctx,
.xinobits = ovl_xino_bits(dir->d_sb),
+ .xinowarn = ovl_xino_warn(dir->d_sb),
};
if (rdt.xinobits && lower_layer)
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index ac967f1cb6e5..732ad5495c92 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -113,53 +113,54 @@ bug:
return dentry;
}
-static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags)
+static int ovl_revalidate_real(struct dentry *d, unsigned int flags, bool weak)
{
- struct ovl_entry *oe = dentry->d_fsdata;
- unsigned int i;
int ret = 1;
- for (i = 0; i < oe->numlower; i++) {
- struct dentry *d = oe->lowerstack[i].dentry;
-
- if (d->d_flags & DCACHE_OP_REVALIDATE) {
- ret = d->d_op->d_revalidate(d, flags);
- if (ret < 0)
- return ret;
- if (!ret) {
- if (!(flags & LOOKUP_RCU))
- d_invalidate(d);
- return -ESTALE;
- }
+ if (weak) {
+ if (d->d_flags & DCACHE_OP_WEAK_REVALIDATE)
+ ret = d->d_op->d_weak_revalidate(d, flags);
+ } else if (d->d_flags & DCACHE_OP_REVALIDATE) {
+ ret = d->d_op->d_revalidate(d, flags);
+ if (!ret) {
+ if (!(flags & LOOKUP_RCU))
+ d_invalidate(d);
+ ret = -ESTALE;
}
}
- return 1;
+ return ret;
}
-static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags)
+static int ovl_dentry_revalidate_common(struct dentry *dentry,
+ unsigned int flags, bool weak)
{
struct ovl_entry *oe = dentry->d_fsdata;
+ struct dentry *upper;
unsigned int i;
int ret = 1;
- for (i = 0; i < oe->numlower; i++) {
- struct dentry *d = oe->lowerstack[i].dentry;
+ upper = ovl_dentry_upper(dentry);
+ if (upper)
+ ret = ovl_revalidate_real(upper, flags, weak);
- if (d->d_flags & DCACHE_OP_WEAK_REVALIDATE) {
- ret = d->d_op->d_weak_revalidate(d, flags);
- if (ret <= 0)
- break;
- }
+ for (i = 0; ret > 0 && i < oe->numlower; i++) {
+ ret = ovl_revalidate_real(oe->lowerstack[i].dentry, flags,
+ weak);
}
return ret;
}
-static const struct dentry_operations ovl_dentry_operations = {
- .d_release = ovl_dentry_release,
- .d_real = ovl_d_real,
-};
+static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ return ovl_dentry_revalidate_common(dentry, flags, false);
+}
+
+static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ return ovl_dentry_revalidate_common(dentry, flags, true);
+}
-static const struct dentry_operations ovl_reval_dentry_operations = {
+static const struct dentry_operations ovl_dentry_operations = {
.d_release = ovl_dentry_release,
.d_real = ovl_d_real,
.d_revalidate = ovl_dentry_revalidate,
@@ -316,12 +317,6 @@ static const char *ovl_redirect_mode_def(void)
return ovl_redirect_dir_def ? "on" : "off";
}
-enum {
- OVL_XINO_OFF,
- OVL_XINO_AUTO,
- OVL_XINO_ON,
-};
-
static const char * const ovl_xino_str[] = {
"off",
"auto",
@@ -751,13 +746,12 @@ static int ovl_mount_dir(const char *name, struct path *path)
ovl_unescape(tmp);
err = ovl_mount_dir_noesc(tmp, path);
- if (!err)
- if (ovl_dentry_remote(path->dentry)) {
- pr_err("filesystem on '%s' not supported as upperdir\n",
- tmp);
- path_put_init(path);
- err = -EINVAL;
- }
+ if (!err && path->dentry->d_flags & DCACHE_OP_REAL) {
+ pr_err("filesystem on '%s' not supported as upperdir\n",
+ tmp);
+ path_put_init(path);
+ err = -EINVAL;
+ }
kfree(tmp);
}
return err;
@@ -778,7 +772,7 @@ static int ovl_check_namelen(struct path *path, struct ovl_fs *ofs,
}
static int ovl_lower_dir(const char *name, struct path *path,
- struct ovl_fs *ofs, int *stack_depth, bool *remote)
+ struct ovl_fs *ofs, int *stack_depth)
{
int fh_type;
int err;
@@ -793,9 +787,6 @@ static int ovl_lower_dir(const char *name, struct path *path,
*stack_depth = max(*stack_depth, path->mnt->mnt_sb->s_stack_depth);
- if (ovl_dentry_remote(path->dentry))
- *remote = true;
-
/*
* The inodes index feature and NFS export need to encode and decode
* file handles, so they require that all layers support them.
@@ -1074,11 +1065,73 @@ out:
return err;
}
+/*
+ * Returns 1 if RENAME_WHITEOUT is supported, 0 if not supported and
+ * negative values if error is encountered.
+ */
+static int ovl_check_rename_whiteout(struct dentry *workdir)
+{
+ struct inode *dir = d_inode(workdir);
+ struct dentry *temp;
+ struct dentry *dest;
+ struct dentry *whiteout;
+ struct name_snapshot name;
+ int err;
+
+ inode_lock_nested(dir, I_MUTEX_PARENT);
+
+ temp = ovl_create_temp(workdir, OVL_CATTR(S_IFREG | 0));
+ err = PTR_ERR(temp);
+ if (IS_ERR(temp))
+ goto out_unlock;
+
+ dest = ovl_lookup_temp(workdir);
+ err = PTR_ERR(dest);
+ if (IS_ERR(dest)) {
+ dput(temp);
+ goto out_unlock;
+ }
+
+ /* Name is inline and stable - using snapshot as a copy helper */
+ take_dentry_name_snapshot(&name, temp);
+ err = ovl_do_rename(dir, temp, dir, dest, RENAME_WHITEOUT);
+ if (err) {
+ if (err == -EINVAL)
+ err = 0;
+ goto cleanup_temp;
+ }
+
+ whiteout = lookup_one_len(name.name.name, workdir, name.name.len);
+ err = PTR_ERR(whiteout);
+ if (IS_ERR(whiteout))
+ goto cleanup_temp;
+
+ err = ovl_is_whiteout(whiteout);
+
+ /* Best effort cleanup of whiteout and temp file */
+ if (err)
+ ovl_cleanup(dir, whiteout);
+ dput(whiteout);
+
+cleanup_temp:
+ ovl_cleanup(dir, temp);
+ release_dentry_name_snapshot(&name);
+ dput(temp);
+ dput(dest);
+
+out_unlock:
+ inode_unlock(dir);
+
+ return err;
+}
+
static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
struct path *workpath)
{
struct vfsmount *mnt = ofs->upper_mnt;
struct dentry *temp;
+ bool rename_whiteout;
+ bool d_type;
int fh_type;
int err;
@@ -1104,11 +1157,8 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
if (err < 0)
goto out;
- /*
- * We allowed this configuration and don't want to break users over
- * kernel upgrade. So warn instead of erroring out.
- */
- if (!err)
+ d_type = err;
+ if (!d_type)
pr_warn("upper fs needs to support d_type.\n");
/* Check if upper/work fs supports O_TMPFILE */
@@ -1119,6 +1169,16 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
else
pr_warn("upper fs does not support tmpfile.\n");
+
+ /* Check if upper/work fs supports RENAME_WHITEOUT */
+ err = ovl_check_rename_whiteout(ofs->workdir);
+ if (err < 0)
+ goto out;
+
+ rename_whiteout = err;
+ if (!rename_whiteout)
+ pr_warn("upper fs does not support RENAME_WHITEOUT.\n");
+
/*
* Check if upper/work fs supports trusted.overlay.* xattr
*/
@@ -1133,6 +1193,18 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
vfs_removexattr(ofs->workdir, OVL_XATTR_OPAQUE);
}
+ /*
+ * We allowed sub-optimal upper fs configuration and don't want to break
+ * users over kernel upgrade, but we never allowed remote upper fs, so
+ * we can enforce strict requirements for remote upper fs.
+ */
+ if (ovl_dentry_remote(ofs->workdir) &&
+ (!d_type || !rename_whiteout || ofs->noxattr)) {
+ pr_err("upper fs missing required features.\n");
+ err = -EINVAL;
+ goto out;
+ }
+
/* Check if upper/work fs supports file handles */
fh_type = ovl_can_decode_fh(ofs->workdir->d_sb);
if (ofs->config.index && !fh_type) {
@@ -1401,11 +1473,12 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs,
/*
* When all layers on same fs, overlay can use real inode numbers.
- * With mount option "xino=on", mounter declares that there are enough
- * free high bits in underlying fs to hold the unique fsid.
+ * With mount option "xino=<on|auto>", mounter declares that there are
+ * enough free high bits in underlying fs to hold the unique fsid.
* If overlayfs does encounter underlying inodes using the high xino
* bits reserved for fsid, it emits a warning and uses the original
- * inode number.
+ * inode number or a non persistent inode number allocated from a
+ * dedicated range.
*/
if (ofs->numfs - !ofs->upper_mnt == 1) {
if (ofs->config.xino == OVL_XINO_ON)
@@ -1413,14 +1486,16 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs,
ofs->xino_mode = 0;
} else if (ofs->config.xino == OVL_XINO_OFF) {
ofs->xino_mode = -1;
- } else if (ofs->config.xino == OVL_XINO_ON && ofs->xino_mode < 0) {
+ } else if (ofs->xino_mode < 0) {
/*
* This is a roundup of number of bits needed for encoding
- * fsid, where fsid 0 is reserved for upper fs even with
- * lower only overlay.
+ * fsid, where fsid 0 is reserved for upper fs (even with
+ * lower only overlay) +1 extra bit is reserved for the non
+ * persistent inode number range that is used for resolving
+ * xino lower bits overflow.
*/
- BUILD_BUG_ON(ilog2(OVL_MAX_STACK) > 31);
- ofs->xino_mode = ilog2(ofs->numfs - 1) + 1;
+ BUILD_BUG_ON(ilog2(OVL_MAX_STACK) > 30);
+ ofs->xino_mode = ilog2(ofs->numfs - 1) + 2;
}
if (ofs->xino_mode > 0) {
@@ -1440,7 +1515,6 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb,
char *lowertmp, *lower;
struct path *stack = NULL;
unsigned int stacklen, numlower = 0, i;
- bool remote = false;
struct ovl_entry *oe;
err = -ENOMEM;
@@ -1472,7 +1546,7 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb,
lower = lowertmp;
for (numlower = 0; numlower < stacklen; numlower++) {
err = ovl_lower_dir(lower, &stack[numlower], ofs,
- &sb->s_stack_depth, &remote);
+ &sb->s_stack_depth);
if (err)
goto out_err;
@@ -1500,11 +1574,6 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb,
oe->lowerstack[i].layer = &ofs->layers[i+1];
}
- if (remote)
- sb->s_d_op = &ovl_reval_dentry_operations;
- else
- sb->s_d_op = &ovl_dentry_operations;
-
out:
for (i = 0; i < numlower; i++)
path_put(&stack[i]);
@@ -1589,6 +1658,44 @@ static int ovl_check_overlapping_layers(struct super_block *sb,
return 0;
}
+static struct dentry *ovl_get_root(struct super_block *sb,
+ struct dentry *upperdentry,
+ struct ovl_entry *oe)
+{
+ struct dentry *root;
+ struct ovl_path *lowerpath = &oe->lowerstack[0];
+ unsigned long ino = d_inode(lowerpath->dentry)->i_ino;
+ int fsid = lowerpath->layer->fsid;
+ struct ovl_inode_params oip = {
+ .upperdentry = upperdentry,
+ .lowerpath = lowerpath,
+ };
+
+ root = d_make_root(ovl_new_inode(sb, S_IFDIR, 0));
+ if (!root)
+ return NULL;
+
+ root->d_fsdata = oe;
+
+ if (upperdentry) {
+ /* Root inode uses upper st_ino/i_ino */
+ ino = d_inode(upperdentry)->i_ino;
+ fsid = 0;
+ ovl_dentry_set_upper_alias(root);
+ if (ovl_is_impuredir(upperdentry))
+ ovl_set_flag(OVL_IMPURE, d_inode(root));
+ }
+
+ /* Root is always merge -> can have whiteouts */
+ ovl_set_flag(OVL_WHITEOUTS, d_inode(root));
+ ovl_dentry_set_flag(OVL_E_CONNECTED, root);
+ ovl_set_upperdata(d_inode(root));
+ ovl_inode_init(d_inode(root), &oip, ino, fsid);
+ ovl_dentry_update_reval(root, upperdentry, DCACHE_OP_WEAK_REVALIDATE);
+
+ return root;
+}
+
static int ovl_fill_super(struct super_block *sb, void *data, int silent)
{
struct path upperpath = { };
@@ -1598,6 +1705,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
struct cred *cred;
int err;
+ sb->s_d_op = &ovl_dentry_operations;
+
err = -ENOMEM;
ofs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL);
if (!ofs)
@@ -1624,6 +1733,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
sb->s_stack_depth = 0;
sb->s_maxbytes = MAX_LFS_FILESIZE;
+ atomic_long_set(&ofs->last_ino, 1);
/* Assume underlaying fs uses 32bit inodes unless proven otherwise */
if (ofs->config.xino != OVL_XINO_OFF) {
ofs->xino_mode = BITS_PER_LONG - 32;
@@ -1710,25 +1820,11 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
sb->s_flags |= SB_POSIXACL;
err = -ENOMEM;
- root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, 0));
+ root_dentry = ovl_get_root(sb, upperpath.dentry, oe);
if (!root_dentry)
goto out_free_oe;
- root_dentry->d_fsdata = oe;
-
mntput(upperpath.mnt);
- if (upperpath.dentry) {
- ovl_dentry_set_upper_alias(root_dentry);
- if (ovl_is_impuredir(upperpath.dentry))
- ovl_set_flag(OVL_IMPURE, d_inode(root_dentry));
- }
-
- /* Root is always merge -> can have whiteouts */
- ovl_set_flag(OVL_WHITEOUTS, d_inode(root_dentry));
- ovl_dentry_set_flag(OVL_E_CONNECTED, root_dentry);
- ovl_set_upperdata(d_inode(root_dentry));
- ovl_inode_init(d_inode(root_dentry), upperpath.dentry,
- ovl_dentry_lower(root_dentry), NULL);
sb->s_root = root_dentry;
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 042f7eb4f7f4..36b60788ee47 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -93,8 +93,24 @@ struct ovl_entry *ovl_alloc_entry(unsigned int numlower)
bool ovl_dentry_remote(struct dentry *dentry)
{
return dentry->d_flags &
- (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE |
- DCACHE_OP_REAL);
+ (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE);
+}
+
+void ovl_dentry_update_reval(struct dentry *dentry, struct dentry *upperdentry,
+ unsigned int mask)
+{
+ struct ovl_entry *oe = OVL_E(dentry);
+ unsigned int i, flags = 0;
+
+ if (upperdentry)
+ flags |= upperdentry->d_flags;
+ for (i = 0; i < oe->numlower; i++)
+ flags |= oe->lowerstack[i].dentry->d_flags;
+
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags &= ~mask;
+ dentry->d_flags |= flags & mask;
+ spin_unlock(&dentry->d_lock);
}
bool ovl_dentry_weird(struct dentry *dentry)
@@ -386,24 +402,6 @@ void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect)
oi->redirect = redirect;
}
-void ovl_inode_init(struct inode *inode, struct dentry *upperdentry,
- struct dentry *lowerdentry, struct dentry *lowerdata)
-{
- struct inode *realinode = d_inode(upperdentry ?: lowerdentry);
-
- if (upperdentry)
- OVL_I(inode)->__upperdentry = upperdentry;
- if (lowerdentry)
- OVL_I(inode)->lower = igrab(d_inode(lowerdentry));
- if (lowerdata)
- OVL_I(inode)->lowerdata = igrab(d_inode(lowerdata));
-
- ovl_copyattr(realinode, inode);
- ovl_copyflags(realinode, inode);
- if (!inode->i_ino)
- inode->i_ino = realinode->i_ino;
-}
-
void ovl_inode_update(struct inode *inode, struct dentry *upperdentry)
{
struct inode *upperinode = d_inode(upperdentry);
@@ -416,8 +414,6 @@ void ovl_inode_update(struct inode *inode, struct dentry *upperdentry)
smp_wmb();
OVL_I(inode)->__upperdentry = upperdentry;
if (inode_unhashed(inode)) {
- if (!inode->i_ino)
- inode->i_ino = upperinode->i_ino;
inode->i_private = upperinode;
__insert_inode_hash(inode, (unsigned long) upperinode);
}
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 5efaf3708ec6..8e16f14bb05a 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -635,28 +635,35 @@ int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns,
int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
- unsigned long size = 0, resident = 0, shared = 0, text = 0, data = 0;
struct mm_struct *mm = get_task_mm(task);
if (mm) {
+ unsigned long size;
+ unsigned long resident = 0;
+ unsigned long shared = 0;
+ unsigned long text = 0;
+ unsigned long data = 0;
+
size = task_statm(mm, &shared, &text, &data, &resident);
mmput(mm);
- }
- /*
- * For quick read, open code by putting numbers directly
- * expected format is
- * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
- * size, resident, shared, text, data);
- */
- seq_put_decimal_ull(m, "", size);
- seq_put_decimal_ull(m, " ", resident);
- seq_put_decimal_ull(m, " ", shared);
- seq_put_decimal_ull(m, " ", text);
- seq_put_decimal_ull(m, " ", 0);
- seq_put_decimal_ull(m, " ", data);
- seq_put_decimal_ull(m, " ", 0);
- seq_putc(m, '\n');
+ /*
+ * For quick read, open code by putting numbers directly
+ * expected format is
+ * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
+ * size, resident, shared, text, data);
+ */
+ seq_put_decimal_ull(m, "", size);
+ seq_put_decimal_ull(m, " ", resident);
+ seq_put_decimal_ull(m, " ", shared);
+ seq_put_decimal_ull(m, " ", text);
+ seq_put_decimal_ull(m, " ", 0);
+ seq_put_decimal_ull(m, " ", data);
+ seq_put_decimal_ull(m, " ", 0);
+ seq_putc(m, '\n');
+ } else {
+ seq_write(m, "0 0 0 0 0 0 0\n", 14);
+ }
return 0;
}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 74f948a6b621..6042b646ab27 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1839,9 +1839,9 @@ void proc_pid_evict_inode(struct proc_inode *ei)
struct pid *pid = ei->pid;
if (S_ISDIR(ei->vfs_inode.i_mode)) {
- spin_lock(&pid->wait_pidfd.lock);
+ spin_lock(&pid->lock);
hlist_del_init_rcu(&ei->sibling_inodes);
- spin_unlock(&pid->wait_pidfd.lock);
+ spin_unlock(&pid->lock);
}
put_pid(pid);
@@ -1877,9 +1877,9 @@ struct inode *proc_pid_make_inode(struct super_block * sb,
/* Let the pid remember us for quick removal */
ei->pid = pid;
if (S_ISDIR(mode)) {
- spin_lock(&pid->wait_pidfd.lock);
+ spin_lock(&pid->lock);
hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
- spin_unlock(&pid->wait_pidfd.lock);
+ spin_unlock(&pid->lock);
}
task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
@@ -3273,7 +3273,7 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
void proc_flush_pid(struct pid *pid)
{
- proc_invalidate_siblings_dcache(&pid->inodes, &pid->wait_pidfd.lock);
+ proc_invalidate_siblings_dcache(&pid->inodes, &pid->lock);
put_pid(pid);
}
diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c
index c1dea9b8222e..d0989a443c77 100644
--- a/fs/proc/cpuinfo.c
+++ b/fs/proc/cpuinfo.c
@@ -17,6 +17,7 @@ static int cpuinfo_open(struct inode *inode, struct file *file)
}
static const struct proc_ops cpuinfo_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
.proc_open = cpuinfo_open,
.proc_read = seq_read,
.proc_lseek = seq_lseek,
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 3faed94e4b65..4ed6dabdf6ff 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -531,6 +531,12 @@ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode,
return p;
}
+static inline void pde_set_flags(struct proc_dir_entry *pde)
+{
+ if (pde->proc_ops->proc_flags & PROC_ENTRY_PERMANENT)
+ pde->flags |= PROC_ENTRY_PERMANENT;
+}
+
struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
struct proc_dir_entry *parent,
const struct proc_ops *proc_ops, void *data)
@@ -541,6 +547,7 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
if (!p)
return NULL;
p->proc_ops = proc_ops;
+ pde_set_flags(p);
return proc_register(parent, p);
}
EXPORT_SYMBOL(proc_create_data);
@@ -572,6 +579,7 @@ static int proc_seq_release(struct inode *inode, struct file *file)
}
static const struct proc_ops proc_seq_ops = {
+ /* not permanent -- can call into arbitrary seq_operations */
.proc_open = proc_seq_open,
.proc_read = seq_read,
.proc_lseek = seq_lseek,
@@ -602,6 +610,7 @@ static int proc_single_open(struct inode *inode, struct file *file)
}
static const struct proc_ops proc_single_ops = {
+ /* not permanent -- can call into arbitrary ->single_show */
.proc_open = proc_single_open,
.proc_read = seq_read,
.proc_lseek = seq_lseek,
@@ -662,9 +671,13 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
de = pde_subdir_find(parent, fn, len);
if (de) {
- rb_erase(&de->subdir_node, &parent->subdir);
- if (S_ISDIR(de->mode)) {
- parent->nlink--;
+ if (unlikely(pde_is_permanent(de))) {
+ WARN(1, "removing permanent /proc entry '%s'", de->name);
+ de = NULL;
+ } else {
+ rb_erase(&de->subdir_node, &parent->subdir);
+ if (S_ISDIR(de->mode))
+ parent->nlink--;
}
}
write_unlock(&proc_subdir_lock);
@@ -700,12 +713,24 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
write_unlock(&proc_subdir_lock);
return -ENOENT;
}
+ if (unlikely(pde_is_permanent(root))) {
+ write_unlock(&proc_subdir_lock);
+ WARN(1, "removing permanent /proc entry '%s/%s'",
+ root->parent->name, root->name);
+ return -EINVAL;
+ }
rb_erase(&root->subdir_node, &parent->subdir);
de = root;
while (1) {
next = pde_subdir_first(de);
if (next) {
+ if (unlikely(pde_is_permanent(root))) {
+ write_unlock(&proc_subdir_lock);
+ WARN(1, "removing permanent /proc entry '%s/%s'",
+ next->parent->name, next->name);
+ return -EINVAL;
+ }
rb_erase(&next->subdir_node, &de->subdir);
de = next;
continue;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 1e730ea1dcd6..fb4cace9ea41 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -202,6 +202,7 @@ static void unuse_pde(struct proc_dir_entry *pde)
/* pde is locked on entry, unlocked on exit */
static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
+ __releases(&pde->pde_unload_lock)
{
/*
* close() (proc_reg_release()) can't delete an entry and proceed:
@@ -258,135 +259,204 @@ void proc_entry_rundown(struct proc_dir_entry *de)
spin_unlock(&de->pde_unload_lock);
}
+static loff_t pde_lseek(struct proc_dir_entry *pde, struct file *file, loff_t offset, int whence)
+{
+ typeof_member(struct proc_ops, proc_lseek) lseek;
+
+ lseek = pde->proc_ops->proc_lseek;
+ if (!lseek)
+ lseek = default_llseek;
+ return lseek(file, offset, whence);
+}
+
static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
loff_t rv = -EINVAL;
- if (use_pde(pde)) {
- typeof_member(struct proc_ops, proc_lseek) lseek;
- lseek = pde->proc_ops->proc_lseek;
- if (!lseek)
- lseek = default_llseek;
- rv = lseek(file, offset, whence);
+ if (pde_is_permanent(pde)) {
+ return pde_lseek(pde, file, offset, whence);
+ } else if (use_pde(pde)) {
+ rv = pde_lseek(pde, file, offset, whence);
unuse_pde(pde);
}
return rv;
}
+static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+ typeof_member(struct proc_ops, proc_read) read;
+
+ read = pde->proc_ops->proc_read;
+ if (read)
+ return read(file, buf, count, ppos);
+ return -EIO;
+}
+
static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
ssize_t rv = -EIO;
- if (use_pde(pde)) {
- typeof_member(struct proc_ops, proc_read) read;
- read = pde->proc_ops->proc_read;
- if (read)
- rv = read(file, buf, count, ppos);
+ if (pde_is_permanent(pde)) {
+ return pde_read(pde, file, buf, count, ppos);
+ } else if (use_pde(pde)) {
+ rv = pde_read(pde, file, buf, count, ppos);
unuse_pde(pde);
}
return rv;
}
+static ssize_t pde_write(struct proc_dir_entry *pde, struct file *file, const char __user *buf, size_t count, loff_t *ppos)
+{
+ typeof_member(struct proc_ops, proc_write) write;
+
+ write = pde->proc_ops->proc_write;
+ if (write)
+ return write(file, buf, count, ppos);
+ return -EIO;
+}
+
static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
ssize_t rv = -EIO;
- if (use_pde(pde)) {
- typeof_member(struct proc_ops, proc_write) write;
- write = pde->proc_ops->proc_write;
- if (write)
- rv = write(file, buf, count, ppos);
+ if (pde_is_permanent(pde)) {
+ return pde_write(pde, file, buf, count, ppos);
+ } else if (use_pde(pde)) {
+ rv = pde_write(pde, file, buf, count, ppos);
unuse_pde(pde);
}
return rv;
}
+static __poll_t pde_poll(struct proc_dir_entry *pde, struct file *file, struct poll_table_struct *pts)
+{
+ typeof_member(struct proc_ops, proc_poll) poll;
+
+ poll = pde->proc_ops->proc_poll;
+ if (poll)
+ return poll(file, pts);
+ return DEFAULT_POLLMASK;
+}
+
static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
__poll_t rv = DEFAULT_POLLMASK;
- if (use_pde(pde)) {
- typeof_member(struct proc_ops, proc_poll) poll;
- poll = pde->proc_ops->proc_poll;
- if (poll)
- rv = poll(file, pts);
+ if (pde_is_permanent(pde)) {
+ return pde_poll(pde, file, pts);
+ } else if (use_pde(pde)) {
+ rv = pde_poll(pde, file, pts);
unuse_pde(pde);
}
return rv;
}
+static long pde_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg)
+{
+ typeof_member(struct proc_ops, proc_ioctl) ioctl;
+
+ ioctl = pde->proc_ops->proc_ioctl;
+ if (ioctl)
+ return ioctl(file, cmd, arg);
+ return -ENOTTY;
+}
+
static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
long rv = -ENOTTY;
- if (use_pde(pde)) {
- typeof_member(struct proc_ops, proc_ioctl) ioctl;
- ioctl = pde->proc_ops->proc_ioctl;
- if (ioctl)
- rv = ioctl(file, cmd, arg);
+ if (pde_is_permanent(pde)) {
+ return pde_ioctl(pde, file, cmd, arg);
+ } else if (use_pde(pde)) {
+ rv = pde_ioctl(pde, file, cmd, arg);
unuse_pde(pde);
}
return rv;
}
#ifdef CONFIG_COMPAT
+static long pde_compat_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg)
+{
+ typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl;
+
+ compat_ioctl = pde->proc_ops->proc_compat_ioctl;
+ if (compat_ioctl)
+ return compat_ioctl(file, cmd, arg);
+ return -ENOTTY;
+}
+
static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
long rv = -ENOTTY;
- if (use_pde(pde)) {
- typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl;
-
- compat_ioctl = pde->proc_ops->proc_compat_ioctl;
- if (compat_ioctl)
- rv = compat_ioctl(file, cmd, arg);
+ if (pde_is_permanent(pde)) {
+ return pde_compat_ioctl(pde, file, cmd, arg);
+ } else if (use_pde(pde)) {
+ rv = pde_compat_ioctl(pde, file, cmd, arg);
unuse_pde(pde);
}
return rv;
}
#endif
+static int pde_mmap(struct proc_dir_entry *pde, struct file *file, struct vm_area_struct *vma)
+{
+ typeof_member(struct proc_ops, proc_mmap) mmap;
+
+ mmap = pde->proc_ops->proc_mmap;
+ if (mmap)
+ return mmap(file, vma);
+ return -EIO;
+}
+
static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
int rv = -EIO;
- if (use_pde(pde)) {
- typeof_member(struct proc_ops, proc_mmap) mmap;
- mmap = pde->proc_ops->proc_mmap;
- if (mmap)
- rv = mmap(file, vma);
+ if (pde_is_permanent(pde)) {
+ return pde_mmap(pde, file, vma);
+ } else if (use_pde(pde)) {
+ rv = pde_mmap(pde, file, vma);
unuse_pde(pde);
}
return rv;
}
static unsigned long
-proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr,
+pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned long orig_addr,
unsigned long len, unsigned long pgoff,
unsigned long flags)
{
- struct proc_dir_entry *pde = PDE(file_inode(file));
- unsigned long rv = -EIO;
-
- if (use_pde(pde)) {
- typeof_member(struct proc_ops, proc_get_unmapped_area) get_area;
+ typeof_member(struct proc_ops, proc_get_unmapped_area) get_area;
- get_area = pde->proc_ops->proc_get_unmapped_area;
+ get_area = pde->proc_ops->proc_get_unmapped_area;
#ifdef CONFIG_MMU
- if (!get_area)
- get_area = current->mm->get_unmapped_area;
+ if (!get_area)
+ get_area = current->mm->get_unmapped_area;
#endif
+ if (get_area)
+ return get_area(file, orig_addr, len, pgoff, flags);
+ return orig_addr;
+}
+
+static unsigned long
+proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ struct proc_dir_entry *pde = PDE(file_inode(file));
+ unsigned long rv = -EIO;
- if (get_area)
- rv = get_area(file, orig_addr, len, pgoff, flags);
- else
- rv = orig_addr;
+ if (pde_is_permanent(pde)) {
+ return pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags);
+ } else if (use_pde(pde)) {
+ rv = pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags);
unuse_pde(pde);
}
return rv;
@@ -400,6 +470,13 @@ static int proc_reg_open(struct inode *inode, struct file *file)
typeof_member(struct proc_ops, proc_release) release;
struct pde_opener *pdeo;
+ if (pde_is_permanent(pde)) {
+ open = pde->proc_ops->proc_open;
+ if (open)
+ rv = open(inode, file);
+ return rv;
+ }
+
/*
* Ensure that
* 1) PDE's ->release hook will be called no matter what
@@ -449,6 +526,17 @@ static int proc_reg_release(struct inode *inode, struct file *file)
{
struct proc_dir_entry *pde = PDE(inode);
struct pde_opener *pdeo;
+
+ if (pde_is_permanent(pde)) {
+ typeof_member(struct proc_ops, proc_release) release;
+
+ release = pde->proc_ops->proc_release;
+ if (release) {
+ return release(inode, file);
+ }
+ return 0;
+ }
+
spin_lock(&pde->pde_unload_lock);
list_for_each_entry(pdeo, &pde->pde_openers, lh) {
if (pdeo->file == file) {
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 9e294f0290e5..917cc85e3466 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -61,6 +61,7 @@ struct proc_dir_entry {
struct rb_node subdir_node;
char *name;
umode_t mode;
+ u8 flags;
u8 namelen;
char inline_name[];
} __randomize_layout;
@@ -73,6 +74,11 @@ struct proc_dir_entry {
0)
#define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct proc_dir_entry))
+static inline bool pde_is_permanent(const struct proc_dir_entry *pde)
+{
+ return pde->flags & PROC_ENTRY_PERMANENT;
+}
+
extern struct kmem_cache *proc_dir_entry_cache;
void pde_free(struct proc_dir_entry *pde);
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index ec1b7d2fb773..b38ad552887f 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -50,6 +50,7 @@ static __poll_t kmsg_poll(struct file *file, poll_table *wait)
static const struct proc_ops kmsg_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
.proc_read = kmsg_read,
.proc_poll = kmsg_poll,
.proc_open = kmsg_open,
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 0449edf460f5..46b3293015fe 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -224,6 +224,7 @@ static int stat_open(struct inode *inode, struct file *file)
}
static const struct proc_ops stat_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
.proc_open = stat_open,
.proc_read = seq_read,
.proc_lseek = seq_lseek,
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3ba9ae83bff5..8d382d4ec067 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -123,38 +123,14 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
}
#endif
-static void vma_stop(struct proc_maps_private *priv)
-{
- struct mm_struct *mm = priv->mm;
-
- release_task_mempolicy(priv);
- up_read(&mm->mmap_sem);
- mmput(mm);
-}
-
-static struct vm_area_struct *
-m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma)
-{
- if (vma == priv->tail_vma)
- return NULL;
- return vma->vm_next ?: priv->tail_vma;
-}
-
-static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma)
-{
- if (m->count < m->size) /* vma is copied successfully */
- m->version = m_next_vma(m->private, vma) ? vma->vm_end : -1UL;
-}
-
static void *m_start(struct seq_file *m, loff_t *ppos)
{
struct proc_maps_private *priv = m->private;
- unsigned long last_addr = m->version;
+ unsigned long last_addr = *ppos;
struct mm_struct *mm;
struct vm_area_struct *vma;
- unsigned int pos = *ppos;
- /* See m_cache_vma(). Zero at the start or after lseek. */
+ /* See m_next(). Zero at the start or after lseek. */
if (last_addr == -1UL)
return NULL;
@@ -163,64 +139,59 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
return ERR_PTR(-ESRCH);
mm = priv->mm;
- if (!mm || !mmget_not_zero(mm))
+ if (!mm || !mmget_not_zero(mm)) {
+ put_task_struct(priv->task);
+ priv->task = NULL;
return NULL;
+ }
if (down_read_killable(&mm->mmap_sem)) {
mmput(mm);
+ put_task_struct(priv->task);
+ priv->task = NULL;
return ERR_PTR(-EINTR);
}
hold_task_mempolicy(priv);
priv->tail_vma = get_gate_vma(mm);
- if (last_addr) {
- vma = find_vma(mm, last_addr - 1);
- if (vma && vma->vm_start <= last_addr)
- vma = m_next_vma(priv, vma);
- if (vma)
- return vma;
- }
-
- m->version = 0;
- if (pos < mm->map_count) {
- for (vma = mm->mmap; pos; pos--) {
- m->version = vma->vm_start;
- vma = vma->vm_next;
- }
+ vma = find_vma(mm, last_addr);
+ if (vma)
return vma;
- }
-
- /* we do not bother to update m->version in this case */
- if (pos == mm->map_count && priv->tail_vma)
- return priv->tail_vma;
- vma_stop(priv);
- return NULL;
+ return priv->tail_vma;
}
-static void *m_next(struct seq_file *m, void *v, loff_t *pos)
+static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
{
struct proc_maps_private *priv = m->private;
- struct vm_area_struct *next;
+ struct vm_area_struct *next, *vma = v;
+
+ if (vma == priv->tail_vma)
+ next = NULL;
+ else if (vma->vm_next)
+ next = vma->vm_next;
+ else
+ next = priv->tail_vma;
+
+ *ppos = next ? next->vm_start : -1UL;
- (*pos)++;
- next = m_next_vma(priv, v);
- if (!next)
- vma_stop(priv);
return next;
}
static void m_stop(struct seq_file *m, void *v)
{
struct proc_maps_private *priv = m->private;
+ struct mm_struct *mm = priv->mm;
- if (!IS_ERR_OR_NULL(v))
- vma_stop(priv);
- if (priv->task) {
- put_task_struct(priv->task);
- priv->task = NULL;
- }
+ if (!priv->task)
+ return;
+
+ release_task_mempolicy(priv);
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+ put_task_struct(priv->task);
+ priv->task = NULL;
}
static int proc_maps_open(struct inode *inode, struct file *file,
@@ -363,7 +334,6 @@ done:
static int show_map(struct seq_file *m, void *v)
{
show_map_vma(m, v);
- m_cache_vma(m, v);
return 0;
}
@@ -847,8 +817,6 @@ static int show_smap(struct seq_file *m, void *v)
seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
show_smap_vma_flags(m, vma);
- m_cache_vma(m, vma);
-
return 0;
}
@@ -1887,7 +1855,6 @@ static int show_numa_map(struct seq_file *m, void *v)
seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10);
out:
seq_putc(m, '\n');
- m_cache_vma(m, vma);
return 0;
}
diff --git a/fs/read_write.c b/fs/read_write.c
index 59d819c5b92e..bbfa9b12b15e 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -331,7 +331,8 @@ COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned i
}
#endif
-#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT)
+#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \
+ defined(__ARCH_WANT_SYS_LLSEEK)
SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
unsigned long, offset_low, loff_t __user *, result,
unsigned int, whence)
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index 4075e41408b4..5129efc6f2e6 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -842,7 +842,7 @@ static void balance_leaf_paste_right_whole(struct tree_balance *tb,
struct item_head *pasted;
struct buffer_info bi;
- buffer_info_init_right(tb, &bi);
+ buffer_info_init_right(tb, &bi);
leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
/* append item in R[0] */
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 45e1a5d11af3..adb21bea3d60 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -184,11 +184,12 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
}
/* we need to make sure nobody is changing the file size beneath us */
-{
- int depth = reiserfs_write_unlock_nested(inode->i_sb);
- inode_lock(inode);
- reiserfs_write_lock_nested(inode->i_sb, depth);
-}
+ {
+ int depth = reiserfs_write_unlock_nested(inode->i_sb);
+
+ inode_lock(inode);
+ reiserfs_write_lock_nested(inode->i_sb, depth);
+ }
reiserfs_write_lock(inode->i_sb);
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 959a066b7bb0..1594687582f0 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -838,10 +838,10 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
*/
INC_DIR_INODE_NLINK(dir)
- retval = reiserfs_new_inode(&th, dir, mode, NULL /*symlink */ ,
- old_format_only(dir->i_sb) ?
- EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE,
- dentry, inode, &security);
+ retval = reiserfs_new_inode(&th, dir, mode, NULL /*symlink */,
+ old_format_only(dir->i_sb) ?
+ EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE,
+ dentry, inode, &security);
if (retval) {
DEC_DIR_INODE_NLINK(dir)
goto out_failed;
@@ -967,7 +967,7 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
reiserfs_update_sd(&th, inode);
DEC_DIR_INODE_NLINK(dir)
- dir->i_size -= (DEH_SIZE + de.de_entrylen);
+ dir->i_size -= (DEH_SIZE + de.de_entrylen);
reiserfs_update_sd(&th, dir);
/* prevent empty directory from getting lost */
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 1600034a929b..70f5fdf99bf6 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -68,13 +68,6 @@ int seq_open(struct file *file, const struct seq_operations *op)
p->file = file;
/*
- * Wrappers around seq_open(e.g. swaps_open) need to be
- * aware of this. If they set f_version themselves, they
- * should call seq_open first and then set f_version.
- */
- file->f_version = 0;
-
- /*
* seq_files support lseek() and pread(). They do not implement
* write() at all, but we clear FMODE_PWRITE here for historical
* reasons.
@@ -94,7 +87,6 @@ static int traverse(struct seq_file *m, loff_t offset)
int error = 0;
void *p;
- m->version = 0;
m->index = 0;
m->count = m->from = 0;
if (!offset)
@@ -161,25 +153,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
mutex_lock(&m->lock);
/*
- * seq_file->op->..m_start/m_stop/m_next may do special actions
- * or optimisations based on the file->f_version, so we want to
- * pass the file->f_version to those methods.
- *
- * seq_file->version is just copy of f_version, and seq_file
- * methods can treat it simply as file version.
- * It is copied in first and copied out after all operations.
- * It is convenient to have it as part of structure to avoid the
- * need of passing another argument to all the seq_file methods.
- */
- m->version = file->f_version;
-
- /*
* if request is to read from zero offset, reset iterator to first
* record as it might have been already advanced by previous requests
*/
if (*ppos == 0) {
m->index = 0;
- m->version = 0;
m->count = 0;
}
@@ -190,7 +168,6 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
if (err) {
/* With prejudice... */
m->read_pos = 0;
- m->version = 0;
m->index = 0;
m->count = 0;
goto Done;
@@ -243,7 +220,6 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
m->buf = seq_buf_alloc(m->size <<= 1);
if (!m->buf)
goto Enomem;
- m->version = 0;
p = m->op->start(m, &m->index);
}
m->op->stop(m, p);
@@ -256,9 +232,12 @@ Fill:
loff_t pos = m->index;
p = m->op->next(m, p, &m->index);
- if (pos == m->index)
- /* Buggy ->next function */
+ if (pos == m->index) {
+ pr_info_ratelimited("buggy seq_file .next function %ps "
+ "did not updated position index\n",
+ m->op->next);
m->index++;
+ }
if (!p || IS_ERR(p)) {
err = PTR_ERR(p);
break;
@@ -287,7 +266,6 @@ Done:
*ppos += copied;
m->read_pos += copied;
}
- file->f_version = m->version;
mutex_unlock(&m->lock);
return copied;
Enomem:
@@ -313,7 +291,6 @@ loff_t seq_lseek(struct file *file, loff_t offset, int whence)
loff_t retval = -EINVAL;
mutex_lock(&m->lock);
- m->version = file->f_version;
switch (whence) {
case SEEK_CUR:
offset += file->f_pos;
@@ -329,7 +306,6 @@ loff_t seq_lseek(struct file *file, loff_t offset, int whence)
/* with extreme prejudice... */
file->f_pos = 0;
m->read_pos = 0;
- m->version = 0;
m->index = 0;
m->count = 0;
} else {
@@ -340,7 +316,6 @@ loff_t seq_lseek(struct file *file, loff_t offset, int whence)
file->f_pos = offset;
}
}
- file->f_version = m->version;
mutex_unlock(&m->lock);
return retval;
}
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 8ceb51478800..7e4bfaf2871f 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -225,7 +225,7 @@ int ubifs_is_mapped(const struct ubifs_info *c, int lnum)
int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
int offs, int quiet, int must_chk_crc)
{
- int err = -EINVAL, type, node_len;
+ int err = -EINVAL, type, node_len, dump_node = 1;
uint32_t crc, node_crc, magic;
const struct ubifs_ch *ch = buf;
@@ -278,10 +278,22 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
out_len:
if (!quiet)
ubifs_err(c, "bad node length %d", node_len);
+ if (type == UBIFS_DATA_NODE && node_len > UBIFS_DATA_NODE_SZ)
+ dump_node = 0;
out:
if (!quiet) {
ubifs_err(c, "bad node at LEB %d:%d", lnum, offs);
- ubifs_dump_node(c, buf);
+ if (dump_node) {
+ ubifs_dump_node(c, buf);
+ } else {
+ int safe_len = min3(node_len, c->leb_size - offs,
+ (int)UBIFS_MAX_DATA_NODE_SZ);
+ pr_err("\tprevent out-of-bounds memory access\n");
+ pr_err("\ttruncated data node length %d\n", safe_len);
+ pr_err("\tcorrupted data node:\n");
+ print_hex_dump(KERN_ERR, "\t", DUMP_PREFIX_OFFSET, 32, 1,
+ buf, safe_len, 0);
+ }
dump_stack();
}
return err;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 3bf8b1fda9d7..e5ec1afe1c66 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -905,6 +905,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode)
ubifs_err(c, "dead directory entry '%s', error %d",
xent->name, err);
ubifs_ro_mode(c, err);
+ kfree(xent);
goto out_release;
}
ubifs_assert(c, ubifs_inode(xino)->xattr);
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index edf43ddd7dce..283f9eb48410 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -157,7 +157,7 @@ int ubifs_add_orphan(struct ubifs_info *c, ino_t inum)
int err = 0;
ino_t xattr_inum;
union ubifs_key key;
- struct ubifs_dent_node *xent;
+ struct ubifs_dent_node *xent, *pxent = NULL;
struct fscrypt_name nm = {0};
struct ubifs_orphan *xattr_orphan;
struct ubifs_orphan *orphan;
@@ -181,11 +181,16 @@ int ubifs_add_orphan(struct ubifs_info *c, ino_t inum)
xattr_inum = le64_to_cpu(xent->inum);
xattr_orphan = orphan_add(c, xattr_inum, orphan);
- if (IS_ERR(xattr_orphan))
+ if (IS_ERR(xattr_orphan)) {
+ kfree(xent);
return PTR_ERR(xattr_orphan);
+ }
+ kfree(pxent);
+ pxent = xent;
key_read(c, &xent->key, &key);
}
+ kfree(pxent);
return 0;
}
@@ -688,14 +693,14 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
ino_key_init(c, &key1, inum);
err = ubifs_tnc_lookup(c, &key1, ino);
- if (err)
+ if (err && err != -ENOENT)
goto out_free;
/*
* Check whether an inode can really get deleted.
* linkat() with O_TMPFILE allows rebirth of an inode.
*/
- if (ino->nlink == 0) {
+ if (err == 0 && ino->nlink == 0) {
dbg_rcvry("deleting orphaned inode %lu",
(unsigned long)inum);
diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h
index 3fd85464abd5..736ebc5dc441 100644
--- a/fs/udf/ecma_167.h
+++ b/fs/udf/ecma_167.h
@@ -5,7 +5,7 @@
* http://www.ecma.ch
*
* Copyright (c) 2001-2002 Ben Fennema
- * Copyright (c) 2017-2019 Pali Rohár <pali.rohar@gmail.com>
+ * Copyright (c) 2017-2019 Pali Rohár <pali@kernel.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
diff --git a/fs/udf/osta_udf.h b/fs/udf/osta_udf.h
index 35e61b2cacfe..d5fbfab3ddb6 100644
--- a/fs/udf/osta_udf.h
+++ b/fs/udf/osta_udf.h
@@ -5,7 +5,7 @@
* http://www.osta.org
*
* Copyright (c) 2001-2004 Ben Fennema
- * Copyright (c) 2017-2019 Pali Rohár <pali.rohar@gmail.com>
+ * Copyright (c) 2017-2019 Pali Rohár <pali@kernel.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 703c1c3faa6e..e39fdec8a0b0 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -314,8 +314,11 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
if (!pmd_present(_pmd))
goto out;
- if (pmd_trans_huge(_pmd))
+ if (pmd_trans_huge(_pmd)) {
+ if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
+ ret = true;
goto out;
+ }
/*
* the pmd is stable (as in !pmd_trans_unstable) so we can re-read it
@@ -328,6 +331,8 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
*/
if (pte_none(*pte))
ret = true;
+ if (!pte_write(*pte) && (reason & VM_UFFD_WP))
+ ret = true;
pte_unmap(pte);
out:
@@ -1287,10 +1292,13 @@ static __always_inline int validate_range(struct mm_struct *mm,
return 0;
}
-static inline bool vma_can_userfault(struct vm_area_struct *vma)
+static inline bool vma_can_userfault(struct vm_area_struct *vma,
+ unsigned long vm_flags)
{
- return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
- vma_is_shmem(vma);
+ /* FIXME: add WP support to hugetlbfs and shmem */
+ return vma_is_anonymous(vma) ||
+ ((is_vm_hugetlb_page(vma) || vma_is_shmem(vma)) &&
+ !(vm_flags & VM_UFFD_WP));
}
static int userfaultfd_register(struct userfaultfd_ctx *ctx,
@@ -1322,15 +1330,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
vm_flags = 0;
if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
vm_flags |= VM_UFFD_MISSING;
- if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
+ if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)
vm_flags |= VM_UFFD_WP;
- /*
- * FIXME: remove the below error constraint by
- * implementing the wprotect tracking mode.
- */
- ret = -EINVAL;
- goto out;
- }
ret = validate_range(mm, &uffdio_register.range.start,
uffdio_register.range.len);
@@ -1380,7 +1381,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
/* check not compatible vmas */
ret = -EINVAL;
- if (!vma_can_userfault(cur))
+ if (!vma_can_userfault(cur, vm_flags))
goto out_unlock;
/*
@@ -1408,6 +1409,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
if (end & (vma_hpagesize - 1))
goto out_unlock;
}
+ if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE))
+ goto out_unlock;
/*
* Check that this vma isn't already owned by a
@@ -1437,7 +1440,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
do {
cond_resched();
- BUG_ON(!vma_can_userfault(vma));
+ BUG_ON(!vma_can_userfault(vma, vm_flags));
BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
vma->vm_userfaultfd_ctx.ctx != ctx);
WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
@@ -1492,14 +1495,24 @@ out_unlock:
up_write(&mm->mmap_sem);
mmput(mm);
if (!ret) {
+ __u64 ioctls_out;
+
+ ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
+ UFFD_API_RANGE_IOCTLS;
+
+ /*
+ * Declare the WP ioctl only if the WP mode is
+ * specified and all checks passed with the range
+ */
+ if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
+ ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);
+
/*
* Now that we scanned all vmas we can already tell
* userland which ioctls methods are guaranteed to
* succeed on this range.
*/
- if (put_user(basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
- UFFD_API_RANGE_IOCTLS,
- &user_uffdio_register->ioctls))
+ if (put_user(ioctls_out, &user_uffdio_register->ioctls))
ret = -EFAULT;
}
out:
@@ -1575,7 +1588,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
* provides for more strict behavior to notice
* unregistration errors.
*/
- if (!vma_can_userfault(cur))
+ if (!vma_can_userfault(cur, cur->vm_flags))
goto out_unlock;
found = true;
@@ -1589,7 +1602,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
do {
cond_resched();
- BUG_ON(!vma_can_userfault(vma));
+ BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
/*
* Nothing to do: this vma is already registered into this
@@ -1724,11 +1737,12 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
ret = -EINVAL;
if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
goto out;
- if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE)
+ if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
goto out;
if (mmget_not_zero(ctx->mm)) {
ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
- uffdio_copy.len, &ctx->mmap_changing);
+ uffdio_copy.len, &ctx->mmap_changing,
+ uffdio_copy.mode);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1801,6 +1815,53 @@ out:
return ret;
}
+static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ int ret;
+ struct uffdio_writeprotect uffdio_wp;
+ struct uffdio_writeprotect __user *user_uffdio_wp;
+ struct userfaultfd_wake_range range;
+ bool mode_wp, mode_dontwake;
+
+ if (READ_ONCE(ctx->mmap_changing))
+ return -EAGAIN;
+
+ user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;
+
+ if (copy_from_user(&uffdio_wp, user_uffdio_wp,
+ sizeof(struct uffdio_writeprotect)))
+ return -EFAULT;
+
+ ret = validate_range(ctx->mm, &uffdio_wp.range.start,
+ uffdio_wp.range.len);
+ if (ret)
+ return ret;
+
+ if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
+ UFFDIO_WRITEPROTECT_MODE_WP))
+ return -EINVAL;
+
+ mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
+ mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
+
+ if (mode_wp && mode_dontwake)
+ return -EINVAL;
+
+ ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
+ uffdio_wp.range.len, mode_wp,
+ &ctx->mmap_changing);
+ if (ret)
+ return ret;
+
+ if (!mode_wp && !mode_dontwake) {
+ range.start = uffdio_wp.range.start;
+ range.len = uffdio_wp.range.len;
+ wake_userfault(ctx, &range);
+ }
+ return ret;
+}
+
static inline unsigned int uffd_ctx_features(__u64 user_features)
{
/*
@@ -1882,6 +1943,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
case UFFDIO_ZEROPAGE:
ret = userfaultfd_zeropage(ctx, arg);
break;
+ case UFFDIO_WRITEPROTECT:
+ ret = userfaultfd_writeprotect(ctx, arg);
+ break;
}
return ret;
}
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 00266de58954..c526c5e5ab76 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -328,6 +328,38 @@ xfs_validate_sb_common(
return -EFSCORRUPTED;
}
+ /* Validate the realtime geometry; stolen from xfs_repair */
+ if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE ||
+ sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) {
+ xfs_notice(mp,
+ "realtime extent sanity check failed");
+ return -EFSCORRUPTED;
+ }
+
+ if (sbp->sb_rblocks == 0) {
+ if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 ||
+ sbp->sb_rextslog != 0 || sbp->sb_frextents != 0) {
+ xfs_notice(mp,
+ "realtime zeroed geometry check failed");
+ return -EFSCORRUPTED;
+ }
+ } else {
+ uint64_t rexts;
+ uint64_t rbmblocks;
+
+ rexts = div_u64(sbp->sb_rblocks, sbp->sb_rextsize);
+ rbmblocks = howmany_64(sbp->sb_rextents,
+ NBBY * sbp->sb_blocksize);
+
+ if (sbp->sb_rextents != rexts ||
+ sbp->sb_rextslog != xfs_highbit32(sbp->sb_rextents) ||
+ sbp->sb_rbmblocks != rbmblocks) {
+ xfs_notice(mp,
+ "realtime geometry sanity check failed");
+ return -EFSCORRUPTED;
+ }
+ }
+
if (sbp->sb_unit) {
if (!xfs_sb_version_hasdalign(sbp) ||
sbp->sb_unit > sbp->sb_width ||
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index f880141a2268..9ec3eaf1c618 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -327,6 +327,9 @@ xfs_buf_free(
__free_page(page);
}
+ if (current->reclaim_state)
+ current->reclaim_state->reclaimed_slab +=
+ bp->b_page_count;
} else if (bp->b_flags & _XBF_KMEM)
kmem_free(bp->b_addr);
_xfs_buf_free_pages(bp);
@@ -2114,9 +2117,11 @@ xfs_buf_delwri_pushbuf(
int __init
xfs_buf_init(void)
{
- xfs_buf_zone = kmem_cache_create("xfs_buf",
- sizeof(struct xfs_buf), 0,
- SLAB_HWCACHE_ALIGN, NULL);
+ xfs_buf_zone = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0,
+ SLAB_HWCACHE_ALIGN |
+ SLAB_RECLAIM_ACCOUNT |
+ SLAB_MEM_SPREAD,
+ NULL);
if (!xfs_buf_zone)
goto out;
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 711376ca269f..af2c8e5ceea0 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -1105,8 +1105,8 @@ xfs_qm_dqflush(
* Get the buffer containing the on-disk dquot
*/
error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
- mp->m_quotainfo->qi_dqchunklen, 0, &bp,
- &xfs_dquot_buf_ops);
+ mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK,
+ &bp, &xfs_dquot_buf_ops);
if (error)
goto out_unlock;
@@ -1177,7 +1177,7 @@ xfs_qm_dqflush(
out_unlock:
xfs_dqfunlock(dqp);
- return -EIO;
+ return error;
}
/*
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index cf65e2e43c6e..baad1748d0d1 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -189,7 +189,8 @@ xfs_qm_dquot_logitem_push(
if (!xfs_buf_delwri_queue(bp, buffer_list))
rval = XFS_ITEM_FLUSHING;
xfs_buf_relse(bp);
- }
+ } else if (error == -EAGAIN)
+ rval = XFS_ITEM_LOCKED;
spin_lock(&lip->li_ailp->ail_lock);
out_unlock:
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index f1372f9046e3..5a4b0119143a 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -15,7 +15,6 @@
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_icache.h"
-#include "xfs_log.h"
#include "xfs_pnfs.h"
/*
@@ -221,18 +220,7 @@ STATIC int
xfs_fs_nfs_commit_metadata(
struct inode *inode)
{
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- xfs_lsn_t lsn = 0;
-
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (xfs_ipincount(ip))
- lsn = ip->i_itemp->ili_last_lsn;
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
- if (!lsn)
- return 0;
- return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+ return xfs_log_force_inode(XFS_I(inode));
}
const struct export_operations xfs_export_operations = {
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index b8a4a3f29b36..4b8bdecc3863 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -80,19 +80,9 @@ xfs_dir_fsync(
int datasync)
{
struct xfs_inode *ip = XFS_I(file->f_mapping->host);
- struct xfs_mount *mp = ip->i_mount;
- xfs_lsn_t lsn = 0;
trace_xfs_dir_fsync(ip);
-
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (xfs_ipincount(ip))
- lsn = ip->i_itemp->ili_last_lsn;
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
- if (!lsn)
- return 0;
- return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+ return xfs_log_force_inode(ip);
}
STATIC int
@@ -1069,7 +1059,11 @@ xfs_file_remap_range(
ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
remap_flags);
+ if (ret)
+ goto out_unlock;
+ if (mp->m_flags & XFS_MOUNT_WSYNC)
+ xfs_log_force_inode(dest);
out_unlock:
xfs_reflink_remap_unlock(file_in, file_out);
if (ret)
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 14b922f2a6db..d1772786af29 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1200,8 +1200,7 @@ xfs_create(
unlock_dp_on_error = false;
error = xfs_dir_createname(tp, dp, name, ip->i_ino,
- resblks ?
- resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
+ resblks - XFS_IALLOC_SPACE_RES(mp));
if (error) {
ASSERT(error != -ENOSPC);
goto out_trans_cancel;
@@ -2504,6 +2503,88 @@ out:
}
/*
+ * Look up the inode number specified and mark it stale if it is found. If it is
+ * dirty, return the inode so it can be attached to the cluster buffer so it can
+ * be processed appropriately when the cluster free transaction completes.
+ */
+static struct xfs_inode *
+xfs_ifree_get_one_inode(
+ struct xfs_perag *pag,
+ struct xfs_inode *free_ip,
+ xfs_ino_t inum)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_inode *ip;
+
+retry:
+ rcu_read_lock();
+ ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum));
+
+ /* Inode not in memory, nothing to do */
+ if (!ip)
+ goto out_rcu_unlock;
+
+ /*
+ * because this is an RCU protected lookup, we could find a recently
+ * freed or even reallocated inode during the lookup. We need to check
+ * under the i_flags_lock for a valid inode here. Skip it if it is not
+ * valid, the wrong inode or stale.
+ */
+ spin_lock(&ip->i_flags_lock);
+ if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) {
+ spin_unlock(&ip->i_flags_lock);
+ goto out_rcu_unlock;
+ }
+ spin_unlock(&ip->i_flags_lock);
+
+ /*
+ * Don't try to lock/unlock the current inode, but we _cannot_ skip the
+ * other inodes that we did not find in the list attached to the buffer
+ * and are not already marked stale. If we can't lock it, back off and
+ * retry.
+ */
+ if (ip != free_ip) {
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+ rcu_read_unlock();
+ delay(1);
+ goto retry;
+ }
+
+ /*
+ * Check the inode number again in case we're racing with
+ * freeing in xfs_reclaim_inode(). See the comments in that
+ * function for more information as to why the initial check is
+ * not sufficient.
+ */
+ if (ip->i_ino != inum) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ goto out_rcu_unlock;
+ }
+ }
+ rcu_read_unlock();
+
+ xfs_iflock(ip);
+ xfs_iflags_set(ip, XFS_ISTALE);
+
+ /*
+ * We don't need to attach clean inodes or those only with unlogged
+ * changes (which we throw away, anyway).
+ */
+ if (!ip->i_itemp || xfs_inode_clean(ip)) {
+ ASSERT(ip != free_ip);
+ xfs_ifunlock(ip);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ goto out_no_inode;
+ }
+ return ip;
+
+out_rcu_unlock:
+ rcu_read_unlock();
+out_no_inode:
+ return NULL;
+}
+
+/*
* A big issue when freeing the inode cluster is that we _cannot_ skip any
* inodes that are in memory - they all must be marked stale and attached to
* the cluster buffer.
@@ -2603,77 +2684,11 @@ xfs_ifree_cluster(
* even trying to lock them.
*/
for (i = 0; i < igeo->inodes_per_cluster; i++) {
-retry:
- rcu_read_lock();
- ip = radix_tree_lookup(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(mp, (inum + i)));
-
- /* Inode not in memory, nothing to do */
- if (!ip) {
- rcu_read_unlock();
+ ip = xfs_ifree_get_one_inode(pag, free_ip, inum + i);
+ if (!ip)
continue;
- }
-
- /*
- * because this is an RCU protected lookup, we could
- * find a recently freed or even reallocated inode
- * during the lookup. We need to check under the
- * i_flags_lock for a valid inode here. Skip it if it
- * is not valid, the wrong inode or stale.
- */
- spin_lock(&ip->i_flags_lock);
- if (ip->i_ino != inum + i ||
- __xfs_iflags_test(ip, XFS_ISTALE)) {
- spin_unlock(&ip->i_flags_lock);
- rcu_read_unlock();
- continue;
- }
- spin_unlock(&ip->i_flags_lock);
-
- /*
- * Don't try to lock/unlock the current inode, but we
- * _cannot_ skip the other inodes that we did not find
- * in the list attached to the buffer and are not
- * already marked stale. If we can't lock it, back off
- * and retry.
- */
- if (ip != free_ip) {
- if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
- rcu_read_unlock();
- delay(1);
- goto retry;
- }
-
- /*
- * Check the inode number again in case we're
- * racing with freeing in xfs_reclaim_inode().
- * See the comments in that function for more
- * information as to why the initial check is
- * not sufficient.
- */
- if (ip->i_ino != inum + i) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- rcu_read_unlock();
- continue;
- }
- }
- rcu_read_unlock();
-
- xfs_iflock(ip);
- xfs_iflags_set(ip, XFS_ISTALE);
- /*
- * we don't need to attach clean inodes or those only
- * with unlogged changes (which we throw away, anyway).
- */
iip = ip->i_itemp;
- if (!iip || xfs_inode_clean(ip)) {
- ASSERT(ip != free_ip);
- xfs_ifunlock(ip);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- continue;
- }
-
iip->ili_last_fields = iip->ili_fields;
iip->ili_fields = 0;
iip->ili_fsync_fields = 0;
@@ -3930,3 +3945,22 @@ xfs_irele(
trace_xfs_irele(ip, _RET_IP_);
iput(VFS_I(ip));
}
+
+/*
+ * Ensure all commited transactions touching the inode are written to the log.
+ */
+int
+xfs_log_force_inode(
+ struct xfs_inode *ip)
+{
+ xfs_lsn_t lsn = 0;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ if (xfs_ipincount(ip))
+ lsn = ip->i_itemp->ili_last_lsn;
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ if (!lsn)
+ return 0;
+ return xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL);
+}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 492e53992fa9..c6a63f6764a6 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -426,6 +426,7 @@ int xfs_itruncate_extents_flags(struct xfs_trans **,
struct xfs_inode *, int, xfs_fsize_t, int);
void xfs_iext_realloc(xfs_inode_t *, int, int);
+int xfs_log_force_inode(struct xfs_inode *ip);
void xfs_iunpin_wait(xfs_inode_t *);
#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 4a3d13d4a022..f779cca2346f 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -552,7 +552,8 @@ xfs_inode_item_push(
if (!xfs_buf_delwri_queue(bp, buffer_list))
rval = XFS_ITEM_FLUSHING;
xfs_buf_relse(bp);
- }
+ } else if (error == -EAGAIN)
+ rval = XFS_ITEM_LOCKED;
spin_lock(&lip->li_ailp->ail_lock);
out_unlock:
@@ -730,29 +731,27 @@ xfs_iflush_done(
* holding the lock before removing the inode from the AIL.
*/
if (need_ail) {
- bool mlip_changed = false;
+ xfs_lsn_t tail_lsn = 0;
/* this is an opencoded batch version of xfs_trans_ail_delete */
spin_lock(&ailp->ail_lock);
list_for_each_entry(blip, &tmp, li_bio_list) {
if (INODE_ITEM(blip)->ili_logged &&
- blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn)
- mlip_changed |= xfs_ail_delete_one(ailp, blip);
- else {
+ blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) {
+ /*
+ * xfs_ail_update_finish() only cares about the
+ * lsn of the first tail item removed, any
+ * others will be at the same or higher lsn so
+ * we just ignore them.
+ */
+ xfs_lsn_t lsn = xfs_ail_delete_one(ailp, blip);
+ if (!tail_lsn && lsn)
+ tail_lsn = lsn;
+ } else {
xfs_clear_li_failed(blip);
}
}
-
- if (mlip_changed) {
- if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount))
- xlog_assign_tail_lsn_locked(ailp->ail_mount);
- if (list_empty(&ailp->ail_head))
- wake_up_all(&ailp->ail_empty);
- }
- spin_unlock(&ailp->ail_lock);
-
- if (mlip_changed)
- xfs_log_space_wake(ailp->ail_mount);
+ xfs_ail_update_finish(ailp, tail_lsn);
}
/*
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 4a53768c5397..00fda2e8e738 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -24,13 +24,6 @@
kmem_zone_t *xfs_log_ticket_zone;
/* Local miscellaneous function prototypes */
-STATIC int
-xlog_commit_record(
- struct xlog *log,
- struct xlog_ticket *ticket,
- struct xlog_in_core **iclog,
- xfs_lsn_t *commitlsnp);
-
STATIC struct xlog *
xlog_alloc_log(
struct xfs_mount *mp,
@@ -66,14 +59,6 @@ xlog_grant_push_ail(
struct xlog *log,
int need_bytes);
STATIC void
-xlog_regrant_reserve_log_space(
- struct xlog *log,
- struct xlog_ticket *ticket);
-STATIC void
-xlog_ungrant_log_space(
- struct xlog *log,
- struct xlog_ticket *ticket);
-STATIC void
xlog_sync(
struct xlog *log,
struct xlog_in_core *iclog);
@@ -478,73 +463,6 @@ out_error:
return error;
}
-
-/*
- * NOTES:
- *
- * 1. currblock field gets updated at startup and after in-core logs
- * marked as with WANT_SYNC.
- */
-
-/*
- * This routine is called when a user of a log manager ticket is done with
- * the reservation. If the ticket was ever used, then a commit record for
- * the associated transaction is written out as a log operation header with
- * no data. The flag XLOG_TIC_INITED is set when the first write occurs with
- * a given ticket. If the ticket was one with a permanent reservation, then
- * a few operations are done differently. Permanent reservation tickets by
- * default don't release the reservation. They just commit the current
- * transaction with the belief that the reservation is still needed. A flag
- * must be passed in before permanent reservations are actually released.
- * When these type of tickets are not released, they need to be set into
- * the inited state again. By doing this, a start record will be written
- * out when the next write occurs.
- */
-xfs_lsn_t
-xfs_log_done(
- struct xfs_mount *mp,
- struct xlog_ticket *ticket,
- struct xlog_in_core **iclog,
- bool regrant)
-{
- struct xlog *log = mp->m_log;
- xfs_lsn_t lsn = 0;
-
- if (XLOG_FORCED_SHUTDOWN(log) ||
- /*
- * If nothing was ever written, don't write out commit record.
- * If we get an error, just continue and give back the log ticket.
- */
- (((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
- (xlog_commit_record(log, ticket, iclog, &lsn)))) {
- lsn = (xfs_lsn_t) -1;
- regrant = false;
- }
-
-
- if (!regrant) {
- trace_xfs_log_done_nonperm(log, ticket);
-
- /*
- * Release ticket if not permanent reservation or a specific
- * request has been made to release a permanent reservation.
- */
- xlog_ungrant_log_space(log, ticket);
- } else {
- trace_xfs_log_done_perm(log, ticket);
-
- xlog_regrant_reserve_log_space(log, ticket);
- /* If this ticket was a permanent reservation and we aren't
- * trying to release it, reset the inited flags; so next time
- * we write, a start record will be written out.
- */
- ticket->t_flags |= XLOG_TIC_INITED;
- }
-
- xfs_log_ticket_put(ticket);
- return lsn;
-}
-
static bool
__xlog_state_release_iclog(
struct xlog *log,
@@ -869,32 +787,44 @@ xlog_wait_on_iclog(
}
/*
- * Final log writes as part of unmount.
- *
- * Mark the filesystem clean as unmount happens. Note that during relocation
- * this routine needs to be executed as part of source-bag while the
- * deallocation must not be done until source-end.
+ * Write out an unmount record using the ticket provided. We have to account for
+ * the data space used in the unmount ticket as this write is not done from a
+ * transaction context that has already done the accounting for us.
*/
-
-/* Actually write the unmount record to disk. */
-static void
-xfs_log_write_unmount_record(
- struct xfs_mount *mp)
+static int
+xlog_write_unmount_record(
+ struct xlog *log,
+ struct xlog_ticket *ticket,
+ xfs_lsn_t *lsn,
+ uint flags)
{
- /* the data section must be 32 bit size aligned */
- struct xfs_unmount_log_format magic = {
+ struct xfs_unmount_log_format ulf = {
.magic = XLOG_UNMOUNT_TYPE,
};
struct xfs_log_iovec reg = {
- .i_addr = &magic,
- .i_len = sizeof(magic),
+ .i_addr = &ulf,
+ .i_len = sizeof(ulf),
.i_type = XLOG_REG_TYPE_UNMOUNT,
};
struct xfs_log_vec vec = {
.lv_niovecs = 1,
.lv_iovecp = &reg,
};
- struct xlog *log = mp->m_log;
+
+ /* account for space used by record data */
+ ticket->t_curr_res -= sizeof(ulf);
+ return xlog_write(log, &vec, ticket, lsn, NULL, flags, false);
+}
+
+/*
+ * Mark the filesystem clean by writing an unmount record to the head of the
+ * log.
+ */
+static void
+xlog_unmount_write(
+ struct xlog *log)
+{
+ struct xfs_mount *mp = log->l_mp;
struct xlog_in_core *iclog;
struct xlog_ticket *tic = NULL;
xfs_lsn_t lsn;
@@ -905,23 +835,7 @@ xfs_log_write_unmount_record(
if (error)
goto out_err;
- /*
- * If we think the summary counters are bad, clear the unmount header
- * flag in the unmount record so that the summary counters will be
- * recalculated during log recovery at next mount. Refer to
- * xlog_check_unmount_rec for more details.
- */
- if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp,
- XFS_ERRTAG_FORCE_SUMMARY_RECALC)) {
- xfs_alert(mp, "%s: will fix summary counters at next mount",
- __func__);
- flags &= ~XLOG_UNMOUNT_TRANS;
- }
-
- /* remove inited flag, and account for space used */
- tic->t_flags = 0;
- tic->t_curr_res -= sizeof(magic);
- error = xlog_write(log, &vec, tic, &lsn, NULL, flags);
+ error = xlog_write_unmount_record(log, tic, &lsn, flags);
/*
* At this point, we're umounting anyway, so there's no point in
* transitioning log state to IOERROR. Just continue...
@@ -943,8 +857,7 @@ out_err:
if (tic) {
trace_xfs_log_umount_write(log, tic);
- xlog_ungrant_log_space(log, tic);
- xfs_log_ticket_put(tic);
+ xfs_log_ticket_ungrant(log, tic);
}
}
@@ -987,8 +900,22 @@ xfs_log_unmount_write(
if (XLOG_FORCED_SHUTDOWN(log))
return;
+
+ /*
+ * If we think the summary counters are bad, avoid writing the unmount
+ * record to force log recovery at next mount, after which the summary
+ * counters will be recalculated. Refer to xlog_check_unmount_rec for
+ * more details.
+ */
+ if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp,
+ XFS_ERRTAG_FORCE_SUMMARY_RECALC)) {
+ xfs_alert(mp, "%s: will fix summary counters at next mount",
+ __func__);
+ return;
+ }
+
xfs_log_unmount_verify_iclog(log);
- xfs_log_write_unmount_record(mp);
+ xlog_unmount_write(log);
}
/*
@@ -1515,20 +1442,17 @@ out:
return ERR_PTR(error);
} /* xlog_alloc_log */
-
/*
* Write out the commit record of a transaction associated with the given
- * ticket. Return the lsn of the commit record.
+ * ticket to close off a running log write. Return the lsn of the commit record.
*/
-STATIC int
+int
xlog_commit_record(
struct xlog *log,
struct xlog_ticket *ticket,
struct xlog_in_core **iclog,
- xfs_lsn_t *commitlsnp)
+ xfs_lsn_t *lsn)
{
- struct xfs_mount *mp = log->l_mp;
- int error;
struct xfs_log_iovec reg = {
.i_addr = NULL,
.i_len = 0,
@@ -1538,12 +1462,15 @@ xlog_commit_record(
.lv_niovecs = 1,
.lv_iovecp = &reg,
};
+ int error;
+
+ if (XLOG_FORCED_SHUTDOWN(log))
+ return -EIO;
- ASSERT_ALWAYS(iclog);
- error = xlog_write(log, &vec, ticket, commitlsnp, iclog,
- XLOG_COMMIT_TRANS);
+ error = xlog_write(log, &vec, ticket, lsn, iclog, XLOG_COMMIT_TRANS,
+ false);
if (error)
- xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
+ xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
return error;
}
@@ -1761,7 +1688,15 @@ xlog_write_iclog(
iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno;
iclog->ic_bio.bi_end_io = xlog_bio_end_io;
iclog->ic_bio.bi_private = iclog;
- iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_FUA;
+
+ /*
+ * We use REQ_SYNC | REQ_IDLE here to tell the block layer the are more
+ * IOs coming immediately after this one. This prevents the block layer
+ * writeback throttle from throttling log writes behind background
+ * metadata writeback and causing priority inversions.
+ */
+ iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC |
+ REQ_IDLE | REQ_FUA;
if (need_flush)
iclog->ic_bio.bi_opf |= REQ_PREFLUSH;
@@ -1981,7 +1916,7 @@ xlog_dealloc_log(
log->l_mp->m_log = NULL;
destroy_workqueue(log->l_ioend_workqueue);
kmem_free(log);
-} /* xlog_dealloc_log */
+}
/*
* Update counters atomically now that memcpy is done.
@@ -2118,23 +2053,21 @@ xlog_print_trans(
}
/*
- * Calculate the potential space needed by the log vector. Each region gets
- * its own xlog_op_header_t and may need to be double word aligned.
+ * Calculate the potential space needed by the log vector. We may need a start
+ * record, and each region gets its own struct xlog_op_header and may need to be
+ * double word aligned.
*/
static int
xlog_write_calc_vec_length(
struct xlog_ticket *ticket,
- struct xfs_log_vec *log_vector)
+ struct xfs_log_vec *log_vector,
+ bool need_start_rec)
{
struct xfs_log_vec *lv;
- int headers = 0;
+ int headers = need_start_rec ? 1 : 0;
int len = 0;
int i;
- /* acct for start rec of xact */
- if (ticket->t_flags & XLOG_TIC_INITED)
- headers++;
-
for (lv = log_vector; lv; lv = lv->lv_next) {
/* we don't write ordered log vectors */
if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED)
@@ -2156,27 +2089,16 @@ xlog_write_calc_vec_length(
return len;
}
-/*
- * If first write for transaction, insert start record We can't be trying to
- * commit if we are inited. We can't have any "partial_copy" if we are inited.
- */
-static int
+static void
xlog_write_start_rec(
struct xlog_op_header *ophdr,
struct xlog_ticket *ticket)
{
- if (!(ticket->t_flags & XLOG_TIC_INITED))
- return 0;
-
ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
ophdr->oh_clientid = ticket->t_clientid;
ophdr->oh_len = 0;
ophdr->oh_flags = XLOG_START_TRANS;
ophdr->oh_res2 = 0;
-
- ticket->t_flags &= ~XLOG_TIC_INITED;
-
- return sizeof(struct xlog_op_header);
}
static xlog_op_header_t *
@@ -2365,13 +2287,14 @@ xlog_write(
struct xlog_ticket *ticket,
xfs_lsn_t *start_lsn,
struct xlog_in_core **commit_iclog,
- uint flags)
+ uint flags,
+ bool need_start_rec)
{
struct xlog_in_core *iclog = NULL;
- struct xfs_log_iovec *vecp;
- struct xfs_log_vec *lv;
+ struct xfs_log_vec *lv = log_vector;
+ struct xfs_log_iovec *vecp = lv->lv_iovecp;
+ int index = 0;
int len;
- int index;
int partial_copy = 0;
int partial_copy_len = 0;
int contwr = 0;
@@ -2379,25 +2302,13 @@ xlog_write(
int data_cnt = 0;
int error = 0;
- *start_lsn = 0;
-
- len = xlog_write_calc_vec_length(ticket, log_vector);
-
/*
- * Region headers and bytes are already accounted for.
- * We only need to take into account start records and
- * split regions in this function.
+ * If this is a commit or unmount transaction, we don't need a start
+ * record to be written. We do, however, have to account for the
+ * commit or unmount header that gets written. Hence we always have
+ * to account for an extra xlog_op_header here.
*/
- if (ticket->t_flags & XLOG_TIC_INITED)
- ticket->t_curr_res -= sizeof(xlog_op_header_t);
-
- /*
- * Commit record headers need to be accounted for. These
- * come in as separate writes so are easy to detect.
- */
- if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS))
- ticket->t_curr_res -= sizeof(xlog_op_header_t);
-
+ ticket->t_curr_res -= sizeof(struct xlog_op_header);
if (ticket->t_curr_res < 0) {
xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
"ctx ticket reservation ran out. Need to up reservation");
@@ -2405,9 +2316,8 @@ xlog_write(
xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
}
- index = 0;
- lv = log_vector;
- vecp = lv->lv_iovecp;
+ len = xlog_write_calc_vec_length(ticket, log_vector, need_start_rec);
+ *start_lsn = 0;
while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
void *ptr;
int log_offset;
@@ -2431,7 +2341,6 @@ xlog_write(
while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
struct xfs_log_iovec *reg;
struct xlog_op_header *ophdr;
- int start_rec_copy;
int copy_len;
int copy_off;
bool ordered = false;
@@ -2447,11 +2356,15 @@ xlog_write(
ASSERT(reg->i_len % sizeof(int32_t) == 0);
ASSERT((unsigned long)ptr % sizeof(int32_t) == 0);
- start_rec_copy = xlog_write_start_rec(ptr, ticket);
- if (start_rec_copy) {
- record_cnt++;
+ /*
+ * Before we start formatting log vectors, we need to
+ * write a start record. Only do this for the first
+ * iclog we write to.
+ */
+ if (need_start_rec) {
+ xlog_write_start_rec(ptr, ticket);
xlog_write_adv_cnt(&ptr, &len, &log_offset,
- start_rec_copy);
+ sizeof(struct xlog_op_header));
}
ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags);
@@ -2483,8 +2396,13 @@ xlog_write(
xlog_write_adv_cnt(&ptr, &len, &log_offset,
copy_len);
}
- copy_len += start_rec_copy + sizeof(xlog_op_header_t);
+ copy_len += sizeof(struct xlog_op_header);
record_cnt++;
+ if (need_start_rec) {
+ copy_len += sizeof(struct xlog_op_header);
+ record_cnt++;
+ need_start_rec = false;
+ }
data_cnt += contwr ? copy_len : 0;
error = xlog_write_copy_finish(log, iclog, flags,
@@ -2541,14 +2459,6 @@ next_lv:
return error;
}
-
-/*****************************************************************************
- *
- * State Machine functions
- *
- *****************************************************************************
- */
-
static void
xlog_state_activate_iclog(
struct xlog_in_core *iclog,
@@ -2909,7 +2819,7 @@ xlog_state_done_syncing(
*/
wake_up_all(&iclog->ic_write_wait);
spin_unlock(&log->l_icloglock);
- xlog_state_do_callback(log); /* also cleans log */
+ xlog_state_do_callback(log);
}
/*
@@ -3029,21 +2939,21 @@ restart:
*logoffsetp = log_offset;
return 0;
-} /* xlog_state_get_iclog_space */
-
-/* The first cnt-1 times through here we don't need to
- * move the grant write head because the permanent
- * reservation has reserved cnt times the unit amount.
- * Release part of current permanent unit reservation and
- * reset current reservation to be one units worth. Also
- * move grant reservation head forward.
+}
+
+/*
+ * The first cnt-1 times a ticket goes through here we don't need to move the
+ * grant write head because the permanent reservation has reserved cnt times the
+ * unit amount. Release part of current permanent unit reservation and reset
+ * current reservation to be one units worth. Also move grant reservation head
+ * forward.
*/
-STATIC void
-xlog_regrant_reserve_log_space(
+void
+xfs_log_ticket_regrant(
struct xlog *log,
struct xlog_ticket *ticket)
{
- trace_xfs_log_regrant_reserve_enter(log, ticket);
+ trace_xfs_log_ticket_regrant(log, ticket);
if (ticket->t_cnt > 0)
ticket->t_cnt--;
@@ -3055,21 +2965,20 @@ xlog_regrant_reserve_log_space(
ticket->t_curr_res = ticket->t_unit_res;
xlog_tic_reset_res(ticket);
- trace_xfs_log_regrant_reserve_sub(log, ticket);
+ trace_xfs_log_ticket_regrant_sub(log, ticket);
/* just return if we still have some of the pre-reserved space */
- if (ticket->t_cnt > 0)
- return;
+ if (!ticket->t_cnt) {
+ xlog_grant_add_space(log, &log->l_reserve_head.grant,
+ ticket->t_unit_res);
+ trace_xfs_log_ticket_regrant_exit(log, ticket);
- xlog_grant_add_space(log, &log->l_reserve_head.grant,
- ticket->t_unit_res);
-
- trace_xfs_log_regrant_reserve_exit(log, ticket);
-
- ticket->t_curr_res = ticket->t_unit_res;
- xlog_tic_reset_res(ticket);
-} /* xlog_regrant_reserve_log_space */
+ ticket->t_curr_res = ticket->t_unit_res;
+ xlog_tic_reset_res(ticket);
+ }
+ xfs_log_ticket_put(ticket);
+}
/*
* Give back the space left from a reservation.
@@ -3085,18 +2994,19 @@ xlog_regrant_reserve_log_space(
* space, the count will stay at zero and the only space remaining will be
* in the current reservation field.
*/
-STATIC void
-xlog_ungrant_log_space(
+void
+xfs_log_ticket_ungrant(
struct xlog *log,
struct xlog_ticket *ticket)
{
- int bytes;
+ int bytes;
+
+ trace_xfs_log_ticket_ungrant(log, ticket);
if (ticket->t_cnt > 0)
ticket->t_cnt--;
- trace_xfs_log_ungrant_enter(log, ticket);
- trace_xfs_log_ungrant_sub(log, ticket);
+ trace_xfs_log_ticket_ungrant_sub(log, ticket);
/*
* If this is a permanent reservation ticket, we may be able to free
@@ -3111,18 +3021,15 @@ xlog_ungrant_log_space(
xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes);
xlog_grant_sub_space(log, &log->l_write_head.grant, bytes);
- trace_xfs_log_ungrant_exit(log, ticket);
+ trace_xfs_log_ticket_ungrant_exit(log, ticket);
xfs_log_space_wake(log->l_mp);
+ xfs_log_ticket_put(ticket);
}
/*
- * Mark the current iclog in the ring as WANT_SYNC and move the current iclog
- * pointer to the next iclog in the ring.
- *
- * When called from xlog_state_get_iclog_space(), the exact size of the iclog
- * has not yet been determined, all we know is that we have run out of space in
- * the current iclog.
+ * This routine will mark the current iclog in the ring as WANT_SYNC and move
+ * the current iclog pointer to the next iclog in the ring.
*/
STATIC void
xlog_state_switch_iclogs(
@@ -3167,7 +3074,7 @@ xlog_state_switch_iclogs(
}
ASSERT(iclog == log->l_iclog);
log->l_iclog = iclog->ic_next;
-} /* xlog_state_switch_iclogs */
+}
/*
* Write out all data in the in-core log as of this exact moment in time.
@@ -3374,13 +3281,6 @@ xfs_log_force_lsn(
return ret;
}
-/*****************************************************************************
- *
- * TICKET functions
- *
- *****************************************************************************
- */
-
/*
* Free a used ticket when its refcount falls to zero.
*/
@@ -3529,7 +3429,6 @@ xlog_ticket_alloc(
tic->t_ocnt = cnt;
tic->t_tid = prandom_u32();
tic->t_clientid = client;
- tic->t_flags = XLOG_TIC_INITED;
if (permanent)
tic->t_flags |= XLOG_TIC_PERM_RESERV;
@@ -3538,13 +3437,6 @@ xlog_ticket_alloc(
return tic;
}
-
-/******************************************************************************
- *
- * Log debug routines
- *
- ******************************************************************************
- */
#if defined(DEBUG)
/*
* Make sure that the destination ptr is within the valid data region of
@@ -3630,7 +3522,7 @@ xlog_verify_tail_lsn(
if (blocks < BTOBB(iclog->ic_offset) + 1)
xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
}
-} /* xlog_verify_tail_lsn */
+}
/*
* Perform a number of checks on the iclog before writing to disk.
@@ -3733,7 +3625,7 @@ xlog_verify_iclog(
}
ptr += sizeof(xlog_op_header_t) + op_len;
}
-} /* xlog_verify_iclog */
+}
#endif
/*
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index cc77cc36560a..1412d6993f1e 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -105,10 +105,6 @@ struct xfs_log_item;
struct xfs_item_ops;
struct xfs_trans;
-xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
- struct xlog_ticket *ticket,
- struct xlog_in_core **iclog,
- bool regrant);
int xfs_log_force(struct xfs_mount *mp, uint flags);
int xfs_log_force_lsn(struct xfs_mount *mp, xfs_lsn_t lsn, uint flags,
int *log_forced);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 64cc0bf2ab3b..b43f0e8f43f2 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -669,6 +669,11 @@ xlog_cil_push_work(
ASSERT(push_seq <= ctx->sequence);
/*
+ * Wake up any background push waiters now this context is being pushed.
+ */
+ wake_up_all(&ctx->push_wait);
+
+ /*
* Check if we've anything to push. If there is nothing, then we don't
* move on to a new sequence number and so we have to be able to push
* this sequence again later.
@@ -744,6 +749,7 @@ xlog_cil_push_work(
*/
INIT_LIST_HEAD(&new_ctx->committing);
INIT_LIST_HEAD(&new_ctx->busy_extents);
+ init_waitqueue_head(&new_ctx->push_wait);
new_ctx->sequence = ctx->sequence + 1;
new_ctx->cil = cil;
cil->xc_ctx = new_ctx;
@@ -801,7 +807,7 @@ xlog_cil_push_work(
lvhdr.lv_iovecp = &lhdr;
lvhdr.lv_next = ctx->lv_chain;
- error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
+ error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0, true);
if (error)
goto out_abort_free_ticket;
@@ -839,10 +845,11 @@ restart:
}
spin_unlock(&cil->xc_push_lock);
- /* xfs_log_done always frees the ticket on error. */
- commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, false);
- if (commit_lsn == -1)
- goto out_abort;
+ error = xlog_commit_record(log, tic, &commit_iclog, &commit_lsn);
+ if (error)
+ goto out_abort_free_ticket;
+
+ xfs_log_ticket_ungrant(log, tic);
spin_lock(&commit_iclog->ic_callback_lock);
if (commit_iclog->ic_state == XLOG_STATE_IOERROR) {
@@ -875,7 +882,7 @@ out_skip:
return;
out_abort_free_ticket:
- xfs_log_ticket_put(tic);
+ xfs_log_ticket_ungrant(log, tic);
out_abort:
ASSERT(XLOG_FORCED_SHUTDOWN(log));
xlog_cil_committed(ctx);
@@ -890,7 +897,7 @@ out_abort:
*/
static void
xlog_cil_push_background(
- struct xlog *log)
+ struct xlog *log) __releases(cil->xc_ctx_lock)
{
struct xfs_cil *cil = log->l_cilp;
@@ -904,14 +911,36 @@ xlog_cil_push_background(
* don't do a background push if we haven't used up all the
* space available yet.
*/
- if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+ if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) {
+ up_read(&cil->xc_ctx_lock);
return;
+ }
spin_lock(&cil->xc_push_lock);
if (cil->xc_push_seq < cil->xc_current_sequence) {
cil->xc_push_seq = cil->xc_current_sequence;
queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
}
+
+ /*
+ * Drop the context lock now, we can't hold that if we need to sleep
+ * because we are over the blocking threshold. The push_lock is still
+ * held, so blocking threshold sleep/wakeup is still correctly
+ * serialised here.
+ */
+ up_read(&cil->xc_ctx_lock);
+
+ /*
+ * If we are well over the space limit, throttle the work that is being
+ * done until the push work on this context has begun.
+ */
+ if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) {
+ trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket);
+ ASSERT(cil->xc_ctx->space_used < log->l_logsize);
+ xlog_wait(&cil->xc_ctx->push_wait, &cil->xc_push_lock);
+ return;
+ }
+
spin_unlock(&cil->xc_push_lock);
}
@@ -1007,7 +1036,10 @@ xfs_log_commit_cil(
if (commit_lsn)
*commit_lsn = xc_commit_lsn;
- xfs_log_done(mp, tp->t_ticket, NULL, regrant);
+ if (regrant && !XLOG_FORCED_SHUTDOWN(log))
+ xfs_log_ticket_regrant(log, tp->t_ticket);
+ else
+ xfs_log_ticket_ungrant(log, tp->t_ticket);
tp->t_ticket = NULL;
xfs_trans_unreserve_and_mod_sb(tp);
@@ -1028,9 +1060,9 @@ xfs_log_commit_cil(
if (lip->li_ops->iop_committing)
lip->li_ops->iop_committing(lip, xc_commit_lsn);
}
- xlog_cil_push_background(log);
- up_read(&cil->xc_ctx_lock);
+ /* xlog_cil_push_background() releases cil->xc_ctx_lock */
+ xlog_cil_push_background(log);
}
/*
@@ -1189,6 +1221,7 @@ xlog_cil_init(
INIT_LIST_HEAD(&ctx->committing);
INIT_LIST_HEAD(&ctx->busy_extents);
+ init_waitqueue_head(&ctx->push_wait);
ctx->sequence = 1;
ctx->cil = cil;
cil->xc_ctx = ctx;
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 2b0aec37e73e..ec22c7a3867f 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -51,13 +51,11 @@ enum xlog_iclog_state {
};
/*
- * Flags to log ticket
+ * Log ticket flags
*/
-#define XLOG_TIC_INITED 0x1 /* has been initialized */
-#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */
+#define XLOG_TIC_PERM_RESERV 0x1 /* permanent reservation */
#define XLOG_TIC_FLAGS \
- { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \
{ XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
/*
@@ -242,6 +240,7 @@ struct xfs_cil_ctx {
struct xfs_log_vec *lv_chain; /* logvecs being pushed */
struct list_head iclog_entry;
struct list_head committing; /* ctx committing list */
+ wait_queue_head_t push_wait; /* background push throttle */
struct work_struct discard_endio_work;
};
@@ -318,13 +317,53 @@ struct xfs_cil {
* tries to keep 25% of the log free, so we need to keep below that limit or we
* risk running out of free log space to start any new transactions.
*
- * In order to keep background CIL push efficient, we will set a lower
- * threshold at which background pushing is attempted without blocking current
- * transaction commits. A separate, higher bound defines when CIL pushes are
- * enforced to ensure we stay within our maximum checkpoint size bounds.
- * threshold, yet give us plenty of space for aggregation on large logs.
+ * In order to keep background CIL push efficient, we only need to ensure the
+ * CIL is large enough to maintain sufficient in-memory relogging to avoid
+ * repeated physical writes of frequently modified metadata. If we allow the CIL
+ * to grow to a substantial fraction of the log, then we may be pinning hundreds
+ * of megabytes of metadata in memory until the CIL flushes. This can cause
+ * issues when we are running low on memory - pinned memory cannot be reclaimed,
+ * and the CIL consumes a lot of memory. Hence we need to set an upper physical
+ * size limit for the CIL that limits the maximum amount of memory pinned by the
+ * CIL but does not limit performance by reducing relogging efficiency
+ * significantly.
+ *
+ * As such, the CIL push threshold ends up being the smaller of two thresholds:
+ * - a threshold large enough that it allows CIL to be pushed and progress to be
+ * made without excessive blocking of incoming transaction commits. This is
+ * defined to be 12.5% of the log space - half the 25% push threshold of the
+ * AIL.
+ * - small enough that it doesn't pin excessive amounts of memory but maintains
+ * close to peak relogging efficiency. This is defined to be 16x the iclog
+ * buffer window (32MB) as measurements have shown this to be roughly the
+ * point of diminishing performance increases under highly concurrent
+ * modification workloads.
+ *
+ * To prevent the CIL from overflowing upper commit size bounds, we introduce a
+ * new threshold at which we block committing transactions until the background
+ * CIL commit commences and switches to a new context. While this is not a hard
+ * limit, it forces the process committing a transaction to the CIL to block and
+ * yeild the CPU, giving the CIL push work a chance to be scheduled and start
+ * work. This prevents a process running lots of transactions from overfilling
+ * the CIL because it is not yielding the CPU. We set the blocking limit at
+ * twice the background push space threshold so we keep in line with the AIL
+ * push thresholds.
+ *
+ * Note: this is not a -hard- limit as blocking is applied after the transaction
+ * is inserted into the CIL and the push has been triggered. It is largely a
+ * throttling mechanism that allows the CIL push to be scheduled and run. A hard
+ * limit will be difficult to implement without introducing global serialisation
+ * in the CIL commit fast path, and it's not at all clear that we actually need
+ * such hard limits given the ~7 years we've run without a hard limit before
+ * finding the first situation where a checkpoint size overflow actually
+ * occurred. Hence the simple throttle, and an ASSERT check to tell us that
+ * we've overrun the max size.
*/
-#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3)
+#define XLOG_CIL_SPACE_LIMIT(log) \
+ min_t(int, (log)->l_logsize >> 3, BBTOB(XLOG_TOTAL_REC_SHIFT(log)) << 4)
+
+#define XLOG_CIL_BLOCKING_SPACE_LIMIT(log) \
+ (XLOG_CIL_SPACE_LIMIT(log) * 2)
/*
* ticket grant locks, queues and accounting have their own cachlines
@@ -439,14 +478,14 @@ xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
void xlog_print_trans(struct xfs_trans *);
-int
-xlog_write(
- struct xlog *log,
- struct xfs_log_vec *log_vector,
- struct xlog_ticket *tic,
- xfs_lsn_t *start_lsn,
- struct xlog_in_core **commit_iclog,
- uint flags);
+int xlog_write(struct xlog *log, struct xfs_log_vec *log_vector,
+ struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
+ struct xlog_in_core **commit_iclog, uint flags,
+ bool need_start_rec);
+int xlog_commit_record(struct xlog *log, struct xlog_ticket *ticket,
+ struct xlog_in_core **iclog, xfs_lsn_t *lsn);
+void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket);
+void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);
/*
* When we crack an atomic LSN, we sample it first so that the value will not
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 88ab09ed29e7..50c43422fa17 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -167,6 +167,7 @@ typedef struct xfs_mount {
struct xfs_kobj m_error_meta_kobj;
struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX];
struct xstats m_stats; /* per-fs stats */
+ struct ratelimit_state m_flush_inodes_ratelimit;
struct workqueue_struct *m_buf_workqueue;
struct workqueue_struct *m_unwritten_workqueue;
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index cabdb755adae..c225691fad15 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -121,12 +121,11 @@ xfs_qm_dqpurge(
{
struct xfs_mount *mp = dqp->q_mount;
struct xfs_quotainfo *qi = mp->m_quotainfo;
+ int error = -EAGAIN;
xfs_dqlock(dqp);
- if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) {
- xfs_dqunlock(dqp);
- return -EAGAIN;
- }
+ if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0)
+ goto out_unlock;
dqp->dq_flags |= XFS_DQ_FREEING;
@@ -139,7 +138,6 @@ xfs_qm_dqpurge(
*/
if (XFS_DQ_IS_DIRTY(dqp)) {
struct xfs_buf *bp = NULL;
- int error;
/*
* We don't care about getting disk errors here. We need
@@ -149,6 +147,8 @@ xfs_qm_dqpurge(
if (!error) {
error = xfs_bwrite(bp);
xfs_buf_relse(bp);
+ } else if (error == -EAGAIN) {
+ goto out_unlock;
}
xfs_dqflock(dqp);
}
@@ -174,6 +174,10 @@ xfs_qm_dqpurge(
xfs_qm_dqdestroy(dqp);
return 0;
+
+out_unlock:
+ xfs_dqunlock(dqp);
+ return error;
}
/*
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 2094386af8ac..abf06bf9c3f3 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -528,6 +528,9 @@ xfs_flush_inodes(
{
struct super_block *sb = mp->m_super;
+ if (!__ratelimit(&mp->m_flush_inodes_ratelimit))
+ return;
+
if (down_read_trylock(&sb->s_umount)) {
sync_inodes_sb(sb);
up_read(&sb->s_umount);
@@ -1366,6 +1369,17 @@ xfs_fc_fill_super(
if (error)
goto out_free_names;
+ /*
+ * Cap the number of invocations of xfs_flush_inodes to 16 for every
+ * quarter of a second. The magic numbers here were determined by
+ * observation neither to cause stalls in writeback when there are a
+ * lot of IO threads and the fs is near ENOSPC, nor cause any fstest
+ * regressions. YMMV.
+ */
+ ratelimit_state_init(&mp->m_flush_inodes_ratelimit, HZ / 4, 16);
+ ratelimit_set_flags(&mp->m_flush_inodes_ratelimit,
+ RATELIMIT_MSG_ON_RELEASE);
+
error = xfs_init_mount_workqueues(mp);
if (error)
goto out_close_devices;
@@ -1861,7 +1875,8 @@ xfs_init_zones(void)
xfs_ili_zone = kmem_cache_create("xfs_ili",
sizeof(struct xfs_inode_log_item), 0,
- SLAB_MEM_SPREAD, NULL);
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ NULL);
if (!xfs_ili_zone)
goto out_destroy_inode_zone;
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index fa0fa3c70f1a..13fb4b919648 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -176,7 +176,6 @@ xfs_symlink(
return -ENAMETOOLONG;
ASSERT(pathlen > 0);
- udqp = gdqp = NULL;
prid = xfs_get_initial_prid(dp);
/*
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index efc7751550d9..a4323a63438d 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1001,8 +1001,6 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
DEFINE_EVENT(xfs_loggrant_class, name, \
TP_PROTO(struct xlog *log, struct xlog_ticket *tic), \
TP_ARGS(log, tic))
-DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm);
-DEFINE_LOGGRANT_EVENT(xfs_log_done_perm);
DEFINE_LOGGRANT_EVENT(xfs_log_umount_write);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake);
@@ -1011,12 +1009,13 @@ DEFINE_LOGGRANT_EVENT(xfs_log_reserve);
DEFINE_LOGGRANT_EVENT(xfs_log_reserve_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_exit);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
-DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter);
-DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit);
-DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant_sub);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_sub);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_cil_wait);
DECLARE_EVENT_CLASS(xfs_log_item_class,
TP_PROTO(struct xfs_log_item *lip),
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 1adc6bc53a56..28b983ff8b11 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -9,6 +9,7 @@
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
+#include "xfs_log_priv.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_extent_busy.h"
@@ -150,8 +151,9 @@ xfs_trans_reserve(
uint blocks,
uint rtextents)
{
- int error = 0;
- bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+ struct xfs_mount *mp = tp->t_mountp;
+ int error = 0;
+ bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
/* Mark this thread as being in a transaction */
current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
@@ -162,7 +164,7 @@ xfs_trans_reserve(
* fail if the count would go below zero.
*/
if (blocks > 0) {
- error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd);
+ error = xfs_mod_fdblocks(mp, -((int64_t)blocks), rsvd);
if (error != 0) {
current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
return -ENOSPC;
@@ -191,9 +193,9 @@ xfs_trans_reserve(
if (tp->t_ticket != NULL) {
ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES);
- error = xfs_log_regrant(tp->t_mountp, tp->t_ticket);
+ error = xfs_log_regrant(mp, tp->t_ticket);
} else {
- error = xfs_log_reserve(tp->t_mountp,
+ error = xfs_log_reserve(mp,
resp->tr_logres,
resp->tr_logcount,
&tp->t_ticket, XFS_TRANSACTION,
@@ -213,7 +215,7 @@ xfs_trans_reserve(
* fail if the count would go below zero.
*/
if (rtextents > 0) {
- error = xfs_mod_frextents(tp->t_mountp, -((int64_t)rtextents));
+ error = xfs_mod_frextents(mp, -((int64_t)rtextents));
if (error) {
error = -ENOSPC;
goto undo_log;
@@ -229,7 +231,7 @@ xfs_trans_reserve(
*/
undo_log:
if (resp->tr_logres > 0) {
- xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, false);
+ xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
tp->t_ticket = NULL;
tp->t_log_res = 0;
tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES;
@@ -237,7 +239,7 @@ undo_log:
undo_blocks:
if (blocks > 0) {
- xfs_mod_fdblocks(tp->t_mountp, (int64_t)blocks, rsvd);
+ xfs_mod_fdblocks(mp, (int64_t)blocks, rsvd);
tp->t_blk_res = 0;
}
@@ -1004,9 +1006,10 @@ out_unreserve:
*/
xfs_trans_unreserve_and_mod_dquots(tp);
if (tp->t_ticket) {
- commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, regrant);
- if (commit_lsn == -1 && !error)
- error = -EIO;
+ if (regrant && !XLOG_FORCED_SHUTDOWN(mp->m_log))
+ xfs_log_ticket_regrant(mp->m_log, tp->t_ticket);
+ else
+ xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
tp->t_ticket = NULL;
}
current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
@@ -1065,7 +1068,7 @@ xfs_trans_cancel(
xfs_trans_unreserve_and_mod_dquots(tp);
if (tp->t_ticket) {
- xfs_log_done(mp, tp->t_ticket, NULL, false);
+ xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
tp->t_ticket = NULL;
}
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 2ef0dfbfb303..564253550b75 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -109,17 +109,25 @@ xfs_ail_next(
* We need the AIL lock in order to get a coherent read of the lsn of the last
* item in the AIL.
*/
+static xfs_lsn_t
+__xfs_ail_min_lsn(
+ struct xfs_ail *ailp)
+{
+ struct xfs_log_item *lip = xfs_ail_min(ailp);
+
+ if (lip)
+ return lip->li_lsn;
+ return 0;
+}
+
xfs_lsn_t
xfs_ail_min_lsn(
struct xfs_ail *ailp)
{
- xfs_lsn_t lsn = 0;
- struct xfs_log_item *lip;
+ xfs_lsn_t lsn;
spin_lock(&ailp->ail_lock);
- lip = xfs_ail_min(ailp);
- if (lip)
- lsn = lip->li_lsn;
+ lsn = __xfs_ail_min_lsn(ailp);
spin_unlock(&ailp->ail_lock);
return lsn;
@@ -681,6 +689,28 @@ xfs_ail_push_all_sync(
finish_wait(&ailp->ail_empty, &wait);
}
+void
+xfs_ail_update_finish(
+ struct xfs_ail *ailp,
+ xfs_lsn_t old_lsn) __releases(ailp->ail_lock)
+{
+ struct xfs_mount *mp = ailp->ail_mount;
+
+ /* if the tail lsn hasn't changed, don't do updates or wakeups. */
+ if (!old_lsn || old_lsn == __xfs_ail_min_lsn(ailp)) {
+ spin_unlock(&ailp->ail_lock);
+ return;
+ }
+
+ if (!XFS_FORCED_SHUTDOWN(mp))
+ xlog_assign_tail_lsn_locked(mp);
+
+ if (list_empty(&ailp->ail_head))
+ wake_up_all(&ailp->ail_empty);
+ spin_unlock(&ailp->ail_lock);
+ xfs_log_space_wake(mp);
+}
+
/*
* xfs_trans_ail_update - bulk AIL insertion operation.
*
@@ -712,7 +742,7 @@ xfs_trans_ail_update_bulk(
xfs_lsn_t lsn) __releases(ailp->ail_lock)
{
struct xfs_log_item *mlip;
- int mlip_changed = 0;
+ xfs_lsn_t tail_lsn = 0;
int i;
LIST_HEAD(tmp);
@@ -727,9 +757,10 @@ xfs_trans_ail_update_bulk(
continue;
trace_xfs_ail_move(lip, lip->li_lsn, lsn);
+ if (mlip == lip && !tail_lsn)
+ tail_lsn = lip->li_lsn;
+
xfs_ail_delete(ailp, lip);
- if (mlip == lip)
- mlip_changed = 1;
} else {
trace_xfs_ail_insert(lip, 0, lsn);
}
@@ -740,23 +771,23 @@ xfs_trans_ail_update_bulk(
if (!list_empty(&tmp))
xfs_ail_splice(ailp, cur, &tmp, lsn);
- if (mlip_changed) {
- if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount))
- xlog_assign_tail_lsn_locked(ailp->ail_mount);
- spin_unlock(&ailp->ail_lock);
-
- xfs_log_space_wake(ailp->ail_mount);
- } else {
- spin_unlock(&ailp->ail_lock);
- }
+ xfs_ail_update_finish(ailp, tail_lsn);
}
-bool
+/*
+ * Delete one log item from the AIL.
+ *
+ * If this item was at the tail of the AIL, return the LSN of the log item so
+ * that we can use it to check if the LSN of the tail of the log has moved
+ * when finishing up the AIL delete process in xfs_ail_update_finish().
+ */
+xfs_lsn_t
xfs_ail_delete_one(
struct xfs_ail *ailp,
struct xfs_log_item *lip)
{
struct xfs_log_item *mlip = xfs_ail_min(ailp);
+ xfs_lsn_t lsn = lip->li_lsn;
trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
xfs_ail_delete(ailp, lip);
@@ -764,7 +795,9 @@ xfs_ail_delete_one(
clear_bit(XFS_LI_IN_AIL, &lip->li_flags);
lip->li_lsn = 0;
- return mlip == lip;
+ if (mlip == lip)
+ return lsn;
+ return 0;
}
/**
@@ -792,10 +825,10 @@ void
xfs_trans_ail_delete(
struct xfs_ail *ailp,
struct xfs_log_item *lip,
- int shutdown_type) __releases(ailp->ail_lock)
+ int shutdown_type)
{
struct xfs_mount *mp = ailp->ail_mount;
- bool mlip_changed;
+ xfs_lsn_t tail_lsn;
if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) {
spin_unlock(&ailp->ail_lock);
@@ -808,17 +841,8 @@ xfs_trans_ail_delete(
return;
}
- mlip_changed = xfs_ail_delete_one(ailp, lip);
- if (mlip_changed) {
- if (!XFS_FORCED_SHUTDOWN(mp))
- xlog_assign_tail_lsn_locked(mp);
- if (list_empty(&ailp->ail_head))
- wake_up_all(&ailp->ail_empty);
- }
-
- spin_unlock(&ailp->ail_lock);
- if (mlip_changed)
- xfs_log_space_wake(ailp->ail_mount);
+ tail_lsn = xfs_ail_delete_one(ailp, lip);
+ xfs_ail_update_finish(ailp, tail_lsn);
}
int
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 2e073c1c4614..35655eac01a6 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -91,9 +91,11 @@ xfs_trans_ail_update(
xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn);
}
-bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip);
+xfs_lsn_t xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip);
+void xfs_ail_update_finish(struct xfs_ail *ailp, xfs_lsn_t old_lsn)
+ __releases(ailp->ail_lock);
void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip,
- int shutdown_type) __releases(ailp->ail_lock);
+ int shutdown_type);
static inline void
xfs_trans_ail_remove(
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index e2e2bef07dd2..329b8c8ca703 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -10,6 +10,7 @@
#include <linux/mm_types.h>
#include <linux/bug.h>
#include <linux/errno.h>
+#include <asm-generic/pgtable_uffd.h>
#if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \
defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS
diff --git a/include/asm-generic/pgtable_uffd.h b/include/asm-generic/pgtable_uffd.h
new file mode 100644
index 000000000000..828966d4c281
--- /dev/null
+++ b/include/asm-generic/pgtable_uffd.h
@@ -0,0 +1,66 @@
+#ifndef _ASM_GENERIC_PGTABLE_UFFD_H
+#define _ASM_GENERIC_PGTABLE_UFFD_H
+
+#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
+static __always_inline int pte_uffd_wp(pte_t pte)
+{
+ return 0;
+}
+
+static __always_inline int pmd_uffd_wp(pmd_t pmd)
+{
+ return 0;
+}
+
+static __always_inline pte_t pte_mkuffd_wp(pte_t pte)
+{
+ return pte;
+}
+
+static __always_inline pmd_t pmd_mkuffd_wp(pmd_t pmd)
+{
+ return pmd;
+}
+
+static __always_inline pte_t pte_clear_uffd_wp(pte_t pte)
+{
+ return pte;
+}
+
+static __always_inline pmd_t pmd_clear_uffd_wp(pmd_t pmd)
+{
+ return pmd;
+}
+
+static __always_inline pte_t pte_swp_mkuffd_wp(pte_t pte)
+{
+ return pte;
+}
+
+static __always_inline int pte_swp_uffd_wp(pte_t pte)
+{
+ return 0;
+}
+
+static __always_inline pte_t pte_swp_clear_uffd_wp(pte_t pte)
+{
+ return pte;
+}
+
+static inline pmd_t pmd_swp_mkuffd_wp(pmd_t pmd)
+{
+ return pmd;
+}
+
+static inline int pmd_swp_uffd_wp(pmd_t pmd)
+{
+ return 0;
+}
+
+static inline pmd_t pmd_swp_clear_uffd_wp(pmd_t pmd)
+{
+ return pmd;
+}
+#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
+
+#endif /* _ASM_GENERIC_PGTABLE_UFFD_H */
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index f391f6b500b4..3f1649a8cf55 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -13,6 +13,7 @@
#include <linux/mmu_notifier.h>
#include <linux/swap.h>
+#include <linux/hugetlb_inline.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/cacheflush.h>
@@ -398,7 +399,7 @@ tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma)
* We rely on tlb_end_vma() to issue a flush, such that when we reset
* these values the batch is empty.
*/
- tlb->vma_huge = !!(vma->vm_flags & VM_HUGETLB);
+ tlb->vma_huge = is_vm_hugetlb_page(vma);
tlb->vma_exec = !!(vma->vm_flags & VM_EXEC);
}
diff --git a/include/clocksource/timer-ti-dm.h b/include/clocksource/timer-ti-dm.h
index 25f05235866e..531ca87fcd08 100644
--- a/include/clocksource/timer-ti-dm.h
+++ b/include/clocksource/timer-ti-dm.h
@@ -248,8 +248,7 @@ int omap_dm_timers_active(void);
/*
* The below are inlined to optimize code size for system timers. Other code
- * should not need these at all, see
- * include/linux/platform_data/pwm_omap_dmtimer.h
+ * should not need these at all.
*/
#if defined(CONFIG_ARCH_OMAP1) || defined(CONFIG_ARCH_OMAP2PLUS)
static inline u32 __omap_dm_timer_read(struct omap_dm_timer *timer, u32 reg,
diff --git a/include/crypto/curve25519.h b/include/crypto/curve25519.h
index 9ecb3c1f0f15..4e6dc840b159 100644
--- a/include/crypto/curve25519.h
+++ b/include/crypto/curve25519.h
@@ -33,8 +33,7 @@ bool __must_check curve25519(u8 mypublic[CURVE25519_KEY_SIZE],
const u8 secret[CURVE25519_KEY_SIZE],
const u8 basepoint[CURVE25519_KEY_SIZE])
{
- if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519) &&
- (!IS_ENABLED(CONFIG_CRYPTO_CURVE25519_X86) || IS_ENABLED(CONFIG_AS_ADX)))
+ if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519))
curve25519_arch(mypublic, secret, basepoint);
else
curve25519_generic(mypublic, secret, basepoint);
@@ -50,8 +49,7 @@ __must_check curve25519_generate_public(u8 pub[CURVE25519_KEY_SIZE],
CURVE25519_KEY_SIZE)))
return false;
- if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519) &&
- (!IS_ENABLED(CONFIG_CRYPTO_CURVE25519_X86) || IS_ENABLED(CONFIG_AS_ADX)))
+ if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519))
curve25519_base_arch(pub, secret);
else
curve25519_generic(pub, secret, curve25519_base_point);
diff --git a/include/drm/bridge/analogix_dp.h b/include/drm/bridge/analogix_dp.h
index 7aa2f93da49c..b0dcc07334a1 100644
--- a/include/drm/bridge/analogix_dp.h
+++ b/include/drm/bridge/analogix_dp.h
@@ -42,9 +42,10 @@ int analogix_dp_resume(struct analogix_dp_device *dp);
int analogix_dp_suspend(struct analogix_dp_device *dp);
struct analogix_dp_device *
-analogix_dp_bind(struct device *dev, struct drm_device *drm_dev,
- struct analogix_dp_plat_data *plat_data);
+analogix_dp_probe(struct device *dev, struct analogix_dp_plat_data *plat_data);
+int analogix_dp_bind(struct analogix_dp_device *dp, struct drm_device *drm_dev);
void analogix_dp_unbind(struct analogix_dp_device *dp);
+void analogix_dp_remove(struct analogix_dp_device *dp);
int analogix_dp_start_crc(struct drm_connector *connector);
int analogix_dp_stop_crc(struct drm_connector *connector);
diff --git a/include/drm/drm_legacy.h b/include/drm/drm_legacy.h
index dcef3598f49e..aed382c17b26 100644
--- a/include/drm/drm_legacy.h
+++ b/include/drm/drm_legacy.h
@@ -136,7 +136,7 @@ struct drm_sg_mem {
* Kernel side of a mapping
*/
struct drm_local_map {
- resource_size_t offset; /**< Requested physical address (0 for SAREA)*/
+ dma_addr_t offset; /**< Requested physical address (0 for SAREA)*/
unsigned long size; /**< Requested physical size (bytes) */
enum drm_map_type type; /**< Type of memory to map */
enum drm_map_flags flags; /**< Flags */
diff --git a/include/dt-bindings/clock/k210-clk.h b/include/dt-bindings/clock/k210-clk.h
new file mode 100644
index 000000000000..5a2fd64d1a49
--- /dev/null
+++ b/include/dt-bindings/clock/k210-clk.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2019-20 Sean Anderson <seanga2@gmail.com>
+ * Copyright (c) 2020 Western Digital Corporation or its affiliates.
+ */
+#ifndef K210_CLK_H
+#define K210_CLK_H
+
+/*
+ * Arbitrary identifiers for clocks.
+ * The structure is: in0 -> pll0 -> aclk -> cpu
+ *
+ * Since we use the hardware defaults for now, set all these to the same clock.
+ */
+#define K210_CLK_PLL0 0
+#define K210_CLK_PLL1 0
+#define K210_CLK_ACLK 0
+#define K210_CLK_CPU 0
+
+#endif /* K210_CLK_H */
diff --git a/include/dt-bindings/leds/common.h b/include/dt-bindings/leds/common.h
index 9e1256a7c1bf..0ce7dfc00dcb 100644
--- a/include/dt-bindings/leds/common.h
+++ b/include/dt-bindings/leds/common.h
@@ -6,6 +6,7 @@
* Author: Jacek Anaszewski <j.anaszewski@samsung.com>
*
* Copyright (C) 2019 Jacek Anaszewski <jacek.anaszewski@gmail.com>
+ * Copyright (C) 2020 Pavel Machek <pavel@ucw.cz>
*/
#ifndef __DT_BINDINGS_LEDS_H
@@ -32,16 +33,38 @@
#define LED_COLOR_ID_MAX 8
/* Standard LED functions */
+/* Keyboard LEDs, usually it would be input4::capslock etc. */
+/* Obsolete equivalent: "shift-key-light" */
+#define LED_FUNCTION_CAPSLOCK "capslock"
+#define LED_FUNCTION_SCROLLLOCK "scrolllock"
+#define LED_FUNCTION_NUMLOCK "numlock"
+/* Obsolete equivalents: "tpacpi::thinklight" (IBM/Lenovo Thinkpads),
+ "lp5523:kb{1,2,3,4,5,6}" (Nokia N900) */
+#define LED_FUNCTION_KBD_BACKLIGHT "kbd_backlight"
+
+/* System LEDs, usually found on system body.
+ platform::mute (etc) is sometimes seen, :mute would be better */
+#define LED_FUNCTION_POWER "power"
+#define LED_FUNCTION_DISK "disk"
+
+/* Obsolete: "platform:*:charging" (allwinner sun50i) */
+#define LED_FUNCTION_CHARGING "charging"
+/* Used RGB notification LEDs common on phones.
+ Obsolete equivalents: "status-led:{red,green,blue}" (Motorola Droid 4),
+ "lp5523:{r,g,b}" (Nokia N900) */
+#define LED_FUNCTION_STATUS "status"
+
+#define LED_FUNCTION_MICMUTE "micmute"
+#define LED_FUNCTION_MUTE "mute"
+
+/* Miscelleaus functions. Use functions above if you can. */
#define LED_FUNCTION_ACTIVITY "activity"
#define LED_FUNCTION_ALARM "alarm"
#define LED_FUNCTION_BACKLIGHT "backlight"
#define LED_FUNCTION_BLUETOOTH "bluetooth"
#define LED_FUNCTION_BOOT "boot"
#define LED_FUNCTION_CPU "cpu"
-#define LED_FUNCTION_CAPSLOCK "capslock"
-#define LED_FUNCTION_CHARGING "charging"
#define LED_FUNCTION_DEBUG "debug"
-#define LED_FUNCTION_DISK "disk"
#define LED_FUNCTION_DISK_ACTIVITY "disk-activity"
#define LED_FUNCTION_DISK_ERR "disk-err"
#define LED_FUNCTION_DISK_READ "disk-read"
@@ -50,21 +73,14 @@
#define LED_FUNCTION_FLASH "flash"
#define LED_FUNCTION_HEARTBEAT "heartbeat"
#define LED_FUNCTION_INDICATOR "indicator"
-#define LED_FUNCTION_KBD_BACKLIGHT "kbd_backlight"
#define LED_FUNCTION_LAN "lan"
#define LED_FUNCTION_MAIL "mail"
#define LED_FUNCTION_MTD "mtd"
-#define LED_FUNCTION_MICMUTE "micmute"
-#define LED_FUNCTION_MUTE "mute"
-#define LED_FUNCTION_NUMLOCK "numlock"
#define LED_FUNCTION_PANIC "panic"
#define LED_FUNCTION_PROGRAMMING "programming"
-#define LED_FUNCTION_POWER "power"
#define LED_FUNCTION_RX "rx"
#define LED_FUNCTION_SD "sd"
-#define LED_FUNCTION_SCROLLLOCK "scrolllock"
#define LED_FUNCTION_STANDBY "standby"
-#define LED_FUNCTION_STATUS "status"
#define LED_FUNCTION_TORCH "torch"
#define LED_FUNCTION_TX "tx"
#define LED_FUNCTION_USB "usb"
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 9f70b7807d53..d661cd0ee64d 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -416,9 +416,30 @@ extern void acpi_osi_setup(char *str);
extern bool acpi_osi_is_win8(void);
#ifdef CONFIG_ACPI_NUMA
-int acpi_map_pxm_to_online_node(int pxm);
int acpi_map_pxm_to_node(int pxm);
int acpi_get_node(acpi_handle handle);
+
+/**
+ * acpi_map_pxm_to_online_node - Map proximity ID to online node
+ * @pxm: ACPI proximity ID
+ *
+ * This is similar to acpi_map_pxm_to_node(), but always returns an online
+ * node. When the mapped node from a given proximity ID is offline, it
+ * looks up the node distance table and returns the nearest online node.
+ *
+ * ACPI device drivers, which are called after the NUMA initialization has
+ * completed in the kernel, can call this interface to obtain their device
+ * NUMA topology from ACPI tables. Such drivers do not have to deal with
+ * offline nodes. A node may be offline when a device proximity ID is
+ * unique, SRAT memory entry does not exist, or NUMA is disabled, ex.
+ * "numa=off" on x86.
+ */
+static inline int acpi_map_pxm_to_online_node(int pxm)
+{
+ int node = acpi_map_pxm_to_node(pxm);
+
+ return numa_map_to_online_node(node);
+}
#else
static inline int acpi_map_pxm_to_online_node(int pxm)
{
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 47f54b459c26..9acf654f0b19 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -162,7 +162,7 @@ static inline __u8 ror8(__u8 word, unsigned int shift)
*
* This is safe to use for 16- and 8-bit types as well.
*/
-static inline __s32 sign_extend32(__u32 value, int index)
+static __always_inline __s32 sign_extend32(__u32 value, int index)
{
__u8 shift = 31 - index;
return (__s32)(value << shift) >> shift;
@@ -173,7 +173,7 @@ static inline __s32 sign_extend32(__u32 value, int index)
* @value: value to sign extend
* @index: 0 based bit index (0<=index<64) to sign bit
*/
-static inline __s64 sign_extend64(__u64 value, int index)
+static __always_inline __s64 sign_extend64(__u64 value, int index)
{
__u8 shift = 63 - index;
return (__s64)(value << shift) >> shift;
diff --git a/include/linux/bits.h b/include/linux/bits.h
index a740bbcf3cd2..4671fbf28842 100644
--- a/include/linux/bits.h
+++ b/include/linux/bits.h
@@ -18,12 +18,30 @@
* position @h. For example
* GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000.
*/
-#define GENMASK(h, l) \
+#if !defined(__ASSEMBLY__) && \
+ (!defined(CONFIG_CC_IS_GCC) || CONFIG_GCC_VERSION >= 49000)
+#include <linux/build_bug.h>
+#define GENMASK_INPUT_CHECK(h, l) \
+ (BUILD_BUG_ON_ZERO(__builtin_choose_expr( \
+ __builtin_constant_p((l) > (h)), (l) > (h), 0)))
+#else
+/*
+ * BUILD_BUG_ON_ZERO is not available in h files included from asm files,
+ * disable the input check if that is the case.
+ */
+#define GENMASK_INPUT_CHECK(h, l) 0
+#endif
+
+#define __GENMASK(h, l) \
(((~UL(0)) - (UL(1) << (l)) + 1) & \
(~UL(0) >> (BITS_PER_LONG - 1 - (h))))
+#define GENMASK(h, l) \
+ (GENMASK_INPUT_CHECK(h, l) + __GENMASK(h, l))
-#define GENMASK_ULL(h, l) \
+#define __GENMASK_ULL(h, l) \
(((~ULL(0)) - (ULL(1) << (l)) + 1) & \
(~ULL(0) >> (BITS_PER_LONG_LONG - 1 - (h))))
+#define GENMASK_ULL(h, l) \
+ (GENMASK_INPUT_CHECK(h, l) + __GENMASK_ULL(h, l))
#endif /* __LINUX_BITS_H */
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index e4a6949fd171..35f8ffe92b70 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -46,6 +46,7 @@ struct blkcg_gq;
struct blkcg {
struct cgroup_subsys_state css;
spinlock_t lock;
+ refcount_t online_pin;
struct radix_tree_root blkg_tree;
struct blkcg_gq __rcu *blkg_hint;
@@ -56,7 +57,6 @@ struct blkcg {
struct list_head all_blkcgs_node;
#ifdef CONFIG_CGROUP_WRITEBACK
struct list_head cgwb_list;
- refcount_t cgwb_refcnt;
#endif
};
@@ -412,47 +412,38 @@ static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd)
extern void blkcg_destroy_blkgs(struct blkcg *blkcg);
-#ifdef CONFIG_CGROUP_WRITEBACK
-
/**
- * blkcg_cgwb_get - get a reference for blkcg->cgwb_list
+ * blkcg_pin_online - pin online state
* @blkcg: blkcg of interest
*
- * This is used to track the number of active wb's related to a blkcg.
+ * While pinned, a blkcg is kept online. This is primarily used to
+ * impedance-match blkg and cgwb lifetimes so that blkg doesn't go offline
+ * while an associated cgwb is still active.
*/
-static inline void blkcg_cgwb_get(struct blkcg *blkcg)
+static inline void blkcg_pin_online(struct blkcg *blkcg)
{
- refcount_inc(&blkcg->cgwb_refcnt);
+ refcount_inc(&blkcg->online_pin);
}
/**
- * blkcg_cgwb_put - put a reference for @blkcg->cgwb_list
+ * blkcg_unpin_online - unpin online state
* @blkcg: blkcg of interest
*
- * This is used to track the number of active wb's related to a blkcg.
- * When this count goes to zero, all active wb has finished so the
+ * This is primarily used to impedance-match blkg and cgwb lifetimes so
+ * that blkg doesn't go offline while an associated cgwb is still active.
+ * When this count goes to zero, all active cgwbs have finished so the
* blkcg can continue destruction by calling blkcg_destroy_blkgs().
- * This work may occur in cgwb_release_workfn() on the cgwb_release
- * workqueue.
*/
-static inline void blkcg_cgwb_put(struct blkcg *blkcg)
+static inline void blkcg_unpin_online(struct blkcg *blkcg)
{
- if (refcount_dec_and_test(&blkcg->cgwb_refcnt))
+ do {
+ if (!refcount_dec_and_test(&blkcg->online_pin))
+ break;
blkcg_destroy_blkgs(blkcg);
+ blkcg = blkcg_parent(blkcg);
+ } while (blkcg);
}
-#else
-
-static inline void blkcg_cgwb_get(struct blkcg *blkcg) { }
-
-static inline void blkcg_cgwb_put(struct blkcg *blkcg)
-{
- /* wb isn't being accounted, so trigger destruction right away */
- blkcg_destroy_blkgs(blkcg);
-}
-
-#endif
-
/**
* blkg_path - format cgroup path of blkg
* @blkg: blkg of interest
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index cb21c5cf12c3..ebf5ba62b772 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -444,8 +444,9 @@ union ceph_mds_request_args {
} __attribute__ ((packed)) lookupino;
} __attribute__ ((packed));
-#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
-#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
+#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
+#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
+#define CEPH_MDS_FLAG_ASYNC 4 /* request is asynchronous */
struct ceph_mds_request_head {
__le64 oldest_client_tid;
@@ -530,6 +531,9 @@ struct ceph_mds_reply_lease {
__le32 seq;
} __attribute__ ((packed));
+#define CEPH_LEASE_VALID (1 | 2) /* old and new bit values */
+#define CEPH_LEASE_PRIMARY_LINK 4 /* primary linkage */
+
struct ceph_mds_reply_dirfrag {
__le32 frag; /* fragment */
__le32 auth; /* auth mds, if this is a delegation point */
@@ -564,6 +568,7 @@ struct ceph_filelock {
#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
#define CEPH_FILE_MODE_BITS 4
+#define CEPH_FILE_MODE_MASK ((1 << CEPH_FILE_MODE_BITS) - 1)
int ceph_flags_to_mode(int flags);
@@ -655,10 +660,19 @@ int ceph_flags_to_mode(int flags);
#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \
CEPH_CAP_PIN)
+#define CEPH_CAP_ALL_FILE (CEPH_CAP_PIN | CEPH_CAP_ANY_SHARED | \
+ CEPH_CAP_AUTH_EXCL | CEPH_CAP_XATTR_EXCL | \
+ CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)
#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
CEPH_LOCK_IXATTR)
+/* cap masks async dir operations */
+#define CEPH_CAP_DIR_CREATE CEPH_CAP_FILE_CACHE
+#define CEPH_CAP_DIR_UNLINK CEPH_CAP_FILE_RD
+#define CEPH_CAP_ANY_DIR_OPS (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD | \
+ CEPH_CAP_FILE_WREXTEND | CEPH_CAP_FILE_LAZYIO)
+
int ceph_caps_for_mode(int mode);
enum {
diff --git a/include/linux/ceph/debugfs.h b/include/linux/ceph/debugfs.h
index cf5e840eec71..8b3a1a7a953a 100644
--- a/include/linux/ceph/debugfs.h
+++ b/include/linux/ceph/debugfs.h
@@ -2,22 +2,8 @@
#ifndef _FS_CEPH_DEBUGFS_H
#define _FS_CEPH_DEBUGFS_H
-#include <linux/ceph/ceph_debug.h>
#include <linux/ceph/types.h>
-#define CEPH_DEFINE_SHOW_FUNC(name) \
-static int name##_open(struct inode *inode, struct file *file) \
-{ \
- return single_open(file, name, inode->i_private); \
-} \
- \
-static const struct file_operations name##_fops = { \
- .open = name##_open, \
- .read = seq_read, \
- .llseek = seq_lseek, \
- .release = single_release, \
-};
-
/* debugfs.c */
extern void ceph_debugfs_init(void);
extern void ceph_debugfs_cleanup(void);
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index ec73ebc4827d..525b7c3f1c81 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -272,6 +272,7 @@ extern struct kmem_cache *ceph_cap_flush_cachep;
extern struct kmem_cache *ceph_dentry_cachep;
extern struct kmem_cache *ceph_file_cachep;
extern struct kmem_cache *ceph_dir_file_cachep;
+extern struct kmem_cache *ceph_mds_request_cachep;
/* ceph_common.c */
extern bool libceph_compatible(void *data);
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 5a62dbd3f4c2..9d9f745b98a1 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -509,23 +509,6 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
struct page *req_page, size_t req_len,
struct page **resp_pages, size_t *resp_len);
-extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
- struct ceph_vino vino,
- struct ceph_file_layout *layout,
- u64 off, u64 *plen,
- u32 truncate_seq, u64 truncate_size,
- struct page **pages, int nr_pages,
- int page_align);
-
-extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
- struct ceph_vino vino,
- struct ceph_file_layout *layout,
- struct ceph_snap_context *sc,
- u64 off, u64 len,
- u32 truncate_seq, u64 truncate_size,
- struct timespec64 *mtime,
- struct page **pages, int nr_pages);
-
int ceph_osdc_copy_from(struct ceph_osd_client *osdc,
u64 src_snapid, u64 src_version,
struct ceph_object_id *src_oid,
diff --git a/include/linux/cma.h b/include/linux/cma.h
index 190184b5ff32..6ff79fefd01f 100644
--- a/include/linux/cma.h
+++ b/include/linux/cma.h
@@ -4,6 +4,7 @@
#include <linux/init.h>
#include <linux/types.h>
+#include <linux/numa.h>
/*
* There is always at least global CMA area and a few optional
@@ -24,10 +25,19 @@ extern phys_addr_t cma_get_base(const struct cma *cma);
extern unsigned long cma_get_size(const struct cma *cma);
extern const char *cma_get_name(const struct cma *cma);
-extern int __init cma_declare_contiguous(phys_addr_t base,
+extern int __init cma_declare_contiguous_nid(phys_addr_t base,
phys_addr_t size, phys_addr_t limit,
phys_addr_t alignment, unsigned int order_per_bit,
- bool fixed, const char *name, struct cma **res_cma);
+ bool fixed, const char *name, struct cma **res_cma,
+ int nid);
+static inline int __init cma_declare_contiguous(phys_addr_t base,
+ phys_addr_t size, phys_addr_t limit,
+ phys_addr_t alignment, unsigned int order_per_bit,
+ bool fixed, const char *name, struct cma **res_cma)
+{
+ return cma_declare_contiguous_nid(base, size, limit, alignment,
+ order_per_bit, fixed, name, res_cma, NUMA_NO_NODE);
+}
extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
unsigned int order_per_bit,
const char *name,
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 5e88e7e33abe..034b0a644efc 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -347,7 +347,7 @@ static inline void *offset_to_ptr(const int *off)
* compiler has support to do so.
*/
#define compiletime_assert(condition, msg) \
- _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
+ _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
#define compiletime_assert_atomic_type(t) \
compiletime_assert(__native_word(t), \
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 72393a8c1a6c..e970f97a7fcb 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -129,22 +129,13 @@ struct ftrace_likely_data {
#define __compiler_offsetof(a, b) __builtin_offsetof(a, b)
/*
- * Force always-inline if the user requests it so via the .config.
* Prefer gnu_inline, so that extern inline functions do not emit an
* externally visible function. This makes extern inline behave as per gnu89
* semantics rather than c99. This prevents multiple symbol definition errors
* of extern inline functions at link time.
* A lot of inline functions can cause havoc with function tracing.
- * Do not use __always_inline here, since currently it expands to inline again
- * (which would break users of __always_inline).
*/
-#if !defined(CONFIG_OPTIMIZE_INLINING)
-#define inline inline __attribute__((__always_inline__)) __gnu_inline \
- __inline_maybe_unused notrace
-#else
-#define inline inline __gnu_inline \
- __inline_maybe_unused notrace
-#endif
+#define inline inline __gnu_inline __inline_maybe_unused notrace
/*
* gcc provides both __inline__ and __inline as alternate spellings of
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 328c2dbb4409..d7af5d243f24 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -13,6 +13,7 @@
typedef unsigned long dax_entry_t;
struct iomap_ops;
+struct iomap;
struct dax_device;
struct dax_operations {
/*
@@ -34,6 +35,8 @@ struct dax_operations {
/* copy_to_iter: required operation for fs-dax direct-i/o */
size_t (*copy_to_iter)(struct dax_device *, pgoff_t, void *, size_t,
struct iov_iter *);
+ /* zero_page_range: required operation. Zero page range */
+ int (*zero_page_range)(struct dax_device *, pgoff_t, size_t);
};
extern struct attribute_group dax_attribute_group;
@@ -199,6 +202,8 @@ size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
size_t bytes, struct iov_iter *i);
size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
size_t bytes, struct iov_iter *i);
+int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
+ size_t nr_pages);
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size);
ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
@@ -210,20 +215,8 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
pgoff_t index);
-
-#ifdef CONFIG_FS_DAX
-int __dax_zero_page_range(struct block_device *bdev,
- struct dax_device *dax_dev, sector_t sector,
- unsigned int offset, unsigned int length);
-#else
-static inline int __dax_zero_page_range(struct block_device *bdev,
- struct dax_device *dax_dev, sector_t sector,
- unsigned int offset, unsigned int length)
-{
- return -ENXIO;
-}
-#endif
-
+int dax_iomap_zero(loff_t pos, unsigned offset, unsigned size,
+ struct iomap *iomap);
static inline bool dax_mapping(struct address_space *mapping)
{
return mapping->host && IS_DAX(mapping->host);
diff --git a/include/linux/devfreq_cooling.h b/include/linux/devfreq_cooling.h
index 4635f95000a4..79a6e37a1d6f 100644
--- a/include/linux/devfreq_cooling.h
+++ b/include/linux/devfreq_cooling.h
@@ -75,7 +75,7 @@ void devfreq_cooling_unregister(struct thermal_cooling_device *dfc);
#else /* !CONFIG_DEVFREQ_THERMAL */
-struct thermal_cooling_device *
+static inline struct thermal_cooling_device *
of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df,
struct devfreq_cooling_power *dfc_power)
{
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 475668c69dbc..af48d9da3916 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -141,6 +141,8 @@ typedef long (*dm_dax_direct_access_fn) (struct dm_target *ti, pgoff_t pgoff,
long nr_pages, void **kaddr, pfn_t *pfn);
typedef size_t (*dm_dax_copy_iter_fn)(struct dm_target *ti, pgoff_t pgoff,
void *addr, size_t bytes, struct iov_iter *i);
+typedef int (*dm_dax_zero_page_range_fn)(struct dm_target *ti, pgoff_t pgoff,
+ size_t nr_pages);
#define PAGE_SECTORS (PAGE_SIZE / 512)
void dm_error(const char *message);
@@ -195,6 +197,7 @@ struct target_type {
dm_dax_direct_access_fn direct_access;
dm_dax_copy_iter_fn dax_copy_from_iter;
dm_dax_copy_iter_fn dax_copy_to_iter;
+ dm_dax_zero_page_range_fn dax_zero_page_range;
/* For internal device-mapper use. */
struct list_head list;
diff --git a/include/linux/device.h b/include/linux/device.h
index 1311f276f533..ac8e37cd716a 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -42,9 +42,8 @@ struct device_node;
struct fwnode_handle;
struct iommu_ops;
struct iommu_group;
-struct iommu_fwspec;
struct dev_pin_info;
-struct iommu_param;
+struct dev_iommu;
/**
* struct subsys_interface - interfaces to device functions
@@ -513,8 +512,7 @@ struct dev_links_info {
* gone away. This should be set by the allocator of the
* device (i.e. the bus driver that discovered the device).
* @iommu_group: IOMMU group the device belongs to.
- * @iommu_fwspec: IOMMU-specific properties supplied by firmware.
- * @iommu_param: Per device generic IOMMU runtime data
+ * @iommu: Per device generic IOMMU runtime data
*
* @offline_disabled: If set, the device is permanently online.
* @offline: Set after successful invocation of bus type's .offline().
@@ -613,8 +611,7 @@ struct device {
void (*release)(struct device *dev);
struct iommu_group *iommu_group;
- struct iommu_fwspec *iommu_fwspec;
- struct iommu_param *iommu_param;
+ struct dev_iommu *iommu;
bool offline_disabled:1;
bool offline:1;
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index ac3f4888b3df..3c383ddd92dd 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -125,6 +125,7 @@ struct f2fs_super_block {
/*
* For checkpoint
*/
+#define CP_RESIZEFS_FLAG 0x00004000
#define CP_DISABLED_QUICK_FLAG 0x00002000
#define CP_DISABLED_FLAG 0x00001000
#define CP_QUOTA_NEED_FSCK_FLAG 0x00000800
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index be2754841369..4aba4c86c626 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -124,6 +124,8 @@ struct vm_area_struct;
*
* Reclaim modifiers
* ~~~~~~~~~~~~~~~~~
+ * Please note that all the following flags are only applicable to sleepable
+ * allocations (e.g. %GFP_NOWAIT and %GFP_ATOMIC will ignore them).
*
* %__GFP_IO can start physical IO.
*
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index f2df2247026a..cfbb0a87c5f0 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -46,7 +46,7 @@ extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
pmd_t *old_pmd, pmd_t *new_pmd);
extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, pgprot_t newprot,
- int prot_numa);
+ unsigned long cp_flags);
vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn,
pgprot_t pgprot, bool write);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 5ea05879a0a9..43a1cef8f0f1 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -895,4 +895,16 @@ static inline spinlock_t *huge_pte_lock(struct hstate *h,
return ptl;
}
+#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
+extern void __init hugetlb_cma_reserve(int order);
+extern void __init hugetlb_cma_check(void);
+#else
+static inline __init void hugetlb_cma_reserve(int order)
+{
+}
+static inline __init void hugetlb_cma_check(void)
+{
+}
+#endif
+
#endif /* _LINUX_HUGETLB_H */
diff --git a/include/linux/iio/common/cros_ec_sensors_core.h b/include/linux/iio/common/cros_ec_sensors_core.h
index bb331e6356a9..7bc961defa87 100644
--- a/include/linux/iio/common/cros_ec_sensors_core.h
+++ b/include/linux/iio/common/cros_ec_sensors_core.h
@@ -12,6 +12,7 @@
#include <linux/irqreturn.h>
#include <linux/platform_data/cros_ec_commands.h>
#include <linux/platform_data/cros_ec_proto.h>
+#include <linux/platform_data/cros_ec_sensorhub.h>
enum {
CROS_EC_SENSOR_X,
@@ -29,8 +30,7 @@ enum {
*/
#define CROS_EC_SAMPLE_SIZE (sizeof(s64) * 2)
-/* Minimum sampling period to use when device is suspending */
-#define CROS_EC_MIN_SUSPEND_SAMPLING_FREQUENCY 1000 /* 1 second */
+typedef irqreturn_t (*cros_ec_sensors_capture_t)(int irq, void *p);
/**
* struct cros_ec_sensors_core_state - state data for EC sensors IIO driver
@@ -50,7 +50,9 @@ enum {
* the timestamp. The timestamp is always last and
* is always 8-byte aligned.
* @read_ec_sensors_data: function used for accessing sensors values
- * @cuur_sampl_freq: current sampling period
+ * @fifo_max_event_count: Size of the EC sensor FIFO
+ * @frequencies: Table of known available frequencies:
+ * 0, Min and Max in mHz
*/
struct cros_ec_sensors_core_state {
struct cros_ec_device *ec;
@@ -73,101 +75,34 @@ struct cros_ec_sensors_core_state {
int (*read_ec_sensors_data)(struct iio_dev *indio_dev,
unsigned long scan_mask, s16 *data);
- int curr_sampl_freq;
-
- /* Table of known available frequencies : 0, Min and Max in mHz */
- int frequencies[3];
+ u32 fifo_max_event_count;
+ int frequencies[6];
};
-/**
- * cros_ec_sensors_read_lpc() - retrieve data from EC shared memory
- * @indio_dev: pointer to IIO device
- * @scan_mask: bitmap of the sensor indices to scan
- * @data: location to store data
- *
- * This is the safe function for reading the EC data. It guarantees that the
- * data sampled was not modified by the EC while being read.
- *
- * Return: 0 on success, -errno on failure.
- */
int cros_ec_sensors_read_lpc(struct iio_dev *indio_dev, unsigned long scan_mask,
s16 *data);
-/**
- * cros_ec_sensors_read_cmd() - retrieve data using the EC command protocol
- * @indio_dev: pointer to IIO device
- * @scan_mask: bitmap of the sensor indices to scan
- * @data: location to store data
- *
- * Return: 0 on success, -errno on failure.
- */
int cros_ec_sensors_read_cmd(struct iio_dev *indio_dev, unsigned long scan_mask,
s16 *data);
struct platform_device;
-/**
- * cros_ec_sensors_core_init() - basic initialization of the core structure
- * @pdev: platform device created for the sensors
- * @indio_dev: iio device structure of the device
- * @physical_device: true if the device refers to a physical device
- *
- * Return: 0 on success, -errno on failure.
- */
int cros_ec_sensors_core_init(struct platform_device *pdev,
- struct iio_dev *indio_dev, bool physical_device);
+ struct iio_dev *indio_dev, bool physical_device,
+ cros_ec_sensors_capture_t trigger_capture,
+ cros_ec_sensorhub_push_data_cb_t push_data);
-/**
- * cros_ec_sensors_capture() - the trigger handler function
- * @irq: the interrupt number.
- * @p: a pointer to the poll function.
- *
- * On a trigger event occurring, if the pollfunc is attached then this
- * handler is called as a threaded interrupt (and hence may sleep). It
- * is responsible for grabbing data from the device and pushing it into
- * the associated buffer.
- *
- * Return: IRQ_HANDLED
- */
irqreturn_t cros_ec_sensors_capture(int irq, void *p);
+int cros_ec_sensors_push_data(struct iio_dev *indio_dev,
+ s16 *data,
+ s64 timestamp);
-/**
- * cros_ec_motion_send_host_cmd() - send motion sense host command
- * @st: pointer to state information for device
- * @opt_length: optional length to reduce the response size, useful on the data
- * path. Otherwise, the maximal allowed response size is used
- *
- * When called, the sub-command is assumed to be set in param->cmd.
- *
- * Return: 0 on success, -errno on failure.
- */
int cros_ec_motion_send_host_cmd(struct cros_ec_sensors_core_state *st,
u16 opt_length);
-/**
- * cros_ec_sensors_core_read() - function to request a value from the sensor
- * @st: pointer to state information for device
- * @chan: channel specification structure table
- * @val: will contain one element making up the returned value
- * @val2: will contain another element making up the returned value
- * @mask: specifies which values to be requested
- *
- * Return: the type of value returned by the device
- */
int cros_ec_sensors_core_read(struct cros_ec_sensors_core_state *st,
struct iio_chan_spec const *chan,
int *val, int *val2, long mask);
-/**
- * cros_ec_sensors_core_read_avail() - get available values
- * @indio_dev: pointer to state information for device
- * @chan: channel specification structure table
- * @vals: list of available values
- * @type: type of data returned
- * @length: number of data returned in the array
- * @mask: specifies which values to be requested
- *
- * Return: an error code, IIO_AVAIL_RANGE or IIO_AVAIL_LIST
- */
int cros_ec_sensors_core_read_avail(struct iio_dev *indio_dev,
struct iio_chan_spec const *chan,
const int **vals,
@@ -175,23 +110,12 @@ int cros_ec_sensors_core_read_avail(struct iio_dev *indio_dev,
int *length,
long mask);
-/**
- * cros_ec_sensors_core_write() - function to write a value to the sensor
- * @st: pointer to state information for device
- * @chan: channel specification structure table
- * @val: first part of value to write
- * @val2: second part of value to write
- * @mask: specifies which values to write
- *
- * Return: the type of value returned by the device
- */
int cros_ec_sensors_core_write(struct cros_ec_sensors_core_state *st,
struct iio_chan_spec const *chan,
int val, int val2, long mask);
-extern const struct dev_pm_ops cros_ec_sensors_pm_ops;
-
/* List of extended channel specification for all sensors */
extern const struct iio_chan_spec_ext_info cros_ec_sensors_ext_info[];
+extern const struct attribute *cros_ec_sensor_fifo_attributes[];
#endif /* __CROS_EC_SENSORS_CORE_H */
diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index eed58ed2f368..17f56a070b20 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -629,6 +629,8 @@ static inline clockid_t iio_device_get_clock(const struct iio_dev *indio_dev)
return indio_dev->clock_id;
}
+int iio_device_set_clock(struct iio_dev *indio_dev, clockid_t clock_id);
+
/**
* dev_to_iio_dev() - Get IIO device struct from a device struct
* @dev: The device embedded in the IIO device
diff --git a/include/linux/io.h b/include/linux/io.h
index b1c44bb4b2d7..8394c56babc2 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -77,8 +77,6 @@ void *devm_memremap(struct device *dev, resource_size_t offset,
size_t size, unsigned long flags);
void devm_memunmap(struct device *dev, void *addr);
-void *__devm_memremap_pages(struct device *dev, struct resource *res);
-
#ifdef CONFIG_PCI
/*
* The PCI specifications (Rev 3.0, 3.2.5 "Transaction Ordering and
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index d1b5f4d98569..7ef8b0bda695 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -365,17 +365,20 @@ struct iommu_fault_param {
};
/**
- * struct iommu_param - collection of per-device IOMMU data
+ * struct dev_iommu - Collection of per-device IOMMU data
*
* @fault_param: IOMMU detected device fault reporting data
+ * @fwspec: IOMMU fwspec data
+ * @priv: IOMMU Driver private data
*
* TODO: migrate other per device data pointers under iommu_dev_data, e.g.
* struct iommu_group *iommu_group;
- * struct iommu_fwspec *iommu_fwspec;
*/
-struct iommu_param {
+struct dev_iommu {
struct mutex lock;
- struct iommu_fault_param *fault_param;
+ struct iommu_fault_param *fault_param;
+ struct iommu_fwspec *fwspec;
+ void *priv;
};
int iommu_device_register(struct iommu_device *iommu);
@@ -588,11 +591,10 @@ struct iommu_group *fsl_mc_device_group(struct device *dev);
struct iommu_fwspec {
const struct iommu_ops *ops;
struct fwnode_handle *iommu_fwnode;
- void *iommu_priv;
u32 flags;
u32 num_pasid_bits;
unsigned int num_ids;
- u32 ids[1];
+ u32 ids[];
};
/* ATS is supported */
@@ -614,13 +616,26 @@ const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode);
static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev)
{
- return dev->iommu_fwspec;
+ if (dev->iommu)
+ return dev->iommu->fwspec;
+ else
+ return NULL;
}
static inline void dev_iommu_fwspec_set(struct device *dev,
struct iommu_fwspec *fwspec)
{
- dev->iommu_fwspec = fwspec;
+ dev->iommu->fwspec = fwspec;
+}
+
+static inline void *dev_iommu_priv_get(struct device *dev)
+{
+ return dev->iommu->priv;
+}
+
+static inline void dev_iommu_priv_set(struct device *dev, void *priv)
+{
+ dev->iommu->priv = priv;
}
int iommu_probe_device(struct device *dev);
@@ -1073,6 +1088,10 @@ static inline int iommu_sva_unbind_gpasid(struct iommu_domain *domain,
return -ENODEV;
}
+static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev)
+{
+ return NULL;
+}
#endif /* CONFIG_IOMMU_API */
#ifdef CONFIG_IOMMU_DEBUGFS
diff --git a/include/linux/leds.h b/include/linux/leds.h
index 75353e5f9d13..2451962d1ec5 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -25,6 +25,7 @@ struct device_node;
* LED Core
*/
+/* This is obsolete/useless. We now support variable maximum brightness. */
enum led_brightness {
LED_OFF = 0,
LED_ON = 1,
diff --git a/include/linux/leds_pwm.h b/include/linux/leds_pwm.h
deleted file mode 100644
index 93d101d28943..000000000000
--- a/include/linux/leds_pwm.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PWM LED driver data - see drivers/leds/leds-pwm.c
- */
-#ifndef __LINUX_LEDS_PWM_H
-#define __LINUX_LEDS_PWM_H
-
-struct led_pwm {
- const char *name;
- const char *default_trigger;
- unsigned pwm_id __deprecated;
- u8 active_low;
- unsigned max_brightness;
- unsigned pwm_period_ns;
-};
-
-struct led_pwm_platform_data {
- int num_leds;
- struct led_pwm *leds;
-};
-
-#endif
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 9df091bd30ba..18da4059be09 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -37,6 +37,8 @@ enum {
NDD_WORK_PENDING = 4,
/* ignore / filter NSLABEL_FLAG_LOCAL for this DIMM, i.e. no aliasing */
NDD_NOBLK = 5,
+ /* dimm supports namespace labels */
+ NDD_LABELING = 6,
/* need to set a limit somewhere, but yes, this is likely overkill */
ND_IOCTL_MAX_BUFLEN = SZ_4M,
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 079d17d96410..6bc37a731d27 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -348,6 +348,9 @@ static inline int memblock_get_region_node(const struct memblock_region *r)
phys_addr_t memblock_phys_alloc_range(phys_addr_t size, phys_addr_t align,
phys_addr_t start, phys_addr_t end);
+phys_addr_t memblock_alloc_range_nid(phys_addr_t size,
+ phys_addr_t align, phys_addr_t start,
+ phys_addr_t end, int nid, bool exact_nid);
phys_addr_t memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid);
static inline phys_addr_t memblock_phys_alloc(phys_addr_t size,
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 0b8d791b6669..439a89e758d8 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -26,7 +26,6 @@
struct memory_block {
unsigned long start_section_nr;
unsigned long state; /* serialized by the dev->lock */
- int section_count; /* serialized by mem_sysfs_mutex */
int online_type; /* for passing data to online routine */
int phys_device; /* to which fru does this belong? */
struct device dev;
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index f4d59155f3d4..93d9ada74ddd 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -47,20 +47,25 @@ enum {
/* Types for control the zone type of onlined and offlined memory */
enum {
- MMOP_OFFLINE = -1,
- MMOP_ONLINE_KEEP,
+ /* Offline the memory. */
+ MMOP_OFFLINE = 0,
+ /* Online the memory. Zone depends, see default_zone_for_pfn(). */
+ MMOP_ONLINE,
+ /* Online the memory to ZONE_NORMAL. */
MMOP_ONLINE_KERNEL,
+ /* Online the memory to ZONE_MOVABLE. */
MMOP_ONLINE_MOVABLE,
};
/*
- * Restrictions for the memory hotplug:
- * flags: MHP_ flags
- * altmap: alternative allocator for memmap array
+ * Extended parameters for memory hotplug:
+ * altmap: alternative allocator for memmap array (optional)
+ * pgprot: page protection flags to apply to newly created page tables
+ * (required)
*/
-struct mhp_restrictions {
- unsigned long flags;
+struct mhp_params {
struct vmem_altmap *altmap;
+ pgprot_t pgprot;
};
/*
@@ -110,10 +115,13 @@ extern int restore_online_page_callback(online_page_callback_t callback);
extern int try_online_node(int nid);
extern int arch_add_memory(int nid, u64 start, u64 size,
- struct mhp_restrictions *restrictions);
+ struct mhp_params *params);
extern u64 max_mem_size;
-extern bool memhp_auto_online;
+extern int memhp_online_type_from_str(const char *str);
+
+/* Default online_type (MMOP_*) when new memory blocks are added. */
+extern int memhp_default_online_type;
/* If movable_node boot option specified */
extern bool movable_node_enabled;
static inline bool movable_node_is_enabled(void)
@@ -128,17 +136,17 @@ extern void __remove_pages(unsigned long start_pfn, unsigned long nr_pages,
/* reasonably generic interface to expand the physical pages */
extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
- struct mhp_restrictions *restrictions);
+ struct mhp_params *params);
#ifndef CONFIG_ARCH_HAS_ADD_PAGES
static inline int add_pages(int nid, unsigned long start_pfn,
- unsigned long nr_pages, struct mhp_restrictions *restrictions)
+ unsigned long nr_pages, struct mhp_params *params)
{
- return __add_pages(nid, start_pfn, nr_pages, restrictions);
+ return __add_pages(nid, start_pfn, nr_pages, params);
}
#else /* ARCH_HAS_ADD_PAGES */
int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
- struct mhp_restrictions *restrictions);
+ struct mhp_params *params);
#endif /* ARCH_HAS_ADD_PAGES */
#ifdef CONFIG_NUMA
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 60d97e8fd3c0..5f5b2df06e61 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -98,8 +98,6 @@ struct dev_pagemap_ops {
* @ref: reference count that pins the devm_memremap_pages() mapping
* @internal_ref: internal reference if @ref is not provided by the caller
* @done: completion for @internal_ref
- * @dev: host device of the mapping for debug
- * @data: private data pointer for page_free()
* @type: memory type: see MEMORY_* in memory_hotplug.h
* @flags: PGMAP_* flags to specify defailed behavior
* @ops: method table
@@ -136,6 +134,7 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns);
+unsigned long memremap_compat_align(void);
#else
static inline void *devm_memremap_pages(struct device *dev,
struct dev_pagemap *pgmap)
@@ -169,6 +168,12 @@ static inline void vmem_altmap_free(struct vmem_altmap *altmap,
unsigned long nr_pfns)
{
}
+
+/* when memremap_pages() is disabled all archs can remap a single page */
+static inline unsigned long memremap_compat_align(void)
+{
+ return PAGE_SIZE;
+}
#endif /* CONFIG_ZONE_DEVICE */
static inline void put_dev_pagemap(struct dev_pagemap *pgmap)
@@ -176,4 +181,5 @@ static inline void put_dev_pagemap(struct dev_pagemap *pgmap)
if (pgmap)
percpu_ref_put(pgmap->ref);
}
+
#endif /* _LINUX_MEMREMAP_H_ */
diff --git a/include/linux/mfd/iqs62x.h b/include/linux/mfd/iqs62x.h
new file mode 100644
index 000000000000..043d3b6de9ec
--- /dev/null
+++ b/include/linux/mfd/iqs62x.h
@@ -0,0 +1,139 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Azoteq IQS620A/621/622/624/625 Multi-Function Sensors
+ *
+ * Copyright (C) 2019 Jeff LaBundy <jeff@labundy.com>
+ */
+
+#ifndef __LINUX_MFD_IQS62X_H
+#define __LINUX_MFD_IQS62X_H
+
+#define IQS620_PROD_NUM 0x41
+#define IQS621_PROD_NUM 0x46
+#define IQS622_PROD_NUM 0x42
+#define IQS624_PROD_NUM 0x43
+#define IQS625_PROD_NUM 0x4E
+
+#define IQS621_ALS_FLAGS 0x16
+#define IQS622_ALS_FLAGS 0x14
+
+#define IQS624_HALL_UI 0x70
+#define IQS624_HALL_UI_WHL_EVENT BIT(4)
+#define IQS624_HALL_UI_INT_EVENT BIT(3)
+#define IQS624_HALL_UI_AUTO_CAL BIT(2)
+
+#define IQS624_INTERVAL_DIV 0x7D
+
+#define IQS620_GLBL_EVENT_MASK 0xD7
+#define IQS620_GLBL_EVENT_MASK_PMU BIT(6)
+
+#define IQS62X_NUM_KEYS 16
+#define IQS62X_NUM_EVENTS (IQS62X_NUM_KEYS + 5)
+
+#define IQS62X_EVENT_SIZE 10
+
+enum iqs62x_ui_sel {
+ IQS62X_UI_PROX,
+ IQS62X_UI_SAR1,
+};
+
+enum iqs62x_event_reg {
+ IQS62X_EVENT_NONE,
+ IQS62X_EVENT_SYS,
+ IQS62X_EVENT_PROX,
+ IQS62X_EVENT_HYST,
+ IQS62X_EVENT_HALL,
+ IQS62X_EVENT_ALS,
+ IQS62X_EVENT_IR,
+ IQS62X_EVENT_WHEEL,
+ IQS62X_EVENT_INTER,
+ IQS62X_EVENT_UI_LO,
+ IQS62X_EVENT_UI_HI,
+};
+
+enum iqs62x_event_flag {
+ /* keys */
+ IQS62X_EVENT_PROX_CH0_T,
+ IQS62X_EVENT_PROX_CH0_P,
+ IQS62X_EVENT_PROX_CH1_T,
+ IQS62X_EVENT_PROX_CH1_P,
+ IQS62X_EVENT_PROX_CH2_T,
+ IQS62X_EVENT_PROX_CH2_P,
+ IQS62X_EVENT_HYST_POS_T,
+ IQS62X_EVENT_HYST_POS_P,
+ IQS62X_EVENT_HYST_NEG_T,
+ IQS62X_EVENT_HYST_NEG_P,
+ IQS62X_EVENT_SAR1_ACT,
+ IQS62X_EVENT_SAR1_QRD,
+ IQS62X_EVENT_SAR1_MOVE,
+ IQS62X_EVENT_SAR1_HALT,
+ IQS62X_EVENT_WHEEL_UP,
+ IQS62X_EVENT_WHEEL_DN,
+
+ /* switches */
+ IQS62X_EVENT_HALL_N_T,
+ IQS62X_EVENT_HALL_N_P,
+ IQS62X_EVENT_HALL_S_T,
+ IQS62X_EVENT_HALL_S_P,
+
+ /* everything else */
+ IQS62X_EVENT_SYS_RESET,
+};
+
+struct iqs62x_event_data {
+ u16 ui_data;
+ u8 als_flags;
+ u8 ir_flags;
+ u8 interval;
+};
+
+struct iqs62x_event_desc {
+ enum iqs62x_event_reg reg;
+ u8 mask;
+ u8 val;
+};
+
+struct iqs62x_dev_desc {
+ const char *dev_name;
+ const struct mfd_cell *sub_devs;
+ int num_sub_devs;
+
+ u8 prod_num;
+ u8 sw_num;
+ const u8 *cal_regs;
+ int num_cal_regs;
+
+ u8 prox_mask;
+ u8 sar_mask;
+ u8 hall_mask;
+ u8 hyst_mask;
+ u8 temp_mask;
+ u8 als_mask;
+ u8 ir_mask;
+
+ u8 prox_settings;
+ u8 als_flags;
+ u8 hall_flags;
+ u8 hyst_shift;
+
+ u8 interval;
+ u8 interval_div;
+
+ u8 clk_div;
+ const char *fw_name;
+ const enum iqs62x_event_reg (*event_regs)[IQS62X_EVENT_SIZE];
+};
+
+struct iqs62x_core {
+ const struct iqs62x_dev_desc *dev_desc;
+ struct i2c_client *client;
+ struct regmap *regmap;
+ struct blocking_notifier_head nh;
+ struct list_head fw_blk_head;
+ struct completion fw_done;
+ enum iqs62x_ui_sel ui_sel;
+};
+
+extern const struct iqs62x_event_desc iqs62x_events[IQS62X_NUM_EVENTS];
+
+#endif /* __LINUX_MFD_IQS62X_H */
diff --git a/include/linux/mfd/rk808.h b/include/linux/mfd/rk808.h
index a59bf323f713..e07f6e61cd38 100644
--- a/include/linux/mfd/rk808.h
+++ b/include/linux/mfd/rk808.h
@@ -620,7 +620,5 @@ struct rk808 {
long variant;
const struct regmap_config *regmap_cfg;
const struct regmap_irq_chip *regmap_irq_chip;
- void (*pm_pwroff_fn)(void);
- void (*pm_pwroff_prep_fn)(void);
};
#endif /* __LINUX_REGULATOR_RK808_H */
diff --git a/include/linux/mfd/rn5t618.h b/include/linux/mfd/rn5t618.h
index d62ef48060b5..fba0df13d9a8 100644
--- a/include/linux/mfd/rn5t618.h
+++ b/include/linux/mfd/rn5t618.h
@@ -139,6 +139,17 @@
#define RN5T618_INTPOL 0x9c
#define RN5T618_INTEN 0x9d
#define RN5T618_INTMON 0x9e
+
+#define RN5T618_RTC_SECONDS 0xA0
+#define RN5T618_RTC_MDAY 0xA4
+#define RN5T618_RTC_MONTH 0xA5
+#define RN5T618_RTC_YEAR 0xA6
+#define RN5T618_RTC_ADJUST 0xA7
+#define RN5T618_RTC_ALARM_Y_SEC 0xA8
+#define RN5T618_RTC_DAL_MONTH 0xAC
+#define RN5T618_RTC_CTRL1 0xAE
+#define RN5T618_RTC_CTRL2 0xAF
+
#define RN5T618_PREVINDAC 0xb0
#define RN5T618_BATDAC 0xb1
#define RN5T618_CHGCTL1 0xb3
@@ -242,9 +253,24 @@ enum {
RC5T619,
};
+/* RN5T618 IRQ definitions */
+enum {
+ RN5T618_IRQ_SYS = 0,
+ RN5T618_IRQ_DCDC,
+ RN5T618_IRQ_RTC,
+ RN5T618_IRQ_ADC,
+ RN5T618_IRQ_GPIO,
+ RN5T618_IRQ_CHG,
+ RN5T618_NR_IRQS,
+};
+
struct rn5t618 {
struct regmap *regmap;
+ struct device *dev;
long variant;
+
+ int irq;
+ struct regmap_irq_chip_data *irq_data;
};
#endif /* __LINUX_MFD_RN5T618_H */
diff --git a/include/linux/mfd/sc27xx-pmic.h b/include/linux/mfd/sc27xx-pmic.h
new file mode 100644
index 000000000000..57e45c0b3ae2
--- /dev/null
+++ b/include/linux/mfd/sc27xx-pmic.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_MFD_SC27XX_PMIC_H
+#define __LINUX_MFD_SC27XX_PMIC_H
+
+extern enum usb_charger_type sprd_pmic_detect_charger_type(struct device *dev);
+
+#endif /* __LINUX_MFD_SC27XX_PMIC_H */
diff --git a/include/linux/mfd/wm831x/pdata.h b/include/linux/mfd/wm831x/pdata.h
index 986986fe4e4e..75aa94dadf1c 100644
--- a/include/linux/mfd/wm831x/pdata.h
+++ b/include/linux/mfd/wm831x/pdata.h
@@ -89,7 +89,6 @@ enum wm831x_watchdog_action {
struct wm831x_watchdog_pdata {
enum wm831x_watchdog_action primary, secondary;
- int update_gpio;
unsigned int software:1;
};
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7dd5c4ccbf85..5a323422d783 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -343,6 +343,20 @@ extern unsigned int kobjsize(const void *objp);
/* Bits set in the VMA until the stack is in its final location */
#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ)
+#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0)
+
+/* Common data flag combinations */
+#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \
+ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+#define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \
+ VM_MAYWRITE | VM_MAYEXEC)
+#define VM_DATA_FLAGS_EXEC (VM_READ | VM_WRITE | VM_EXEC | \
+ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+
+#ifndef VM_DATA_DEFAULT_FLAGS /* arch can override this */
+#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_EXEC
+#endif
+
#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */
#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
#endif
@@ -355,6 +369,10 @@ extern unsigned int kobjsize(const void *objp);
#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
+/* VMA basic access permission flags */
+#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC)
+
+
/*
* Special vmas that are non-mergable, non-mlock()able.
*/
@@ -629,6 +647,12 @@ static inline bool vma_is_foreign(struct vm_area_struct *vma)
return false;
}
+
+static inline bool vma_is_accessible(struct vm_area_struct *vma)
+{
+ return vma->vm_flags & VM_ACCESS_FLAGS;
+}
+
#ifdef CONFIG_SHMEM
/*
* The vma_is_shmem is not inline because it is used only by slow
@@ -1765,9 +1789,26 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma,
unsigned long old_addr, struct vm_area_struct *new_vma,
unsigned long new_addr, unsigned long len,
bool need_rmap_locks);
+
+/*
+ * Flags used by change_protection(). For now we make it a bitmap so
+ * that we can pass in multiple flags just like parameters. However
+ * for now all the callers are only use one of the flags at the same
+ * time.
+ */
+/* Whether we should allow dirty bit accounting */
+#define MM_CP_DIRTY_ACCT (1UL << 0)
+/* Whether this protection change is for NUMA hints */
+#define MM_CP_PROT_NUMA (1UL << 1)
+/* Whether this change is for write protecting */
+#define MM_CP_UFFD_WP (1UL << 2) /* do wp */
+#define MM_CP_UFFD_WP_RESOLVE (1UL << 3) /* Resolve wp */
+#define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \
+ MM_CP_UFFD_WP_RESOLVE)
+
extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgprot_t newprot,
- int dirty_accountable, int prot_numa);
+ unsigned long cp_flags);
extern int mprotect_fixup(struct vm_area_struct *vma,
struct vm_area_struct **pprev, unsigned long start,
unsigned long end, unsigned long newflags);
@@ -1886,6 +1927,18 @@ static inline void sync_mm_rss(struct mm_struct *mm)
}
#endif
+#ifndef CONFIG_ARCH_HAS_PTE_SPECIAL
+static inline int pte_special(pte_t pte)
+{
+ return 0;
+}
+
+static inline pte_t pte_mkspecial(pte_t pte)
+{
+ return pte;
+}
+#endif
+
#ifndef CONFIG_ARCH_HAS_PTE_DEVMAP
static inline int pte_devmap(pte_t pte)
{
@@ -2666,6 +2719,8 @@ struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t);
int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
+int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
+ struct page **pages, unsigned long *num);
int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
unsigned long num);
int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 6f2fef7b0784..219bef41d87c 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -6,19 +6,20 @@
#include <linux/swap.h>
/**
- * page_is_file_cache - should the page be on a file LRU or anon LRU?
+ * page_is_file_lru - should the page be on a file LRU or anon LRU?
* @page: the page to test
*
- * Returns 1 if @page is page cache page backed by a regular filesystem,
- * or 0 if @page is anonymous, tmpfs or otherwise ram or swap backed.
- * Used by functions that manipulate the LRU lists, to sort a page
- * onto the right LRU list.
+ * Returns 1 if @page is a regular filesystem backed page cache page or a lazily
+ * freed anonymous page (e.g. via MADV_FREE). Returns 0 if @page is a normal
+ * anonymous page, a tmpfs page or otherwise ram or swap backed page. Used by
+ * functions that manipulate the LRU lists, to sort a page onto the right LRU
+ * list.
*
* We would like to get this info without a page flag, but the state
* needs to survive until the page is last deleted from the LRU, which
* could be as far down as __page_cache_release.
*/
-static inline int page_is_file_cache(struct page *page)
+static inline int page_is_file_lru(struct page *page)
{
return !PageSwapBacked(page);
}
@@ -75,7 +76,7 @@ static __always_inline void del_page_from_lru_list(struct page *page,
*/
static inline enum lru_list page_lru_base_type(struct page *page)
{
- if (page_is_file_cache(page))
+ if (page_is_file_lru(page))
return LRU_INACTIVE_FILE;
return LRU_INACTIVE_ANON;
}
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index dd555e6d23f3..4aba6c0c2ba8 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -289,8 +289,8 @@ struct vm_userfaultfd_ctx {};
#endif /* CONFIG_USERFAULTFD */
/*
- * This struct defines a memory VMM memory area. There is one of these
- * per VM-area/task. A VM area is any part of the process virtual memory
+ * This struct describes a virtual memory area. There is one of these
+ * per VM-area/task. A VM area is any part of the process virtual memory
* space that has a special rule for the page-fault handlers (ie a shared
* library, the executable area etc).
*/
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e84d448988b6..1b9de7d220fb 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -100,41 +100,6 @@ struct free_area {
unsigned long nr_free;
};
-/* Used for pages not on another list */
-static inline void add_to_free_area(struct page *page, struct free_area *area,
- int migratetype)
-{
- list_add(&page->lru, &area->free_list[migratetype]);
- area->nr_free++;
-}
-
-/* Used for pages not on another list */
-static inline void add_to_free_area_tail(struct page *page, struct free_area *area,
- int migratetype)
-{
- list_add_tail(&page->lru, &area->free_list[migratetype]);
- area->nr_free++;
-}
-
-#ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR
-/* Used to preserve page allocation order entropy */
-void add_to_free_area_random(struct page *page, struct free_area *area,
- int migratetype);
-#else
-static inline void add_to_free_area_random(struct page *page,
- struct free_area *area, int migratetype)
-{
- add_to_free_area(page, area, migratetype);
-}
-#endif
-
-/* Used for pages which are on another list */
-static inline void move_to_free_area(struct page *page, struct free_area *area,
- int migratetype)
-{
- list_move(&page->lru, &area->free_list[migratetype]);
-}
-
static inline struct page *get_page_from_free_area(struct free_area *area,
int migratetype)
{
@@ -142,15 +107,6 @@ static inline struct page *get_page_from_free_area(struct free_area *area,
struct page, lru);
}
-static inline void del_page_from_free_area(struct page *page,
- struct free_area *area)
-{
- list_del(&page->lru);
- __ClearPageBuddy(page);
- set_page_private(page, 0);
- area->nr_free--;
-}
-
static inline bool free_area_empty(struct free_area *area, int migratetype)
{
return list_empty(&area->free_list[migratetype]);
@@ -708,7 +664,6 @@ struct deferred_split {
* Memory statistics and page replacement data structures are maintained on a
* per-zone basis.
*/
-struct bootmem_data;
typedef struct pglist_data {
struct zone node_zones[MAX_NR_ZONES];
struct zonelist node_zonelists[MAX_ZONELISTS];
@@ -1172,6 +1127,7 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec)
#define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK)
#define SUBSECTION_SHIFT 21
+#define SUBSECTION_SIZE (1UL << SUBSECTION_SHIFT)
#define PFN_SUBSECTION_SHIFT (SUBSECTION_SHIFT - PAGE_SHIFT)
#define PAGES_PER_SUBSECTION (1UL << PFN_SUBSECTION_SHIFT)
@@ -1187,7 +1143,9 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec)
#define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK)
struct mem_section_usage {
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
+#endif
/* See declaration of similar field in struct zone */
unsigned long pageblock_flags[0];
};
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 5d5b91e54f73..73eda45f1cfd 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -354,6 +354,7 @@ static inline unsigned long nfs_save_change_attribute(struct inode *dir)
extern int nfs_sync_mapping(struct address_space *mapping);
extern void nfs_zap_mapping(struct inode *inode, struct address_space *mapping);
extern void nfs_zap_caches(struct inode *);
+extern void nfs_set_inode_stale(struct inode *inode);
extern void nfs_invalidate_atime(struct inode *);
extern struct inode *nfs_fhget(struct super_block *, struct nfs_fh *,
struct nfs_fattr *, struct nfs4_label *);
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 0bbd587fac6a..c32c15216da3 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -139,9 +139,14 @@ extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
extern int nfs_wait_on_request(struct nfs_page *);
extern void nfs_unlock_request(struct nfs_page *req);
extern void nfs_unlock_and_release_request(struct nfs_page *);
+extern struct nfs_page *nfs_page_group_lock_head(struct nfs_page *req);
+extern int nfs_page_group_lock_subrequests(struct nfs_page *head);
+extern void nfs_join_page_group(struct nfs_page *head, struct inode *inode);
extern int nfs_page_group_lock(struct nfs_page *);
extern void nfs_page_group_unlock(struct nfs_page *);
extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int);
+extern int nfs_page_set_headlock(struct nfs_page *req);
+extern void nfs_page_clear_headlock(struct nfs_page *req);
extern bool nfs_async_iocounter_wait(struct rpc_task *, struct nfs_lock_context *);
/*
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 6838c149f335..440230488025 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1266,16 +1266,25 @@ struct nfstime4 {
struct pnfs_commit_bucket {
struct list_head written;
struct list_head committing;
- struct pnfs_layout_segment *wlseg;
- struct pnfs_layout_segment *clseg;
+ struct pnfs_layout_segment *lseg;
struct nfs_writeverf direct_verf;
};
+struct pnfs_commit_array {
+ struct list_head cinfo_list;
+ struct list_head lseg_list;
+ struct pnfs_layout_segment *lseg;
+ struct rcu_head rcu;
+ refcount_t refcount;
+ unsigned int nbuckets;
+ struct pnfs_commit_bucket buckets[];
+};
+
struct pnfs_ds_commit_info {
- int nwritten;
- int ncommitting;
- int nbuckets;
- struct pnfs_commit_bucket *buckets;
+ struct list_head commits;
+ unsigned int nwritten;
+ unsigned int ncommitting;
+ const struct pnfs_commit_ops *ops;
};
struct nfs41_state_protection {
@@ -1386,22 +1395,11 @@ struct nfs41_free_stateid_res {
unsigned int status;
};
-static inline void
-nfs_free_pnfs_ds_cinfo(struct pnfs_ds_commit_info *cinfo)
-{
- kfree(cinfo->buckets);
-}
-
#else
struct pnfs_ds_commit_info {
};
-static inline void
-nfs_free_pnfs_ds_cinfo(struct pnfs_ds_commit_info *cinfo)
-{
-}
-
#endif /* CONFIG_NFS_V4_1 */
#ifdef CONFIG_NFS_V4_2
diff --git a/include/linux/numa.h b/include/linux/numa.h
index 110b0e5d0fb0..a42df804679e 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_NUMA_H
#define _LINUX_NUMA_H
-
+#include <linux/types.h>
#ifdef CONFIG_NODES_SHIFT
#define NODES_SHIFT CONFIG_NODES_SHIFT
@@ -13,4 +13,32 @@
#define NUMA_NO_NODE (-1)
+/* optionally keep NUMA memory info available post init */
+#ifdef CONFIG_NUMA_KEEP_MEMINFO
+#define __initdata_or_meminfo
+#else
+#define __initdata_or_meminfo __initdata
+#endif
+
+#ifdef CONFIG_NUMA
+/* Generic implementation available */
+int numa_map_to_online_node(int node);
+
+/*
+ * Optional architecture specific implementation, users need a "depends
+ * on $ARCH"
+ */
+int phys_to_target_node(phys_addr_t addr);
+#else
+static inline int numa_map_to_online_node(int node)
+{
+ return NUMA_NO_NODE;
+}
+
+static inline int phys_to_target_node(phys_addr_t addr)
+{
+ return NUMA_NO_NODE;
+}
+#endif
+
#endif /* _LINUX_NUMA_H */
diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h
index 6d0d70f3219c..10f81629b9ce 100644
--- a/include/linux/nvme-fc-driver.h
+++ b/include/linux/nvme-fc-driver.h
@@ -270,8 +270,6 @@ struct nvme_fc_remote_port {
*
* Host/Initiator Transport Entrypoints/Parameters:
*
- * @module: The LLDD module using the interface
- *
* @localport_delete: The LLDD initiates deletion of a localport via
* nvme_fc_deregister_localport(). However, the teardown is
* asynchronous. This routine is called upon the completion of the
@@ -385,8 +383,6 @@ struct nvme_fc_remote_port {
* Value is Mandatory. Allowed to be zero.
*/
struct nvme_fc_port_template {
- struct module *module;
-
/* initiator-based functions */
void (*localport_delete)(struct nvme_fc_local_port *);
void (*remoteport_delete)(struct nvme_fc_remote_port *);
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 77de28bfefb0..222f6f7b2bb3 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -63,6 +63,11 @@
* page_waitqueue(page) is a wait queue of all tasks waiting for the page
* to become unlocked.
*
+ * PG_swapbacked is set when a page uses swap as a backing storage. This are
+ * usually PageAnon or shmem pages but please note that even anonymous pages
+ * might lose their PG_swapbacked flag when they simply can be dropped (e.g. as
+ * a result of MADV_FREE).
+ *
* PG_uptodate tells whether the page's contents is valid. When a read
* completes, the page becomes uptodate, unless a disk I/O error happened.
*
@@ -163,6 +168,9 @@ enum pageflags {
/* non-lru isolated movable page */
PG_isolated = PG_reclaim,
+
+ /* Only valid for buddy pages. Used to track pages that are reported */
+ PG_reported = PG_uptodate,
};
#ifndef __GENERATING_BOUNDS_H
@@ -432,6 +440,14 @@ PAGEFLAG(Idle, idle, PF_ANY)
#endif
/*
+ * PageReported() is used to track reported free pages within the Buddy
+ * allocator. We can use the non-atomic version of the test and set
+ * operations as both should be shielded with the zone lock to prevent
+ * any possible races on the setting or clearing of the bit.
+ */
+__PAGEFLAG(Reported, reported, PF_NO_COMPOUND)
+
+/*
* On an anonymous page mapped into a user virtual memory area,
* page->mapping points to its anon_vma, not to a struct address_space;
* with the PAGE_MAPPING_ANON bit set to distinguish it. See rmap.h.
diff --git a/include/linux/page_reporting.h b/include/linux/page_reporting.h
new file mode 100644
index 000000000000..3b99e0ec24f2
--- /dev/null
+++ b/include/linux/page_reporting.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_PAGE_REPORTING_H
+#define _LINUX_PAGE_REPORTING_H
+
+#include <linux/mmzone.h>
+#include <linux/scatterlist.h>
+
+/* This value should always be a power of 2, see page_reporting_cycle() */
+#define PAGE_REPORTING_CAPACITY 32
+
+struct page_reporting_dev_info {
+ /* function that alters pages to make them "reported" */
+ int (*report)(struct page_reporting_dev_info *prdev,
+ struct scatterlist *sg, unsigned int nents);
+
+ /* work struct for processing reports */
+ struct delayed_work work;
+
+ /* Current state of page reporting */
+ atomic_t state;
+};
+
+/* Tear-down and bring-up for page reporting devices */
+void page_reporting_unregister(struct page_reporting_dev_info *prdev);
+int page_reporting_register(struct page_reporting_dev_info *prdev);
+#endif /*_LINUX_PAGE_REPORTING_H */
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index f56282491a48..a8f7bd8ea1c6 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -341,9 +341,7 @@ static inline struct page *find_subpage(struct page *head, pgoff_t index)
if (PageHuge(head))
return head;
- VM_BUG_ON_PAGE(PageTail(head), head);
-
- return head + (index & (compound_nr(head) - 1));
+ return head + (index & (hpage_nr_pages(head) - 1));
}
struct page *find_get_entry(struct address_space *mapping, pgoff_t offset);
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 4f052496cdfd..0a4f54dd4737 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -78,9 +78,9 @@ static inline s64 percpu_counter_read(struct percpu_counter *fbc)
*/
static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
{
- s64 ret = fbc->count;
+ /* Prevent reloads of fbc->count */
+ s64 ret = READ_ONCE(fbc->count);
- barrier(); /* Prevent reloads of fbc->count */
if (ret >= 0)
return ret;
return 0;
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 01a0d4e28506..cc896f0fc4e3 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -60,6 +60,7 @@ struct pid
{
refcount_t count;
unsigned int level;
+ spinlock_t lock;
/* lists of tasks that use this pid */
struct hlist_head tasks[PIDTYPE_MAX];
struct hlist_head inodes;
diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h
index ba5914770191..383243326676 100644
--- a/include/linux/platform_data/cros_ec_proto.h
+++ b/include/linux/platform_data/cros_ec_proto.h
@@ -125,6 +125,9 @@ struct cros_ec_command {
* @host_event_wake_mask: Mask of host events that cause wake from suspend.
* @last_event_time: exact time from the hard irq when we got notified of
* a new event.
+ * @notifier_ready: The notifier_block to let the kernel re-query EC
+ * communication protocol when the EC sends
+ * EC_HOST_EVENT_INTERFACE_READY.
* @ec: The platform_device used by the mfd driver to interface with the
* main EC.
* @pd: The platform_device used by the mfd driver to interface with the
@@ -166,6 +169,7 @@ struct cros_ec_device {
u32 host_event_wake_mask;
u32 last_resume_result;
ktime_t last_event_time;
+ struct notifier_block notifier_ready;
/* The platform devices used by the mfd driver */
struct platform_device *ec;
diff --git a/include/linux/platform_data/cros_ec_sensorhub.h b/include/linux/platform_data/cros_ec_sensorhub.h
index bef7ffc7fce1..c588be843f61 100644
--- a/include/linux/platform_data/cros_ec_sensorhub.h
+++ b/include/linux/platform_data/cros_ec_sensorhub.h
@@ -8,8 +8,13 @@
#ifndef __LINUX_PLATFORM_DATA_CROS_EC_SENSORHUB_H
#define __LINUX_PLATFORM_DATA_CROS_EC_SENSORHUB_H
+#include <linux/ktime.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
#include <linux/platform_data/cros_ec_commands.h>
+struct iio_dev;
+
/**
* struct cros_ec_sensor_platform - ChromeOS EC sensor platform information.
* @sensor_num: Id of the sensor, as reported by the EC.
@@ -19,12 +24,170 @@ struct cros_ec_sensor_platform {
};
/**
+ * typedef cros_ec_sensorhub_push_data_cb_t - Callback function to send datum
+ * to specific sensors.
+ *
+ * @indio_dev: The IIO device that will process the sample.
+ * @data: Vector array of the ring sample.
+ * @timestamp: Timestamp in host timespace when the sample was acquired by
+ * the EC.
+ */
+typedef int (*cros_ec_sensorhub_push_data_cb_t)(struct iio_dev *indio_dev,
+ s16 *data,
+ s64 timestamp);
+
+struct cros_ec_sensorhub_sensor_push_data {
+ struct iio_dev *indio_dev;
+ cros_ec_sensorhub_push_data_cb_t push_data_cb;
+};
+
+enum {
+ CROS_EC_SENSOR_LAST_TS,
+ CROS_EC_SENSOR_NEW_TS,
+ CROS_EC_SENSOR_ALL_TS
+};
+
+struct cros_ec_sensors_ring_sample {
+ u8 sensor_id;
+ u8 flag;
+ s16 vector[3];
+ s64 timestamp;
+} __packed;
+
+/* State used for cros_ec_ring_fix_overflow */
+struct cros_ec_sensors_ec_overflow_state {
+ s64 offset;
+ s64 last;
+};
+
+/* Length of the filter, how long to remember entries for */
+#define CROS_EC_SENSORHUB_TS_HISTORY_SIZE 64
+
+/**
+ * struct cros_ec_sensors_ts_filter_state - Timestamp filetr state.
+ *
+ * @x_offset: x is EC interrupt time. x_offset its last value.
+ * @y_offset: y is the difference between AP and EC time, y_offset its last
+ * value.
+ * @x_history: The past history of x, relative to x_offset.
+ * @y_history: The past history of y, relative to y_offset.
+ * @m_history: rate between y and x.
+ * @history_len: Amount of valid historic data in the arrays.
+ * @temp_buf: Temporary buffer used when updating the filter.
+ * @median_m: median value of m_history
+ * @median_error: final error to apply to AP interrupt timestamp to get the
+ * "true timestamp" the event occurred.
+ */
+struct cros_ec_sensors_ts_filter_state {
+ s64 x_offset, y_offset;
+ s64 x_history[CROS_EC_SENSORHUB_TS_HISTORY_SIZE];
+ s64 y_history[CROS_EC_SENSORHUB_TS_HISTORY_SIZE];
+ s64 m_history[CROS_EC_SENSORHUB_TS_HISTORY_SIZE];
+ int history_len;
+
+ s64 temp_buf[CROS_EC_SENSORHUB_TS_HISTORY_SIZE];
+
+ s64 median_m;
+ s64 median_error;
+};
+
+/* struct cros_ec_sensors_ts_batch_state - State of batch of a single sensor.
+ *
+ * Use to store information to batch data using median fileter information.
+ *
+ * @penul_ts: last but one batch timestamp (penultimate timestamp).
+ * Used for timestamp spreading calculations
+ * when a batch shows up.
+ * @penul_len: last but one batch length.
+ * @last_ts: Last batch timestam.
+ * @last_len: Last batch length.
+ * @newest_sensor_event: Last sensor timestamp.
+ */
+struct cros_ec_sensors_ts_batch_state {
+ s64 penul_ts;
+ int penul_len;
+ s64 last_ts;
+ int last_len;
+ s64 newest_sensor_event;
+};
+
+/*
* struct cros_ec_sensorhub - Sensor Hub device data.
*
+ * @dev: Device object, mostly used for logging.
* @ec: Embedded Controller where the hub is located.
+ * @sensor_num: Number of MEMS sensors present in the EC.
+ * @msg: Structure to send FIFO requests.
+ * @params: Pointer to parameters in msg.
+ * @resp: Pointer to responses in msg.
+ * @cmd_lock : Lock for sending msg.
+ * @notifier: Notifier to kick the FIFO interrupt.
+ * @ring: Preprocessed ring to store events.
+ * @fifo_timestamp: Array for event timestamp and spreading.
+ * @fifo_info: Copy of FIFO information coming from the EC.
+ * @fifo_size: Size of the ring.
+ * @batch_state: Per sensor information of the last batches received.
+ * @overflow_a: For handling timestamp overflow for a time (sensor events)
+ * @overflow_b: For handling timestamp overflow for b time (ec interrupts)
+ * @filter: Medium fileter structure.
+ * @tight_timestamps: Set to truen when EC support tight timestamping:
+ * The timestamps reported from the EC have low jitter.
+ * Timestamps also come before every sample. Set either
+ * by feature bits coming from the EC or userspace.
+ * @future_timestamp_count: Statistics used to compute shaved time.
+ * This occurs when timestamp interpolation from EC
+ * time to AP time accidentally puts timestamps in
+ * the future. These timestamps are clamped to
+ * `now` and these count/total_ns maintain the
+ * statistics for how much time was removed in a
+ * given period.
+ * @future_timestamp_total_ns: Total amount of time shaved.
+ * @push_data: Array of callback to send datums to iio sensor object.
*/
struct cros_ec_sensorhub {
+ struct device *dev;
struct cros_ec_dev *ec;
+ int sensor_num;
+
+ struct cros_ec_command *msg;
+ struct ec_params_motion_sense *params;
+ struct ec_response_motion_sense *resp;
+ struct mutex cmd_lock; /* Lock for protecting msg structure. */
+
+ struct notifier_block notifier;
+
+ struct cros_ec_sensors_ring_sample *ring;
+
+ ktime_t fifo_timestamp[CROS_EC_SENSOR_ALL_TS];
+ struct ec_response_motion_sense_fifo_info *fifo_info;
+ int fifo_size;
+
+ struct cros_ec_sensors_ts_batch_state *batch_state;
+
+ struct cros_ec_sensors_ec_overflow_state overflow_a;
+ struct cros_ec_sensors_ec_overflow_state overflow_b;
+
+ struct cros_ec_sensors_ts_filter_state filter;
+
+ int tight_timestamps;
+
+ s32 future_timestamp_count;
+ s64 future_timestamp_total_ns;
+
+ struct cros_ec_sensorhub_sensor_push_data *push_data;
};
+int cros_ec_sensorhub_register_push_data(struct cros_ec_sensorhub *sensorhub,
+ u8 sensor_num,
+ struct iio_dev *indio_dev,
+ cros_ec_sensorhub_push_data_cb_t cb);
+
+void cros_ec_sensorhub_unregister_push_data(struct cros_ec_sensorhub *sensorhub,
+ u8 sensor_num);
+
+int cros_ec_sensorhub_ring_add(struct cros_ec_sensorhub *sensorhub);
+void cros_ec_sensorhub_ring_remove(void *arg);
+int cros_ec_sensorhub_ring_fifo_enable(struct cros_ec_sensorhub *sensorhub,
+ bool on);
+
#endif /* __LINUX_PLATFORM_DATA_CROS_EC_SENSORHUB_H */
diff --git a/include/linux/platform_data/cros_usbpd_notify.h b/include/linux/platform_data/cros_usbpd_notify.h
new file mode 100644
index 000000000000..4f2791722b6d
--- /dev/null
+++ b/include/linux/platform_data/cros_usbpd_notify.h
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * ChromeOS EC Power Delivery Notifier Driver
+ *
+ * Copyright 2020 Google LLC
+ */
+
+#ifndef __LINUX_PLATFORM_DATA_CROS_USBPD_NOTIFY_H
+#define __LINUX_PLATFORM_DATA_CROS_USBPD_NOTIFY_H
+
+#include <linux/notifier.h>
+
+int cros_usbpd_register_notify(struct notifier_block *nb);
+
+void cros_usbpd_unregister_notify(struct notifier_block *nb);
+
+#endif /* __LINUX_PLATFORM_DATA_CROS_USBPD_NOTIFY_H */
diff --git a/include/linux/platform_data/leds-kirkwood-ns2.h b/include/linux/platform_data/leds-kirkwood-ns2.h
deleted file mode 100644
index eb8a6860e816..000000000000
--- a/include/linux/platform_data/leds-kirkwood-ns2.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Platform data structure for Network Space v2 LED driver
- *
- * This file is licensed under the terms of the GNU General Public
- * License version 2. This program is licensed "as is" without any
- * warranty of any kind, whether express or implied.
- */
-
-#ifndef __LEDS_KIRKWOOD_NS2_H
-#define __LEDS_KIRKWOOD_NS2_H
-
-enum ns2_led_modes {
- NS_V2_LED_OFF,
- NS_V2_LED_ON,
- NS_V2_LED_SATA,
-};
-
-struct ns2_led_modval {
- enum ns2_led_modes mode;
- int cmd_level;
- int slow_level;
-};
-
-struct ns2_led {
- const char *name;
- const char *default_trigger;
- unsigned cmd;
- unsigned slow;
- int num_modes;
- struct ns2_led_modval *modval;
-};
-
-struct ns2_led_platform_data {
- int num_leds;
- struct ns2_led *leds;
-};
-
-#endif /* __LEDS_KIRKWOOD_NS2_H */
diff --git a/include/linux/platform_data/pwm_omap_dmtimer.h b/include/linux/platform_data/pwm_omap_dmtimer.h
deleted file mode 100644
index e7d521e48855..000000000000
--- a/include/linux/platform_data/pwm_omap_dmtimer.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * include/linux/platform_data/pwm_omap_dmtimer.h
- *
- * OMAP Dual-Mode Timer PWM platform data
- *
- * Copyright (C) 2010 Texas Instruments Incorporated - http://www.ti.com/
- * Tarun Kanti DebBarma <tarun.kanti@ti.com>
- * Thara Gopinath <thara@ti.com>
- *
- * Platform device conversion and hwmod support.
- *
- * Copyright (C) 2005 Nokia Corporation
- * Author: Lauri Leukkunen <lauri.leukkunen@nokia.com>
- * PWM and clock framework support by Timo Teras.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2 of the License, or (at your
- * option) any later version.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
- * NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
- * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#ifndef __PWM_OMAP_DMTIMER_PDATA_H
-#define __PWM_OMAP_DMTIMER_PDATA_H
-
-/* clock sources */
-#define PWM_OMAP_DMTIMER_SRC_SYS_CLK 0x00
-#define PWM_OMAP_DMTIMER_SRC_32_KHZ 0x01
-#define PWM_OMAP_DMTIMER_SRC_EXT_CLK 0x02
-
-/* timer interrupt enable bits */
-#define PWM_OMAP_DMTIMER_INT_CAPTURE (1 << 2)
-#define PWM_OMAP_DMTIMER_INT_OVERFLOW (1 << 1)
-#define PWM_OMAP_DMTIMER_INT_MATCH (1 << 0)
-
-/* trigger types */
-#define PWM_OMAP_DMTIMER_TRIGGER_NONE 0x00
-#define PWM_OMAP_DMTIMER_TRIGGER_OVERFLOW 0x01
-#define PWM_OMAP_DMTIMER_TRIGGER_OVERFLOW_AND_COMPARE 0x02
-
-struct omap_dm_timer;
-typedef struct omap_dm_timer pwm_omap_dmtimer;
-
-struct pwm_omap_dmtimer_pdata {
- pwm_omap_dmtimer *(*request_by_node)(struct device_node *np);
- pwm_omap_dmtimer *(*request_specific)(int timer_id);
- pwm_omap_dmtimer *(*request)(void);
-
- int (*free)(pwm_omap_dmtimer *timer);
-
- void (*enable)(pwm_omap_dmtimer *timer);
- void (*disable)(pwm_omap_dmtimer *timer);
-
- int (*get_irq)(pwm_omap_dmtimer *timer);
- int (*set_int_enable)(pwm_omap_dmtimer *timer, unsigned int value);
- int (*set_int_disable)(pwm_omap_dmtimer *timer, u32 mask);
-
- struct clk *(*get_fclk)(pwm_omap_dmtimer *timer);
-
- int (*start)(pwm_omap_dmtimer *timer);
- int (*stop)(pwm_omap_dmtimer *timer);
- int (*set_source)(pwm_omap_dmtimer *timer, int source);
-
- int (*set_load)(pwm_omap_dmtimer *timer, int autoreload,
- unsigned int value);
- int (*set_match)(pwm_omap_dmtimer *timer, int enable,
- unsigned int match);
- int (*set_pwm)(pwm_omap_dmtimer *timer, int def_on,
- int toggle, int trigger);
- int (*set_prescaler)(pwm_omap_dmtimer *timer, int prescaler);
-
- unsigned int (*read_counter)(pwm_omap_dmtimer *timer);
- int (*write_counter)(pwm_omap_dmtimer *timer, unsigned int value);
- unsigned int (*read_status)(pwm_omap_dmtimer *timer);
- int (*write_status)(pwm_omap_dmtimer *timer, unsigned int value);
-};
-
-#endif /* __PWM_OMAP_DMTIMER_PDATA_H */
diff --git a/include/linux/platform_data/wilco-ec.h b/include/linux/platform_data/wilco-ec.h
index afede15a95bf..25f46a939637 100644
--- a/include/linux/platform_data/wilco-ec.h
+++ b/include/linux/platform_data/wilco-ec.h
@@ -8,8 +8,8 @@
#ifndef WILCO_EC_H
#define WILCO_EC_H
-#include <linux/device.h>
-#include <linux/kernel.h>
+#include <linux/mutex.h>
+#include <linux/types.h>
/* Message flags for using the mailbox() interface */
#define WILCO_EC_FLAG_NO_RESPONSE BIT(0) /* EC does not respond */
@@ -17,6 +17,10 @@
/* Normal commands have a maximum 32 bytes of data */
#define EC_MAILBOX_DATA_SIZE 32
+struct device;
+struct resource;
+struct platform_device;
+
/**
* struct wilco_ec_device - Wilco Embedded Controller handle.
* @dev: Device handle.
diff --git a/include/linux/power/bq2415x_charger.h b/include/linux/power/bq2415x_charger.h
index 7a91b357e3ac..4ca08321e251 100644
--- a/include/linux/power/bq2415x_charger.h
+++ b/include/linux/power/bq2415x_charger.h
@@ -2,7 +2,7 @@
/*
* bq2415x charger driver
*
- * Copyright (C) 2011-2013 Pali Rohár <pali.rohar@gmail.com>
+ * Copyright (C) 2011-2013 Pali Rohár <pali@kernel.org>
*/
#ifndef BQ2415X_CHARGER_H
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 1e6108b8d15f..e061635e0409 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -202,7 +202,6 @@ __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...);
void dump_stack_print_info(const char *log_lvl);
void show_regs_print_info(const char *log_lvl);
extern asmlinkage void dump_stack(void) __cold;
-extern void printk_safe_init(void);
extern void printk_safe_flush(void);
extern void printk_safe_flush_on_panic(void);
#else
@@ -269,10 +268,6 @@ static inline void dump_stack(void)
{
}
-static inline void printk_safe_init(void)
-{
-}
-
static inline void printk_safe_flush(void)
{
}
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 40a7982b7285..45c05fd9c99d 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -5,6 +5,7 @@
#ifndef _LINUX_PROC_FS_H
#define _LINUX_PROC_FS_H
+#include <linux/compiler.h>
#include <linux/types.h>
#include <linux/fs.h>
@@ -12,7 +13,21 @@ struct proc_dir_entry;
struct seq_file;
struct seq_operations;
+enum {
+ /*
+ * All /proc entries using this ->proc_ops instance are never removed.
+ *
+ * If in doubt, ignore this flag.
+ */
+#ifdef MODULE
+ PROC_ENTRY_PERMANENT = 0U,
+#else
+ PROC_ENTRY_PERMANENT = 1U << 0,
+#endif
+};
+
struct proc_ops {
+ unsigned int proc_flags;
int (*proc_open)(struct inode *, struct file *);
ssize_t (*proc_read)(struct file *, char __user *, size_t, loff_t *);
ssize_t (*proc_write)(struct file *, const char __user *, size_t, loff_t *);
@@ -25,7 +40,7 @@ struct proc_ops {
#endif
int (*proc_mmap)(struct file *, struct vm_area_struct *);
unsigned long (*proc_get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
-};
+} __randomize_layout;
#ifdef CONFIG_PROC_FS
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 0ef808d925bb..2635b2a55090 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -71,7 +71,8 @@ struct pwm_state {
* @chip: PWM chip providing this PWM device
* @chip_data: chip-private data associated with the PWM device
* @args: PWM arguments
- * @state: curent PWM channel state
+ * @state: last applied state
+ * @last: last implemented state (for PWM_DEBUG)
*/
struct pwm_device {
const char *label;
@@ -83,6 +84,7 @@ struct pwm_device {
struct pwm_args args;
struct pwm_state state;
+ struct pwm_state last;
};
/**
diff --git a/include/linux/pwm_backlight.h b/include/linux/pwm_backlight.h
index 8ea265a022fd..06086cb93b6f 100644
--- a/include/linux/pwm_backlight.h
+++ b/include/linux/pwm_backlight.h
@@ -16,8 +16,6 @@ struct platform_pwm_backlight_data {
unsigned int *levels;
unsigned int post_pwm_on_delay;
unsigned int pwm_off_delay;
- /* TODO remove once all users are switched to gpiod_* API */
- int enable_gpio;
int (*init)(struct device *dev);
int (*notify)(struct device *dev, int brightness);
void (*notify_after)(struct device *dev, int brightness);
diff --git a/include/linux/refcount.h b/include/linux/refcount.h
index 0ac50cf62d06..0e3ee25eb156 100644
--- a/include/linux/refcount.h
+++ b/include/linux/refcount.h
@@ -38,11 +38,24 @@
* atomic operations, then the count will continue to edge closer to 0. If it
* reaches a value of 1 before /any/ of the threads reset it to the saturated
* value, then a concurrent refcount_dec_and_test() may erroneously free the
- * underlying object. Given the precise timing details involved with the
- * round-robin scheduling of each thread manipulating the refcount and the need
- * to hit the race multiple times in succession, there doesn't appear to be a
- * practical avenue of attack even if using refcount_add() operations with
- * larger increments.
+ * underlying object.
+ * Linux limits the maximum number of tasks to PID_MAX_LIMIT, which is currently
+ * 0x400000 (and can't easily be raised in the future beyond FUTEX_TID_MASK).
+ * With the current PID limit, if no batched refcounting operations are used and
+ * the attacker can't repeatedly trigger kernel oopses in the middle of refcount
+ * operations, this makes it impossible for a saturated refcount to leave the
+ * saturation range, even if it is possible for multiple uses of the same
+ * refcount to nest in the context of a single task:
+ *
+ * (UINT_MAX+1-REFCOUNT_SATURATED) / PID_MAX_LIMIT =
+ * 0x40000000 / 0x400000 = 0x100 = 256
+ *
+ * If hundreds of references are added/removed with a single refcounting
+ * operation, it may potentially be possible to leave the saturation range; but
+ * given the precise timing details involved with the round-robin scheduling of
+ * each thread manipulating the refcount and the need to hit the race multiple
+ * times in succession, there doesn't appear to be a practical avenue of attack
+ * even if using refcount_add() operations with larger increments.
*
* Memory ordering
* ===============
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 770c2bf3aa43..1672cf6f7614 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -21,7 +21,6 @@ struct seq_file {
size_t pad_until;
loff_t index;
loff_t read_pos;
- u64 version;
struct mutex lock;
const struct seq_operations *op;
int poll_event;
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index d56fefef8905..7a35a6901221 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -78,6 +78,7 @@ extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
extern int shmem_unuse(unsigned int type, bool frontswap,
unsigned long *fs_pages_to_unuse);
+extern bool shmem_huge_enabled(struct vm_area_struct *vma);
extern unsigned long shmem_swap_usage(struct vm_area_struct *vma);
extern unsigned long shmem_partial_swap_usage(struct address_space *mapping,
pgoff_t start, pgoff_t end);
@@ -114,15 +115,6 @@ static inline bool shmem_file(struct file *file)
extern bool shmem_charge(struct inode *inode, long pages);
extern void shmem_uncharge(struct inode *inode, long pages);
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
-extern bool shmem_huge_enabled(struct vm_area_struct *vma);
-#else
-static inline bool shmem_huge_enabled(struct vm_area_struct *vma)
-{
- return false;
-}
-#endif
-
#ifdef CONFIG_SHMEM
extern int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 28b1a2b4459e..3a2ac7072dbb 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -47,8 +47,8 @@
* A. IP checksum related features
*
* Drivers advertise checksum offload capabilities in the features of a device.
- * From the stack's point of view these are capabilities offered by the driver,
- * a driver typically only advertises features that it is capable of offloading
+ * From the stack's point of view these are capabilities offered by the driver.
+ * A driver typically only advertises features that it is capable of offloading
* to its device.
*
* The checksum related features are:
@@ -63,7 +63,7 @@
* TCP or UDP packets over IPv4. These are specifically
* unencapsulated packets of the form IPv4|TCP or
* IPv4|UDP where the Protocol field in the IPv4 header
- * is TCP or UDP. The IPv4 header may contain IP options
+ * is TCP or UDP. The IPv4 header may contain IP options.
* This feature cannot be set in features for a device
* with NETIF_F_HW_CSUM also set. This feature is being
* DEPRECATED (see below).
@@ -79,13 +79,13 @@
* DEPRECATED (see below).
*
* NETIF_F_RXCSUM - Driver (device) performs receive checksum offload.
- * This flag is used only used to disable the RX checksum
+ * This flag is only used to disable the RX checksum
* feature for a device. The stack will accept receive
* checksum indication in packets received on a device
* regardless of whether NETIF_F_RXCSUM is set.
*
* B. Checksumming of received packets by device. Indication of checksum
- * verification is in set skb->ip_summed. Possible values are:
+ * verification is set in skb->ip_summed. Possible values are:
*
* CHECKSUM_NONE:
*
@@ -115,16 +115,16 @@
* the packet minus one that have been verified as CHECKSUM_UNNECESSARY.
* For instance if a device receives an IPv6->UDP->GRE->IPv4->TCP packet
* and a device is able to verify the checksums for UDP (possibly zero),
- * GRE (checksum flag is set), and TCP-- skb->csum_level would be set to
+ * GRE (checksum flag is set) and TCP, skb->csum_level would be set to
* two. If the device were only able to verify the UDP checksum and not
- * GRE, either because it doesn't support GRE checksum of because GRE
+ * GRE, either because it doesn't support GRE checksum or because GRE
* checksum is bad, skb->csum_level would be set to zero (TCP checksum is
* not considered in this case).
*
* CHECKSUM_COMPLETE:
*
* This is the most generic way. The device supplied checksum of the _whole_
- * packet as seen by netif_rx() and fills out in skb->csum. Meaning, the
+ * packet as seen by netif_rx() and fills in skb->csum. This means the
* hardware doesn't need to parse L3/L4 headers to implement this.
*
* Notes:
@@ -153,8 +153,8 @@
* from skb->csum_start up to the end, and to record/write the checksum at
* offset skb->csum_start + skb->csum_offset. A driver may verify that the
* csum_start and csum_offset values are valid values given the length and
- * offset of the packet, however they should not attempt to validate that the
- * checksum refers to a legitimate transport layer checksum-- it is the
+ * offset of the packet, but it should not attempt to validate that the
+ * checksum refers to a legitimate transport layer checksum -- it is the
* purview of the stack to validate that csum_start and csum_offset are set
* correctly.
*
@@ -178,18 +178,18 @@
*
* CHECKSUM_UNNECESSARY:
*
- * This has the same meaning on as CHECKSUM_NONE for checksum offload on
+ * This has the same meaning as CHECKSUM_NONE for checksum offload on
* output.
*
* CHECKSUM_COMPLETE:
* Not used in checksum output. If a driver observes a packet with this value
- * set in skbuff, if should treat as CHECKSUM_NONE being set.
+ * set in skbuff, it should treat the packet as if CHECKSUM_NONE were set.
*
* D. Non-IP checksum (CRC) offloads
*
* NETIF_F_SCTP_CRC - This feature indicates that a device is capable of
* offloading the SCTP CRC in a packet. To perform this offload the stack
- * will set set csum_start and csum_offset accordingly, set ip_summed to
+ * will set csum_start and csum_offset accordingly, set ip_summed to
* CHECKSUM_PARTIAL and set csum_not_inet to 1, to provide an indication in
* the skbuff that the CHECKSUM_PARTIAL refers to CRC32c.
* A driver that supports both IP checksum offload and SCTP CRC32c offload
@@ -200,10 +200,10 @@
* NETIF_F_FCOE_CRC - This feature indicates that a device is capable of
* offloading the FCOE CRC in a packet. To perform this offload the stack
* will set ip_summed to CHECKSUM_PARTIAL and set csum_start and csum_offset
- * accordingly. Note the there is no indication in the skbuff that the
- * CHECKSUM_PARTIAL refers to an FCOE checksum, a driver that supports
+ * accordingly. Note that there is no indication in the skbuff that the
+ * CHECKSUM_PARTIAL refers to an FCOE checksum, so a driver that supports
* both IP checksum offload and FCOE CRC offload must verify which offload
- * is configured for a packet presumably by inspecting packet headers.
+ * is configured for a packet, presumably by inspecting packet headers.
*
* E. Checksumming on output with GSO.
*
@@ -211,9 +211,9 @@
* is implied by the SKB_GSO_* flags in gso_type. Most obviously, if the
* gso_type is SKB_GSO_TCPV4 or SKB_GSO_TCPV6, TCP checksum offload as
* part of the GSO operation is implied. If a checksum is being offloaded
- * with GSO then ip_summed is CHECKSUM_PARTIAL, csum_start and csum_offset
- * are set to refer to the outermost checksum being offload (two offloaded
- * checksums are possible with UDP encapsulation).
+ * with GSO then ip_summed is CHECKSUM_PARTIAL, and both csum_start and
+ * csum_offset are set to refer to the outermost checksum being offloaded
+ * (two offloaded checksums are possible with UDP encapsulation).
*/
/* Don't change this without changing skb_csum_unnecessary! */
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 03a389358562..6d454886bcaf 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -501,7 +501,7 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
* :ref:`Documentation/core-api/mm-api.rst <mm-api-gfp-flags>`
*
* The recommended usage of the @flags is described at
- * :ref:`Documentation/core-api/memory-allocation.rst <memory-allocation>`
+ * :ref:`Documentation/core-api/memory-allocation.rst <memory_allocation>`
*
* Below is a brief outline of the most useful GFP flags
*
diff --git a/include/linux/spi/corgi_lcd.h b/include/linux/spi/corgi_lcd.h
index edf4beccdadb..0b857616919c 100644
--- a/include/linux/spi/corgi_lcd.h
+++ b/include/linux/spi/corgi_lcd.h
@@ -11,9 +11,6 @@ struct corgi_lcd_platform_data {
int default_intensity;
int limit_mask;
- int gpio_backlight_on; /* -1 if n/a */
- int gpio_backlight_cont; /* -1 if n/a */
-
void (*notify)(int intensity);
void (*kick_battery)(void);
};
diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
index 3efa97d482cb..24d49c732341 100644
--- a/include/linux/stackdepot.h
+++ b/include/linux/stackdepot.h
@@ -19,4 +19,6 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries,
unsigned int stack_depot_fetch(depot_stack_handle_t handle,
unsigned long **entries);
+unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries);
+
#endif
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index a6ef35184ef1..df696efdd675 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -132,6 +132,7 @@ struct rpc_task_setup {
#define RPC_TASK_TIMEOUT 0x1000 /* fail with ETIMEDOUT on timeout */
#define RPC_TASK_NOCONNECT 0x2000 /* return ENOTCONN if not connected */
#define RPC_TASK_NO_RETRANS_TIMEOUT 0x4000 /* wait forever for a reply */
+#define RPC_TASK_CRED_NOREF 0x8000 /* No refcount on the credential */
#define RPC_IS_ASYNC(t) ((t)->tk_flags & RPC_TASK_ASYNC)
#define RPC_IS_SWAPPER(t) ((t)->tk_flags & RPC_TASK_SWAPPER)
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 8529d6e33137..01bb41908c93 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -184,7 +184,6 @@ xdr_adjust_iovec(struct kvec *iov, __be32 *p)
extern void xdr_shift_buf(struct xdr_buf *, size_t);
extern void xdr_buf_from_iov(struct kvec *, struct xdr_buf *);
extern int xdr_buf_subsegment(struct xdr_buf *, struct xdr_buf *, unsigned int, unsigned int);
-extern int xdr_buf_read_mic(struct xdr_buf *, struct xdr_netobj *, unsigned int);
extern int read_bytes_from_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int);
extern int write_bytes_to_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int);
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 877fd239b6ff..d9b7c9132c2f 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -68,6 +68,8 @@ static inline swp_entry_t pte_to_swp_entry(pte_t pte)
if (pte_swp_soft_dirty(pte))
pte = pte_swp_clear_soft_dirty(pte);
+ if (pte_swp_uffd_wp(pte))
+ pte = pte_swp_clear_uffd_wp(pte);
arch_entry = __pte_to_swp_entry(pte);
return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
}
@@ -348,7 +350,8 @@ static inline void num_poisoned_pages_inc(void)
}
#endif
-#if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION)
+#if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION) || \
+ defined(CONFIG_DEVICE_PRIVATE)
static inline int non_swap_entry(swp_entry_t entry)
{
return swp_type(entry) >= MAX_SWAPFILES;
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 126913c6a53b..c91b1e344d56 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -364,6 +364,9 @@ struct thermal_trip {
/* Function declarations */
#ifdef CONFIG_THERMAL_OF
+int thermal_zone_of_get_sensor_id(struct device_node *tz_np,
+ struct device_node *sensor_np,
+ u32 *id);
struct thermal_zone_device *
thermal_zone_of_sensor_register(struct device *dev, int id, void *data,
const struct thermal_zone_of_device_ops *ops);
@@ -375,6 +378,13 @@ struct thermal_zone_device *devm_thermal_zone_of_sensor_register(
void devm_thermal_zone_of_sensor_unregister(struct device *dev,
struct thermal_zone_device *tz);
#else
+
+static inline int thermal_zone_of_get_sensor_id(struct device_node *tz_np,
+ struct device_node *sensor_np,
+ u32 *id)
+{
+ return -ENOENT;
+}
static inline struct thermal_zone_device *
thermal_zone_of_sensor_register(struct device *dev, int id, void *data,
const struct thermal_zone_of_device_ops *ops)
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index ac9d71e24b81..a8e5f3ea9bb2 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -14,6 +14,8 @@
#include <linux/userfaultfd.h> /* linux/include/uapi/linux/userfaultfd.h */
#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <asm-generic/pgtable_uffd.h>
/*
* CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining
@@ -34,11 +36,14 @@ extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
unsigned long src_start, unsigned long len,
- bool *mmap_changing);
+ bool *mmap_changing, __u64 mode);
extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
unsigned long dst_start,
unsigned long len,
bool *mmap_changing);
+extern int mwriteprotect_range(struct mm_struct *dst_mm,
+ unsigned long start, unsigned long len,
+ bool enable_wp, bool *mmap_changing);
/* mm helpers */
static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
@@ -52,6 +57,23 @@ static inline bool userfaultfd_missing(struct vm_area_struct *vma)
return vma->vm_flags & VM_UFFD_MISSING;
}
+static inline bool userfaultfd_wp(struct vm_area_struct *vma)
+{
+ return vma->vm_flags & VM_UFFD_WP;
+}
+
+static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma,
+ pte_t pte)
+{
+ return userfaultfd_wp(vma) && pte_uffd_wp(pte);
+}
+
+static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma,
+ pmd_t pmd)
+{
+ return userfaultfd_wp(vma) && pmd_uffd_wp(pmd);
+}
+
static inline bool userfaultfd_armed(struct vm_area_struct *vma)
{
return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP);
@@ -96,6 +118,24 @@ static inline bool userfaultfd_missing(struct vm_area_struct *vma)
return false;
}
+static inline bool userfaultfd_wp(struct vm_area_struct *vma)
+{
+ return false;
+}
+
+static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma,
+ pte_t pte)
+{
+ return false;
+}
+
+static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma,
+ pmd_t pmd)
+{
+ return false;
+}
+
+
static inline bool userfaultfd_armed(struct vm_area_struct *vma)
{
return false;
diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
new file mode 100644
index 000000000000..733acfb7ef84
--- /dev/null
+++ b/include/linux/vdpa.h
@@ -0,0 +1,253 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_VDPA_H
+#define _LINUX_VDPA_H
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/interrupt.h>
+#include <linux/vhost_iotlb.h>
+
+/**
+ * vDPA callback definition.
+ * @callback: interrupt callback function
+ * @private: the data passed to the callback function
+ */
+struct vdpa_callback {
+ irqreturn_t (*callback)(void *data);
+ void *private;
+};
+
+/**
+ * vDPA device - representation of a vDPA device
+ * @dev: underlying device
+ * @dma_dev: the actual device that is performing DMA
+ * @config: the configuration ops for this device.
+ * @index: device index
+ */
+struct vdpa_device {
+ struct device dev;
+ struct device *dma_dev;
+ const struct vdpa_config_ops *config;
+ unsigned int index;
+};
+
+/**
+ * vDPA_config_ops - operations for configuring a vDPA device.
+ * Note: vDPA device drivers are required to implement all of the
+ * operations unless it is mentioned to be optional in the following
+ * list.
+ *
+ * @set_vq_address: Set the address of virtqueue
+ * @vdev: vdpa device
+ * @idx: virtqueue index
+ * @desc_area: address of desc area
+ * @driver_area: address of driver area
+ * @device_area: address of device area
+ * Returns integer: success (0) or error (< 0)
+ * @set_vq_num: Set the size of virtqueue
+ * @vdev: vdpa device
+ * @idx: virtqueue index
+ * @num: the size of virtqueue
+ * @kick_vq: Kick the virtqueue
+ * @vdev: vdpa device
+ * @idx: virtqueue index
+ * @set_vq_cb: Set the interrupt callback function for
+ * a virtqueue
+ * @vdev: vdpa device
+ * @idx: virtqueue index
+ * @cb: virtio-vdev interrupt callback structure
+ * @set_vq_ready: Set ready status for a virtqueue
+ * @vdev: vdpa device
+ * @idx: virtqueue index
+ * @ready: ready (true) not ready(false)
+ * @get_vq_ready: Get ready status for a virtqueue
+ * @vdev: vdpa device
+ * @idx: virtqueue index
+ * Returns boolean: ready (true) or not (false)
+ * @set_vq_state: Set the state for a virtqueue
+ * @vdev: vdpa device
+ * @idx: virtqueue index
+ * @state: virtqueue state (last_avail_idx)
+ * Returns integer: success (0) or error (< 0)
+ * @get_vq_state: Get the state for a virtqueue
+ * @vdev: vdpa device
+ * @idx: virtqueue index
+ * Returns virtqueue state (last_avail_idx)
+ * @get_vq_align: Get the virtqueue align requirement
+ * for the device
+ * @vdev: vdpa device
+ * Returns virtqueue algin requirement
+ * @get_features: Get virtio features supported by the device
+ * @vdev: vdpa device
+ * Returns the virtio features support by the
+ * device
+ * @set_features: Set virtio features supported by the driver
+ * @vdev: vdpa device
+ * @features: feature support by the driver
+ * Returns integer: success (0) or error (< 0)
+ * @set_config_cb: Set the config interrupt callback
+ * @vdev: vdpa device
+ * @cb: virtio-vdev interrupt callback structure
+ * @get_vq_num_max: Get the max size of virtqueue
+ * @vdev: vdpa device
+ * Returns u16: max size of virtqueue
+ * @get_device_id: Get virtio device id
+ * @vdev: vdpa device
+ * Returns u32: virtio device id
+ * @get_vendor_id: Get id for the vendor that provides this device
+ * @vdev: vdpa device
+ * Returns u32: virtio vendor id
+ * @get_status: Get the device status
+ * @vdev: vdpa device
+ * Returns u8: virtio device status
+ * @set_status: Set the device status
+ * @vdev: vdpa device
+ * @status: virtio device status
+ * @get_config: Read from device specific configuration space
+ * @vdev: vdpa device
+ * @offset: offset from the beginning of
+ * configuration space
+ * @buf: buffer used to read to
+ * @len: the length to read from
+ * configuration space
+ * @set_config: Write to device specific configuration space
+ * @vdev: vdpa device
+ * @offset: offset from the beginning of
+ * configuration space
+ * @buf: buffer used to write from
+ * @len: the length to write to
+ * configuration space
+ * @get_generation: Get device config generation (optional)
+ * @vdev: vdpa device
+ * Returns u32: device generation
+ * @set_map: Set device memory mapping (optional)
+ * Needed for device that using device
+ * specific DMA translation (on-chip IOMMU)
+ * @vdev: vdpa device
+ * @iotlb: vhost memory mapping to be
+ * used by the vDPA
+ * Returns integer: success (0) or error (< 0)
+ * @dma_map: Map an area of PA to IOVA (optional)
+ * Needed for device that using device
+ * specific DMA translation (on-chip IOMMU)
+ * and preferring incremental map.
+ * @vdev: vdpa device
+ * @iova: iova to be mapped
+ * @size: size of the area
+ * @pa: physical address for the map
+ * @perm: device access permission (VHOST_MAP_XX)
+ * Returns integer: success (0) or error (< 0)
+ * @dma_unmap: Unmap an area of IOVA (optional but
+ * must be implemented with dma_map)
+ * Needed for device that using device
+ * specific DMA translation (on-chip IOMMU)
+ * and preferring incremental unmap.
+ * @vdev: vdpa device
+ * @iova: iova to be unmapped
+ * @size: size of the area
+ * Returns integer: success (0) or error (< 0)
+ * @free: Free resources that belongs to vDPA (optional)
+ * @vdev: vdpa device
+ */
+struct vdpa_config_ops {
+ /* Virtqueue ops */
+ int (*set_vq_address)(struct vdpa_device *vdev,
+ u16 idx, u64 desc_area, u64 driver_area,
+ u64 device_area);
+ void (*set_vq_num)(struct vdpa_device *vdev, u16 idx, u32 num);
+ void (*kick_vq)(struct vdpa_device *vdev, u16 idx);
+ void (*set_vq_cb)(struct vdpa_device *vdev, u16 idx,
+ struct vdpa_callback *cb);
+ void (*set_vq_ready)(struct vdpa_device *vdev, u16 idx, bool ready);
+ bool (*get_vq_ready)(struct vdpa_device *vdev, u16 idx);
+ int (*set_vq_state)(struct vdpa_device *vdev, u16 idx, u64 state);
+ u64 (*get_vq_state)(struct vdpa_device *vdev, u16 idx);
+
+ /* Device ops */
+ u16 (*get_vq_align)(struct vdpa_device *vdev);
+ u64 (*get_features)(struct vdpa_device *vdev);
+ int (*set_features)(struct vdpa_device *vdev, u64 features);
+ void (*set_config_cb)(struct vdpa_device *vdev,
+ struct vdpa_callback *cb);
+ u16 (*get_vq_num_max)(struct vdpa_device *vdev);
+ u32 (*get_device_id)(struct vdpa_device *vdev);
+ u32 (*get_vendor_id)(struct vdpa_device *vdev);
+ u8 (*get_status)(struct vdpa_device *vdev);
+ void (*set_status)(struct vdpa_device *vdev, u8 status);
+ void (*get_config)(struct vdpa_device *vdev, unsigned int offset,
+ void *buf, unsigned int len);
+ void (*set_config)(struct vdpa_device *vdev, unsigned int offset,
+ const void *buf, unsigned int len);
+ u32 (*get_generation)(struct vdpa_device *vdev);
+
+ /* DMA ops */
+ int (*set_map)(struct vdpa_device *vdev, struct vhost_iotlb *iotlb);
+ int (*dma_map)(struct vdpa_device *vdev, u64 iova, u64 size,
+ u64 pa, u32 perm);
+ int (*dma_unmap)(struct vdpa_device *vdev, u64 iova, u64 size);
+
+ /* Free device resources */
+ void (*free)(struct vdpa_device *vdev);
+};
+
+struct vdpa_device *__vdpa_alloc_device(struct device *parent,
+ const struct vdpa_config_ops *config,
+ size_t size);
+
+#define vdpa_alloc_device(dev_struct, member, parent, config) \
+ container_of(__vdpa_alloc_device( \
+ parent, config, \
+ sizeof(dev_struct) + \
+ BUILD_BUG_ON_ZERO(offsetof( \
+ dev_struct, member))), \
+ dev_struct, member)
+
+int vdpa_register_device(struct vdpa_device *vdev);
+void vdpa_unregister_device(struct vdpa_device *vdev);
+
+/**
+ * vdpa_driver - operations for a vDPA driver
+ * @driver: underlying device driver
+ * @probe: the function to call when a device is found. Returns 0 or -errno.
+ * @remove: the function to call when a device is removed.
+ */
+struct vdpa_driver {
+ struct device_driver driver;
+ int (*probe)(struct vdpa_device *vdev);
+ void (*remove)(struct vdpa_device *vdev);
+};
+
+#define vdpa_register_driver(drv) \
+ __vdpa_register_driver(drv, THIS_MODULE)
+int __vdpa_register_driver(struct vdpa_driver *drv, struct module *owner);
+void vdpa_unregister_driver(struct vdpa_driver *drv);
+
+#define module_vdpa_driver(__vdpa_driver) \
+ module_driver(__vdpa_driver, vdpa_register_driver, \
+ vdpa_unregister_driver)
+
+static inline struct vdpa_driver *drv_to_vdpa(struct device_driver *driver)
+{
+ return container_of(driver, struct vdpa_driver, driver);
+}
+
+static inline struct vdpa_device *dev_to_vdpa(struct device *_dev)
+{
+ return container_of(_dev, struct vdpa_device, dev);
+}
+
+static inline void *vdpa_get_drvdata(const struct vdpa_device *vdev)
+{
+ return dev_get_drvdata(&vdev->dev);
+}
+
+static inline void vdpa_set_drvdata(struct vdpa_device *vdev, void *data)
+{
+ dev_set_drvdata(&vdev->dev, data);
+}
+
+static inline struct device *vdpa_get_dma_dev(struct vdpa_device *vdev)
+{
+ return vdev->dma_dev;
+}
+#endif /* _LINUX_VDPA_H */
diff --git a/include/linux/vhost_iotlb.h b/include/linux/vhost_iotlb.h
new file mode 100644
index 000000000000..6b09b786a762
--- /dev/null
+++ b/include/linux/vhost_iotlb.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_VHOST_IOTLB_H
+#define _LINUX_VHOST_IOTLB_H
+
+#include <linux/interval_tree_generic.h>
+
+struct vhost_iotlb_map {
+ struct rb_node rb;
+ struct list_head link;
+ u64 start;
+ u64 last;
+ u64 size;
+ u64 addr;
+#define VHOST_MAP_RO 0x1
+#define VHOST_MAP_WO 0x2
+#define VHOST_MAP_RW 0x3
+ u32 perm;
+ u32 flags_padding;
+ u64 __subtree_last;
+};
+
+#define VHOST_IOTLB_FLAG_RETIRE 0x1
+
+struct vhost_iotlb {
+ struct rb_root_cached root;
+ struct list_head list;
+ unsigned int limit;
+ unsigned int nmaps;
+ unsigned int flags;
+};
+
+int vhost_iotlb_add_range(struct vhost_iotlb *iotlb, u64 start, u64 last,
+ u64 addr, unsigned int perm);
+void vhost_iotlb_del_range(struct vhost_iotlb *iotlb, u64 start, u64 last);
+
+struct vhost_iotlb *vhost_iotlb_alloc(unsigned int limit, unsigned int flags);
+void vhost_iotlb_free(struct vhost_iotlb *iotlb);
+void vhost_iotlb_reset(struct vhost_iotlb *iotlb);
+
+struct vhost_iotlb_map *
+vhost_iotlb_itree_first(struct vhost_iotlb *iotlb, u64 start, u64 last);
+struct vhost_iotlb_map *
+vhost_iotlb_itree_next(struct vhost_iotlb_map *map, u64 start, u64 last);
+
+void vhost_iotlb_map_free(struct vhost_iotlb *iotlb,
+ struct vhost_iotlb_map *map);
+#endif
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 47a3441cf4c4..ffef0f279747 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -73,9 +73,12 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
THP_FAULT_ALLOC,
THP_FAULT_FALLBACK,
+ THP_FAULT_FALLBACK_CHARGE,
THP_COLLAPSE_ALLOC,
THP_COLLAPSE_ALLOC_FAILED,
THP_FILE_ALLOC,
+ THP_FILE_FALLBACK,
+ THP_FILE_FALLBACK_CHARGE,
THP_FILE_MAPPED,
THP_SPLIT_PAGE,
THP_SPLIT_PAGE_FAILED,
@@ -115,6 +118,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
#ifndef CONFIG_TRANSPARENT_HUGEPAGE
#define THP_FILE_ALLOC ({ BUILD_BUG(); 0; })
+#define THP_FILE_FALLBACK ({ BUILD_BUG(); 0; })
+#define THP_FILE_FALLBACK_CHARGE ({ BUILD_BUG(); 0; })
#define THP_FILE_MAPPED ({ BUILD_BUG(); 0; })
#endif
diff --git a/include/linux/vringh.h b/include/linux/vringh.h
index d237087eb257..bd0503ca6f8f 100644
--- a/include/linux/vringh.h
+++ b/include/linux/vringh.h
@@ -14,6 +14,8 @@
#include <linux/virtio_byteorder.h>
#include <linux/uio.h>
#include <linux/slab.h>
+#include <linux/dma-direction.h>
+#include <linux/vhost_iotlb.h>
#include <asm/barrier.h>
/* virtio_ring with information needed for host access. */
@@ -39,6 +41,9 @@ struct vringh {
/* The vring (note: it may contain user pointers!) */
struct vring vring;
+ /* IOTLB for this vring */
+ struct vhost_iotlb *iotlb;
+
/* The function to call to notify the guest about added buffers */
void (*notify)(struct vringh *);
};
@@ -248,4 +253,35 @@ static inline __virtio64 cpu_to_vringh64(const struct vringh *vrh, u64 val)
{
return __cpu_to_virtio64(vringh_is_little_endian(vrh), val);
}
+
+void vringh_set_iotlb(struct vringh *vrh, struct vhost_iotlb *iotlb);
+
+int vringh_init_iotlb(struct vringh *vrh, u64 features,
+ unsigned int num, bool weak_barriers,
+ struct vring_desc *desc,
+ struct vring_avail *avail,
+ struct vring_used *used);
+
+int vringh_getdesc_iotlb(struct vringh *vrh,
+ struct vringh_kiov *riov,
+ struct vringh_kiov *wiov,
+ u16 *head,
+ gfp_t gfp);
+
+ssize_t vringh_iov_pull_iotlb(struct vringh *vrh,
+ struct vringh_kiov *riov,
+ void *dst, size_t len);
+ssize_t vringh_iov_push_iotlb(struct vringh *vrh,
+ struct vringh_kiov *wiov,
+ const void *src, size_t len);
+
+void vringh_abandon_iotlb(struct vringh *vrh, unsigned int num);
+
+int vringh_complete_iotlb(struct vringh *vrh, u16 head, u32 len);
+
+bool vringh_notify_enable_iotlb(struct vringh *vrh);
+void vringh_notify_disable_iotlb(struct vringh *vrh);
+
+int vringh_need_notify_iotlb(struct vringh *vrh);
+
#endif /* _LINUX_VRINGH_H */
diff --git a/include/sound/soc-dai.h b/include/sound/soc-dai.h
index 78bac995db15..d4825b82c7a3 100644
--- a/include/sound/soc-dai.h
+++ b/include/sound/soc-dai.h
@@ -351,7 +351,7 @@ struct snd_soc_dai {
/* bit field */
unsigned int probed:1;
- unsigned int started:1;
+ unsigned int started[SNDRV_PCM_STREAM_LAST + 1];
};
static inline struct snd_soc_pcm_stream *
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index 67a97838c2a0..d97adfc327f0 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -153,7 +153,8 @@ TRACE_DEFINE_ENUM(CP_PAUSE);
#define show_compress_algorithm(type) \
__print_symbolic(type, \
{ COMPRESS_LZO, "LZO" }, \
- { COMPRESS_LZ4, "LZ4" })
+ { COMPRESS_LZ4, "LZ4" }, \
+ { COMPRESS_ZSTD, "ZSTD" })
struct f2fs_sb_info;
struct f2fs_io_info;
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index d82a0f4e824d..70e32ff096ec 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -13,6 +13,7 @@
EM( SCAN_PMD_NULL, "pmd_null") \
EM( SCAN_EXCEED_NONE_PTE, "exceed_none_pte") \
EM( SCAN_PTE_NON_PRESENT, "pte_non_present") \
+ EM( SCAN_PTE_UFFD_WP, "pte_uffd_wp") \
EM( SCAN_PAGE_RO, "no_writable_page") \
EM( SCAN_LACK_REFERENCED_PAGE, "lack_referenced_page") \
EM( SCAN_PAGE_NULL, "page_null") \
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index a1675d43777e..5fb752034386 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -154,6 +154,7 @@ IF_HAVE_PG_IDLE(PG_idle, "idle" )
{VM_ACCOUNT, "account" }, \
{VM_NORESERVE, "noreserve" }, \
{VM_HUGETLB, "hugetlb" }, \
+ {VM_SYNC, "sync" }, \
__VM_ARCH_SPECIFIC_1 , \
{VM_WIPEONFORK, "wipeonfork" }, \
{VM_DONTDUMP, "dontdump" }, \
diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index 9238d233f8cf..051f26fedc4d 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -104,12 +104,12 @@ DECLARE_EVENT_CLASS(xprtrdma_connect_class,
TP_fast_assign(
__entry->r_xprt = r_xprt;
__entry->rc = rc;
- __entry->connect_status = r_xprt->rx_ep.rep_connected;
+ __entry->connect_status = r_xprt->rx_ep->re_connect_status;
__assign_str(addr, rpcrdma_addrstr(r_xprt));
__assign_str(port, rpcrdma_portstr(r_xprt));
),
- TP_printk("peer=[%s]:%s r_xprt=%p: rc=%d connect status=%d",
+ TP_printk("peer=[%s]:%s r_xprt=%p: rc=%d connection status=%d",
__get_str(addr), __get_str(port), __entry->r_xprt,
__entry->rc, __entry->connect_status
)
@@ -228,20 +228,20 @@ DECLARE_EVENT_CLASS(xprtrdma_frwr_done,
TP_ARGS(wc, frwr),
TP_STRUCT__entry(
- __field(const void *, mr)
+ __field(u32, mr_id)
__field(unsigned int, status)
__field(unsigned int, vendor_err)
),
TP_fast_assign(
- __entry->mr = container_of(frwr, struct rpcrdma_mr, frwr);
+ __entry->mr_id = frwr->fr_mr->res.id;
__entry->status = wc->status;
__entry->vendor_err = __entry->status ? wc->vendor_err : 0;
),
TP_printk(
- "mr=%p: %s (%u/0x%x)",
- __entry->mr, rdma_show_wc_status(__entry->status),
+ "mr.id=%u: %s (%u/0x%x)",
+ __entry->mr_id, rdma_show_wc_status(__entry->status),
__entry->status, __entry->vendor_err
)
);
@@ -274,7 +274,8 @@ DECLARE_EVENT_CLASS(xprtrdma_mr,
TP_ARGS(mr),
TP_STRUCT__entry(
- __field(const void *, mr)
+ __field(u32, mr_id)
+ __field(int, nents)
__field(u32, handle)
__field(u32, length)
__field(u64, offset)
@@ -282,15 +283,16 @@ DECLARE_EVENT_CLASS(xprtrdma_mr,
),
TP_fast_assign(
- __entry->mr = mr;
+ __entry->mr_id = mr->frwr.fr_mr->res.id;
+ __entry->nents = mr->mr_nents;
__entry->handle = mr->mr_handle;
__entry->length = mr->mr_length;
__entry->offset = mr->mr_offset;
__entry->dir = mr->mr_dir;
),
- TP_printk("mr=%p %u@0x%016llx:0x%08x (%s)",
- __entry->mr, __entry->length,
+ TP_printk("mr.id=%u nents=%d %u@0x%016llx:0x%08x (%s)",
+ __entry->mr_id, __entry->nents, __entry->length,
(unsigned long long)__entry->offset, __entry->handle,
xprtrdma_show_direction(__entry->dir)
)
@@ -340,68 +342,37 @@ DECLARE_EVENT_CLASS(xprtrdma_cb_event,
** Connection events
**/
-TRACE_EVENT(xprtrdma_cm_event,
- TP_PROTO(
- const struct rpcrdma_xprt *r_xprt,
- struct rdma_cm_event *event
- ),
-
- TP_ARGS(r_xprt, event),
-
- TP_STRUCT__entry(
- __field(const void *, r_xprt)
- __field(unsigned int, event)
- __field(int, status)
- __string(addr, rpcrdma_addrstr(r_xprt))
- __string(port, rpcrdma_portstr(r_xprt))
- ),
-
- TP_fast_assign(
- __entry->r_xprt = r_xprt;
- __entry->event = event->event;
- __entry->status = event->status;
- __assign_str(addr, rpcrdma_addrstr(r_xprt));
- __assign_str(port, rpcrdma_portstr(r_xprt));
- ),
-
- TP_printk("peer=[%s]:%s r_xprt=%p: %s (%u/%d)",
- __get_str(addr), __get_str(port),
- __entry->r_xprt, rdma_show_cm_event(__entry->event),
- __entry->event, __entry->status
- )
-);
-
TRACE_EVENT(xprtrdma_inline_thresh,
TP_PROTO(
- const struct rpcrdma_xprt *r_xprt
+ const struct rpcrdma_ep *ep
),
- TP_ARGS(r_xprt),
+ TP_ARGS(ep),
TP_STRUCT__entry(
- __field(const void *, r_xprt)
__field(unsigned int, inline_send)
__field(unsigned int, inline_recv)
__field(unsigned int, max_send)
__field(unsigned int, max_recv)
- __string(addr, rpcrdma_addrstr(r_xprt))
- __string(port, rpcrdma_portstr(r_xprt))
+ __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6))
+ __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6))
),
TP_fast_assign(
- const struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+ const struct rdma_cm_id *id = ep->re_id;
- __entry->r_xprt = r_xprt;
- __entry->inline_send = ep->rep_inline_send;
- __entry->inline_recv = ep->rep_inline_recv;
- __entry->max_send = ep->rep_max_inline_send;
- __entry->max_recv = ep->rep_max_inline_recv;
- __assign_str(addr, rpcrdma_addrstr(r_xprt));
- __assign_str(port, rpcrdma_portstr(r_xprt));
+ __entry->inline_send = ep->re_inline_send;
+ __entry->inline_recv = ep->re_inline_recv;
+ __entry->max_send = ep->re_max_inline_send;
+ __entry->max_recv = ep->re_max_inline_recv;
+ memcpy(__entry->srcaddr, &id->route.addr.src_addr,
+ sizeof(struct sockaddr_in6));
+ memcpy(__entry->dstaddr, &id->route.addr.dst_addr,
+ sizeof(struct sockaddr_in6));
),
- TP_printk("peer=[%s]:%s r_xprt=%p neg send/recv=%u/%u, calc send/recv=%u/%u",
- __get_str(addr), __get_str(port), __entry->r_xprt,
+ TP_printk("%pISpc -> %pISpc neg send/recv=%u/%u, calc send/recv=%u/%u",
+ __entry->srcaddr, __entry->dstaddr,
__entry->inline_send, __entry->inline_recv,
__entry->max_send, __entry->max_recv
)
@@ -409,11 +380,10 @@ TRACE_EVENT(xprtrdma_inline_thresh,
DEFINE_CONN_EVENT(connect);
DEFINE_CONN_EVENT(disconnect);
+DEFINE_CONN_EVENT(flush_dct);
DEFINE_RXPRT_EVENT(xprtrdma_create);
DEFINE_RXPRT_EVENT(xprtrdma_op_destroy);
-DEFINE_RXPRT_EVENT(xprtrdma_remove);
-DEFINE_RXPRT_EVENT(xprtrdma_reinsert);
DEFINE_RXPRT_EVENT(xprtrdma_op_inject_dsc);
DEFINE_RXPRT_EVENT(xprtrdma_op_close);
DEFINE_RXPRT_EVENT(xprtrdma_op_setport);
@@ -480,32 +450,33 @@ TRACE_EVENT(xprtrdma_op_set_cto,
TRACE_EVENT(xprtrdma_qp_event,
TP_PROTO(
- const struct rpcrdma_xprt *r_xprt,
+ const struct rpcrdma_ep *ep,
const struct ib_event *event
),
- TP_ARGS(r_xprt, event),
+ TP_ARGS(ep, event),
TP_STRUCT__entry(
- __field(const void *, r_xprt)
- __field(unsigned int, event)
+ __field(unsigned long, event)
__string(name, event->device->name)
- __string(addr, rpcrdma_addrstr(r_xprt))
- __string(port, rpcrdma_portstr(r_xprt))
+ __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6))
+ __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6))
),
TP_fast_assign(
- __entry->r_xprt = r_xprt;
+ const struct rdma_cm_id *id = ep->re_id;
+
__entry->event = event->event;
__assign_str(name, event->device->name);
- __assign_str(addr, rpcrdma_addrstr(r_xprt));
- __assign_str(port, rpcrdma_portstr(r_xprt));
+ memcpy(__entry->srcaddr, &id->route.addr.src_addr,
+ sizeof(struct sockaddr_in6));
+ memcpy(__entry->dstaddr, &id->route.addr.dst_addr,
+ sizeof(struct sockaddr_in6));
),
- TP_printk("peer=[%s]:%s r_xprt=%p: dev %s: %s (%u)",
- __get_str(addr), __get_str(port), __entry->r_xprt,
- __get_str(name), rdma_show_ib_event(__entry->event),
- __entry->event
+ TP_printk("%pISpc -> %pISpc device=%s %s (%lu)",
+ __entry->srcaddr, __entry->dstaddr, __get_str(name),
+ rdma_show_ib_event(__entry->event), __entry->event
)
);
@@ -801,7 +772,7 @@ TRACE_EVENT(xprtrdma_post_recvs,
__entry->r_xprt = r_xprt;
__entry->count = count;
__entry->status = status;
- __entry->posted = r_xprt->rx_ep.rep_receive_count;
+ __entry->posted = r_xprt->rx_ep->re_receive_count;
__assign_str(addr, rpcrdma_addrstr(r_xprt));
__assign_str(port, rpcrdma_portstr(r_xprt));
),
@@ -920,17 +891,17 @@ TRACE_EVENT(xprtrdma_frwr_alloc,
TP_ARGS(mr, rc),
TP_STRUCT__entry(
- __field(const void *, mr)
+ __field(u32, mr_id)
__field(int, rc)
),
TP_fast_assign(
- __entry->mr = mr;
- __entry->rc = rc;
+ __entry->mr_id = mr->frwr.fr_mr->res.id;
+ __entry->rc = rc;
),
- TP_printk("mr=%p: rc=%d",
- __entry->mr, __entry->rc
+ TP_printk("mr.id=%u: rc=%d",
+ __entry->mr_id, __entry->rc
)
);
@@ -943,7 +914,8 @@ TRACE_EVENT(xprtrdma_frwr_dereg,
TP_ARGS(mr, rc),
TP_STRUCT__entry(
- __field(const void *, mr)
+ __field(u32, mr_id)
+ __field(int, nents)
__field(u32, handle)
__field(u32, length)
__field(u64, offset)
@@ -952,7 +924,8 @@ TRACE_EVENT(xprtrdma_frwr_dereg,
),
TP_fast_assign(
- __entry->mr = mr;
+ __entry->mr_id = mr->frwr.fr_mr->res.id;
+ __entry->nents = mr->mr_nents;
__entry->handle = mr->mr_handle;
__entry->length = mr->mr_length;
__entry->offset = mr->mr_offset;
@@ -960,8 +933,8 @@ TRACE_EVENT(xprtrdma_frwr_dereg,
__entry->rc = rc;
),
- TP_printk("mr=%p %u@0x%016llx:0x%08x (%s): rc=%d",
- __entry->mr, __entry->length,
+ TP_printk("mr.id=%u nents=%d %u@0x%016llx:0x%08x (%s): rc=%d",
+ __entry->mr_id, __entry->nents, __entry->length,
(unsigned long long)__entry->offset, __entry->handle,
xprtrdma_show_direction(__entry->dir),
__entry->rc
@@ -977,21 +950,21 @@ TRACE_EVENT(xprtrdma_frwr_sgerr,
TP_ARGS(mr, sg_nents),
TP_STRUCT__entry(
- __field(const void *, mr)
+ __field(u32, mr_id)
__field(u64, addr)
__field(u32, dir)
__field(int, nents)
),
TP_fast_assign(
- __entry->mr = mr;
+ __entry->mr_id = mr->frwr.fr_mr->res.id;
__entry->addr = mr->mr_sg->dma_address;
__entry->dir = mr->mr_dir;
__entry->nents = sg_nents;
),
- TP_printk("mr=%p dma addr=0x%llx (%s) sg_nents=%d",
- __entry->mr, __entry->addr,
+ TP_printk("mr.id=%u DMA addr=0x%llx (%s) sg_nents=%d",
+ __entry->mr_id, __entry->addr,
xprtrdma_show_direction(__entry->dir),
__entry->nents
)
@@ -1006,7 +979,7 @@ TRACE_EVENT(xprtrdma_frwr_maperr,
TP_ARGS(mr, num_mapped),
TP_STRUCT__entry(
- __field(const void *, mr)
+ __field(u32, mr_id)
__field(u64, addr)
__field(u32, dir)
__field(int, num_mapped)
@@ -1014,15 +987,15 @@ TRACE_EVENT(xprtrdma_frwr_maperr,
),
TP_fast_assign(
- __entry->mr = mr;
+ __entry->mr_id = mr->frwr.fr_mr->res.id;
__entry->addr = mr->mr_sg->dma_address;
__entry->dir = mr->mr_dir;
__entry->num_mapped = num_mapped;
__entry->nents = mr->mr_nents;
),
- TP_printk("mr=%p dma addr=0x%llx (%s) nents=%d of %d",
- __entry->mr, __entry->addr,
+ TP_printk("mr.id=%u DMA addr=0x%llx (%s) nents=%d of %d",
+ __entry->mr_id, __entry->addr,
xprtrdma_show_direction(__entry->dir),
__entry->num_mapped, __entry->nents
)
@@ -1031,7 +1004,7 @@ TRACE_EVENT(xprtrdma_frwr_maperr,
DEFINE_MR_EVENT(localinv);
DEFINE_MR_EVENT(map);
DEFINE_MR_EVENT(unmap);
-DEFINE_MR_EVENT(remoteinv);
+DEFINE_MR_EVENT(reminv);
DEFINE_MR_EVENT(recycle);
TRACE_EVENT(xprtrdma_dma_maperr,
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index a5ab2973e8dc..74bb594ccb25 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -323,7 +323,7 @@ TRACE_EVENT(mm_vmscan_writepage,
TP_fast_assign(
__entry->pfn = page_to_pfn(page);
__entry->reclaim_flags = trace_reclaim_flags(
- page_is_file_cache(page));
+ page_is_file_lru(page));
),
TP_printk("page=%p pfn=%lu flags=%s",
diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h
index 6923dc7e0298..b6a835d37826 100644
--- a/include/uapi/linux/input-event-codes.h
+++ b/include/uapi/linux/input-event-codes.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */
/*
* Input event codes
*
diff --git a/include/uapi/linux/um_timetravel.h b/include/uapi/linux/um_timetravel.h
new file mode 100644
index 000000000000..ca3238222b6d
--- /dev/null
+++ b/include/uapi/linux/um_timetravel.h
@@ -0,0 +1,128 @@
+/*
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Copyright (C) 2019 Intel Corporation
+ */
+#ifndef _UAPI_LINUX_UM_TIMETRAVEL_H
+#define _UAPI_LINUX_UM_TIMETRAVEL_H
+#include <linux/types.h>
+
+/**
+ * struct um_timetravel_msg - UM time travel message
+ *
+ * This is the basic message type, going in both directions.
+ *
+ * This is the message passed between the host (user-mode Linux instance)
+ * and the calendar (the application on the other side of the socket) in
+ * order to implement common scheduling.
+ *
+ * Whenever UML has an event it will request runtime for it from the
+ * calendar, and then wait for its turn until it can run, etc. Note
+ * that it will only ever request the single next runtime, i.e. multiple
+ * REQUEST messages override each other.
+ */
+struct um_timetravel_msg {
+ /**
+ * @op: operation value from &enum um_timetravel_ops
+ */
+ __u32 op;
+
+ /**
+ * @seq: sequence number for the message - shall be reflected in
+ * the ACK response, and should be checked while processing
+ * the response to see if it matches
+ */
+ __u32 seq;
+
+ /**
+ * @time: time in nanoseconds
+ */
+ __u64 time;
+};
+
+/**
+ * enum um_timetravel_ops - Operation codes
+ */
+enum um_timetravel_ops {
+ /**
+ * @UM_TIMETRAVEL_ACK: response (ACK) to any previous message,
+ * this usually doesn't carry any data in the 'time' field
+ * unless otherwise specified below
+ */
+ UM_TIMETRAVEL_ACK = 0,
+
+ /**
+ * @UM_TIMETRAVEL_START: initialize the connection, the time
+ * field contains an (arbitrary) ID to possibly be able
+ * to distinguish the connections.
+ */
+ UM_TIMETRAVEL_START = 1,
+
+ /**
+ * @UM_TIMETRAVEL_REQUEST: request to run at the given time
+ * (host -> calendar)
+ */
+ UM_TIMETRAVEL_REQUEST = 2,
+
+ /**
+ * @UM_TIMETRAVEL_WAIT: Indicate waiting for the previously requested
+ * runtime, new requests may be made while waiting (e.g. due to
+ * interrupts); the time field is ignored. The calendar must process
+ * this message and later send a %UM_TIMETRAVEL_RUN message when
+ * the host can run again.
+ * (host -> calendar)
+ */
+ UM_TIMETRAVEL_WAIT = 3,
+
+ /**
+ * @UM_TIMETRAVEL_GET: return the current time from the calendar in the
+ * ACK message, the time in the request message is ignored
+ * (host -> calendar)
+ */
+ UM_TIMETRAVEL_GET = 4,
+
+ /**
+ * @UM_TIMETRAVEL_UPDATE: time update to the calendar, must be sent e.g.
+ * before kicking an interrupt to another calendar
+ * (host -> calendar)
+ */
+ UM_TIMETRAVEL_UPDATE = 5,
+
+ /**
+ * @UM_TIMETRAVEL_RUN: run time request granted, current time is in
+ * the time field
+ * (calendar -> host)
+ */
+ UM_TIMETRAVEL_RUN = 6,
+
+ /**
+ * @UM_TIMETRAVEL_FREE_UNTIL: Enable free-running until the given time,
+ * this is a message from the calendar telling the host that it can
+ * freely do its own scheduling for anything before the indicated
+ * time.
+ * Note that if a calendar sends this message once, the host may
+ * assume that it will also do so in the future, if it implements
+ * wraparound semantics for the time field.
+ * (calendar -> host)
+ */
+ UM_TIMETRAVEL_FREE_UNTIL = 7,
+
+ /**
+ * @UM_TIMETRAVEL_GET_TOD: Return time of day, typically used once at
+ * boot by the virtual machines to get a synchronized time from
+ * the simulation.
+ */
+ UM_TIMETRAVEL_GET_TOD = 8,
+};
+
+#endif /* _UAPI_LINUX_UM_TIMETRAVEL_H */
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 48f1a7c2f1f0..e7e98bde221f 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -19,7 +19,8 @@
* means the userland is reading).
*/
#define UFFD_API ((__u64)0xAA)
-#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \
+#define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \
+ UFFD_FEATURE_EVENT_FORK | \
UFFD_FEATURE_EVENT_REMAP | \
UFFD_FEATURE_EVENT_REMOVE | \
UFFD_FEATURE_EVENT_UNMAP | \
@@ -34,7 +35,8 @@
#define UFFD_API_RANGE_IOCTLS \
((__u64)1 << _UFFDIO_WAKE | \
(__u64)1 << _UFFDIO_COPY | \
- (__u64)1 << _UFFDIO_ZEROPAGE)
+ (__u64)1 << _UFFDIO_ZEROPAGE | \
+ (__u64)1 << _UFFDIO_WRITEPROTECT)
#define UFFD_API_RANGE_IOCTLS_BASIC \
((__u64)1 << _UFFDIO_WAKE | \
(__u64)1 << _UFFDIO_COPY)
@@ -52,6 +54,7 @@
#define _UFFDIO_WAKE (0x02)
#define _UFFDIO_COPY (0x03)
#define _UFFDIO_ZEROPAGE (0x04)
+#define _UFFDIO_WRITEPROTECT (0x06)
#define _UFFDIO_API (0x3F)
/* userfaultfd ioctl ids */
@@ -68,6 +71,8 @@
struct uffdio_copy)
#define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \
struct uffdio_zeropage)
+#define UFFDIO_WRITEPROTECT _IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \
+ struct uffdio_writeprotect)
/* read() structure */
struct uffd_msg {
@@ -203,13 +208,14 @@ struct uffdio_copy {
__u64 dst;
__u64 src;
__u64 len;
+#define UFFDIO_COPY_MODE_DONTWAKE ((__u64)1<<0)
/*
- * There will be a wrprotection flag later that allows to map
- * pages wrprotected on the fly. And such a flag will be
- * available if the wrprotection ioctl are implemented for the
- * range according to the uffdio_register.ioctls.
+ * UFFDIO_COPY_MODE_WP will map the page write protected on
+ * the fly. UFFDIO_COPY_MODE_WP is available only if the
+ * write protected ioctl is implemented for the range
+ * according to the uffdio_register.ioctls.
*/
-#define UFFDIO_COPY_MODE_DONTWAKE ((__u64)1<<0)
+#define UFFDIO_COPY_MODE_WP ((__u64)1<<1)
__u64 mode;
/*
@@ -231,4 +237,24 @@ struct uffdio_zeropage {
__s64 zeropage;
};
+struct uffdio_writeprotect {
+ struct uffdio_range range;
+/*
+ * UFFDIO_WRITEPROTECT_MODE_WP: set the flag to write protect a range,
+ * unset the flag to undo protection of a range which was previously
+ * write protected.
+ *
+ * UFFDIO_WRITEPROTECT_MODE_DONTWAKE: set the flag to avoid waking up
+ * any wait thread after the operation succeeds.
+ *
+ * NOTE: Write protecting a region (WP=1) is unrelated to page faults,
+ * therefore DONTWAKE flag is meaningless with WP=1. Removing write
+ * protection (WP=0) in response to a page fault wakes the faulting
+ * task unless DONTWAKE is set.
+ */
+#define UFFDIO_WRITEPROTECT_MODE_WP ((__u64)1<<0)
+#define UFFDIO_WRITEPROTECT_MODE_DONTWAKE ((__u64)1<<1)
+ __u64 mode;
+};
+
#endif /* _LINUX_USERFAULTFD_H */
diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
index 40d028eed645..9fe72e4b1373 100644
--- a/include/uapi/linux/vhost.h
+++ b/include/uapi/linux/vhost.h
@@ -116,4 +116,28 @@
#define VHOST_VSOCK_SET_GUEST_CID _IOW(VHOST_VIRTIO, 0x60, __u64)
#define VHOST_VSOCK_SET_RUNNING _IOW(VHOST_VIRTIO, 0x61, int)
+/* VHOST_VDPA specific defines */
+
+/* Get the device id. The device ids follow the same definition of
+ * the device id defined in virtio-spec.
+ */
+#define VHOST_VDPA_GET_DEVICE_ID _IOR(VHOST_VIRTIO, 0x70, __u32)
+/* Get and set the status. The status bits follow the same definition
+ * of the device status defined in virtio-spec.
+ */
+#define VHOST_VDPA_GET_STATUS _IOR(VHOST_VIRTIO, 0x71, __u8)
+#define VHOST_VDPA_SET_STATUS _IOW(VHOST_VIRTIO, 0x72, __u8)
+/* Get and set the device config. The device config follows the same
+ * definition of the device config defined in virtio-spec.
+ */
+#define VHOST_VDPA_GET_CONFIG _IOR(VHOST_VIRTIO, 0x73, \
+ struct vhost_vdpa_config)
+#define VHOST_VDPA_SET_CONFIG _IOW(VHOST_VIRTIO, 0x74, \
+ struct vhost_vdpa_config)
+/* Enable/disable the ring. */
+#define VHOST_VDPA_SET_VRING_ENABLE _IOW(VHOST_VIRTIO, 0x75, \
+ struct vhost_vring_state)
+/* Get the max ring size. */
+#define VHOST_VDPA_GET_VRING_NUM _IOR(VHOST_VIRTIO, 0x76, __u16)
+
#endif
diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h
index c907290ff065..669457ce5c48 100644
--- a/include/uapi/linux/vhost_types.h
+++ b/include/uapi/linux/vhost_types.h
@@ -119,6 +119,14 @@ struct vhost_scsi_target {
unsigned short reserved;
};
+/* VHOST_VDPA specific definitions */
+
+struct vhost_vdpa_config {
+ __u32 off;
+ __u32 len;
+ __u8 buf[0];
+};
+
/* Feature bits */
/* Log all write descriptors. Can be changed while device is active. */
#define VHOST_F_LOG_ALL 26
diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
index a1966cd7b677..19974392d324 100644
--- a/include/uapi/linux/virtio_balloon.h
+++ b/include/uapi/linux/virtio_balloon.h
@@ -36,6 +36,7 @@
#define VIRTIO_BALLOON_F_DEFLATE_ON_OOM 2 /* Deflate balloon on OOM */
#define VIRTIO_BALLOON_F_FREE_PAGE_HINT 3 /* VQ to report free pages */
#define VIRTIO_BALLOON_F_PAGE_POISON 4 /* Guest is using page poisoning */
+#define VIRTIO_BALLOON_F_REPORTING 5 /* Page reporting virtqueue */
/* Size of a PFN in the balloon interface. */
#define VIRTIO_BALLOON_PFN_SHIFT 12
diff --git a/include/uapi/linux/virtio_iommu.h b/include/uapi/linux/virtio_iommu.h
index 237e36a280cb..48e3c29223b5 100644
--- a/include/uapi/linux/virtio_iommu.h
+++ b/include/uapi/linux/virtio_iommu.h
@@ -18,24 +18,24 @@
#define VIRTIO_IOMMU_F_MMIO 5
struct virtio_iommu_range_64 {
- __le64 start;
- __le64 end;
+ __u64 start;
+ __u64 end;
};
struct virtio_iommu_range_32 {
- __le32 start;
- __le32 end;
+ __u32 start;
+ __u32 end;
};
struct virtio_iommu_config {
/* Supported page sizes */
- __le64 page_size_mask;
+ __u64 page_size_mask;
/* Supported IOVA range */
struct virtio_iommu_range_64 input_range;
/* Max domain ID size */
struct virtio_iommu_range_32 domain_range;
/* Probe buffer size */
- __le32 probe_size;
+ __u32 probe_size;
};
/* Request types */
diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
index a3715a3224c1..19d23e5baa4e 100644
--- a/include/uapi/linux/virtio_net.h
+++ b/include/uapi/linux/virtio_net.h
@@ -57,6 +57,9 @@
* Steering */
#define VIRTIO_NET_F_CTRL_MAC_ADDR 23 /* Set MAC address */
+#define VIRTIO_NET_F_HASH_REPORT 57 /* Supports hash report */
+#define VIRTIO_NET_F_RSS 60 /* Supports RSS RX steering */
+#define VIRTIO_NET_F_RSC_EXT 61 /* extended coalescing info */
#define VIRTIO_NET_F_STANDBY 62 /* Act as standby for another device
* with the same MAC.
*/
@@ -69,6 +72,17 @@
#define VIRTIO_NET_S_LINK_UP 1 /* Link is up */
#define VIRTIO_NET_S_ANNOUNCE 2 /* Announcement is needed */
+/* supported/enabled hash types */
+#define VIRTIO_NET_RSS_HASH_TYPE_IPv4 (1 << 0)
+#define VIRTIO_NET_RSS_HASH_TYPE_TCPv4 (1 << 1)
+#define VIRTIO_NET_RSS_HASH_TYPE_UDPv4 (1 << 2)
+#define VIRTIO_NET_RSS_HASH_TYPE_IPv6 (1 << 3)
+#define VIRTIO_NET_RSS_HASH_TYPE_TCPv6 (1 << 4)
+#define VIRTIO_NET_RSS_HASH_TYPE_UDPv6 (1 << 5)
+#define VIRTIO_NET_RSS_HASH_TYPE_IP_EX (1 << 6)
+#define VIRTIO_NET_RSS_HASH_TYPE_TCP_EX (1 << 7)
+#define VIRTIO_NET_RSS_HASH_TYPE_UDP_EX (1 << 8)
+
struct virtio_net_config {
/* The config defining mac address (if VIRTIO_NET_F_MAC) */
__u8 mac[ETH_ALEN];
@@ -92,6 +106,12 @@ struct virtio_net_config {
* Any other value stands for unknown.
*/
__u8 duplex;
+ /* maximum size of RSS key */
+ __u8 rss_max_key_size;
+ /* maximum number of indirection table entries */
+ __le16 rss_max_indirection_table_length;
+ /* bitmask of supported VIRTIO_NET_RSS_HASH_ types */
+ __le32 supported_hash_types;
} __attribute__((packed));
/*
@@ -104,6 +124,7 @@ struct virtio_net_config {
struct virtio_net_hdr_v1 {
#define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 /* Use csum_start, csum_offset */
#define VIRTIO_NET_HDR_F_DATA_VALID 2 /* Csum is valid */
+#define VIRTIO_NET_HDR_F_RSC_INFO 4 /* rsc info in csum_ fields */
__u8 flags;
#define VIRTIO_NET_HDR_GSO_NONE 0 /* Not a GSO frame */
#define VIRTIO_NET_HDR_GSO_TCPV4 1 /* GSO frame, IPv4 TCP (TSO) */
@@ -113,11 +134,46 @@ struct virtio_net_hdr_v1 {
__u8 gso_type;
__virtio16 hdr_len; /* Ethernet + IP + tcp/udp hdrs */
__virtio16 gso_size; /* Bytes to append to hdr_len per frame */
- __virtio16 csum_start; /* Position to start checksumming from */
- __virtio16 csum_offset; /* Offset after that to place checksum */
+ union {
+ struct {
+ __virtio16 csum_start;
+ __virtio16 csum_offset;
+ };
+ /* Checksum calculation */
+ struct {
+ /* Position to start checksumming from */
+ __virtio16 start;
+ /* Offset after that to place checksum */
+ __virtio16 offset;
+ } csum;
+ /* Receive Segment Coalescing */
+ struct {
+ /* Number of coalesced segments */
+ __le16 segments;
+ /* Number of duplicated acks */
+ __le16 dup_acks;
+ } rsc;
+ };
__virtio16 num_buffers; /* Number of merged rx buffers */
};
+struct virtio_net_hdr_v1_hash {
+ struct virtio_net_hdr_v1 hdr;
+ __le32 hash_value;
+#define VIRTIO_NET_HASH_REPORT_NONE 0
+#define VIRTIO_NET_HASH_REPORT_IPv4 1
+#define VIRTIO_NET_HASH_REPORT_TCPv4 2
+#define VIRTIO_NET_HASH_REPORT_UDPv4 3
+#define VIRTIO_NET_HASH_REPORT_IPv6 4
+#define VIRTIO_NET_HASH_REPORT_TCPv6 5
+#define VIRTIO_NET_HASH_REPORT_UDPv6 6
+#define VIRTIO_NET_HASH_REPORT_IPv6_EX 7
+#define VIRTIO_NET_HASH_REPORT_TCPv6_EX 8
+#define VIRTIO_NET_HASH_REPORT_UDPv6_EX 9
+ __le16 hash_report;
+ __le16 padding;
+};
+
#ifndef VIRTIO_NET_NO_LEGACY
/* This header comes first in the scatter-gather list.
* For legacy virtio, if VIRTIO_F_ANY_LAYOUT is not negotiated, it must
@@ -228,7 +284,9 @@ struct virtio_net_ctrl_mac {
/*
* Control Receive Flow Steering
- *
+ */
+#define VIRTIO_NET_CTRL_MQ 4
+/*
* The command VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET
* enables Receive Flow Steering, specifying the number of the transmit and
* receive queues that will be used. After the command is consumed and acked by
@@ -241,12 +299,48 @@ struct virtio_net_ctrl_mq {
__virtio16 virtqueue_pairs;
};
-#define VIRTIO_NET_CTRL_MQ 4
#define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET 0
#define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN 1
#define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX 0x8000
/*
+ * The command VIRTIO_NET_CTRL_MQ_RSS_CONFIG has the same effect as
+ * VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET does and additionally configures
+ * the receive steering to use a hash calculated for incoming packet
+ * to decide on receive virtqueue to place the packet. The command
+ * also provides parameters to calculate a hash and receive virtqueue.
+ */
+struct virtio_net_rss_config {
+ __le32 hash_types;
+ __le16 indirection_table_mask;
+ __le16 unclassified_queue;
+ __le16 indirection_table[1/* + indirection_table_mask */];
+ __le16 max_tx_vq;
+ __u8 hash_key_length;
+ __u8 hash_key_data[/* hash_key_length */];
+};
+
+ #define VIRTIO_NET_CTRL_MQ_RSS_CONFIG 1
+
+/*
+ * The command VIRTIO_NET_CTRL_MQ_HASH_CONFIG requests the device
+ * to include in the virtio header of the packet the value of the
+ * calculated hash and the report type of hash. It also provides
+ * parameters for hash calculation. The command requires feature
+ * VIRTIO_NET_F_HASH_REPORT to be negotiated to extend the
+ * layout of virtio header as defined in virtio_net_hdr_v1_hash.
+ */
+struct virtio_net_hash_config {
+ __le32 hash_types;
+ /* for compatibility with virtio_net_rss_config */
+ __le16 reserved[4];
+ __u8 hash_key_length;
+ __u8 hash_key_data[/* hash_key_length */];
+};
+
+ #define VIRTIO_NET_CTRL_MQ_HASH_CONFIG 2
+
+/*
* Control network offloads
*
* Reconfigures the network offloads that Guest can handle.
diff --git a/include/xen/events.h b/include/xen/events.h
index c0e6a0598397..12b0dcb6a120 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -14,8 +14,8 @@
unsigned xen_evtchn_nr_channels(void);
-int bind_evtchn_to_irq(unsigned int evtchn);
-int bind_evtchn_to_irqhandler(unsigned int evtchn,
+int bind_evtchn_to_irq(evtchn_port_t evtchn);
+int bind_evtchn_to_irqhandler(evtchn_port_t evtchn,
irq_handler_t handler,
unsigned long irqflags, const char *devname,
void *dev_id);
@@ -31,9 +31,9 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi,
const char *devname,
void *dev_id);
int bind_interdomain_evtchn_to_irq(unsigned int remote_domain,
- unsigned int remote_port);
+ evtchn_port_t remote_port);
int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
- unsigned int remote_port,
+ evtchn_port_t remote_port,
irq_handler_t handler,
unsigned long irqflags,
const char *devname,
@@ -54,15 +54,15 @@ int xen_set_irq_priority(unsigned irq, unsigned priority);
/*
* Allow extra references to event channels exposed to userspace by evtchn
*/
-int evtchn_make_refcounted(unsigned int evtchn);
-int evtchn_get(unsigned int evtchn);
-void evtchn_put(unsigned int evtchn);
+int evtchn_make_refcounted(evtchn_port_t evtchn);
+int evtchn_get(evtchn_port_t evtchn);
+void evtchn_put(evtchn_port_t evtchn);
void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector);
-void rebind_evtchn_irq(int evtchn, int irq);
+void rebind_evtchn_irq(evtchn_port_t evtchn, int irq);
int xen_set_affinity_evtchn(struct irq_desc *desc, unsigned int tcpu);
-static inline void notify_remote_via_evtchn(int port)
+static inline void notify_remote_via_evtchn(evtchn_port_t port)
{
struct evtchn_send send = { .port = port };
(void)HYPERVISOR_event_channel_op(EVTCHNOP_send, &send);
@@ -86,9 +86,9 @@ void xen_poll_irq(int irq);
void xen_poll_irq_timeout(int irq, u64 timeout);
/* Determine the IRQ which is bound to an event channel */
-unsigned irq_from_evtchn(unsigned int evtchn);
+unsigned int irq_from_evtchn(evtchn_port_t evtchn);
int irq_from_virq(unsigned int cpu, unsigned int virq);
-unsigned int evtchn_from_irq(unsigned irq);
+evtchn_port_t evtchn_from_irq(unsigned irq);
#ifdef CONFIG_XEN_PVHVM
/* Xen HVM evtchn vector callback */
diff --git a/include/xen/interface/event_channel.h b/include/xen/interface/event_channel.h
index 45650c9a06d5..cf80e338fbb0 100644
--- a/include/xen/interface/event_channel.h
+++ b/include/xen/interface/event_channel.h
@@ -220,7 +220,7 @@ struct evtchn_expand_array {
#define EVTCHNOP_set_priority 13
struct evtchn_set_priority {
/* IN parameters. */
- uint32_t port;
+ evtchn_port_t port;
uint32_t priority;
};
diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h
index 8c0d1edc121c..5a8315e6d8a6 100644
--- a/include/xen/xenbus.h
+++ b/include/xen/xenbus.h
@@ -47,6 +47,7 @@
#include <xen/interface/grant_table.h>
#include <xen/interface/io/xenbus.h>
#include <xen/interface/io/xs_wire.h>
+#include <xen/interface/event_channel.h>
#define XENBUS_MAX_RING_GRANT_ORDER 4
#define XENBUS_MAX_RING_GRANTS (1U << XENBUS_MAX_RING_GRANT_ORDER)
@@ -212,8 +213,8 @@ int xenbus_map_ring_valloc(struct xenbus_device *dev, grant_ref_t *gnt_refs,
int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr);
-int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port);
-int xenbus_free_evtchn(struct xenbus_device *dev, int port);
+int xenbus_alloc_evtchn(struct xenbus_device *dev, evtchn_port_t *port);
+int xenbus_free_evtchn(struct xenbus_device *dev, evtchn_port_t port);
enum xenbus_state xenbus_read_driver_state(const char *path);
diff --git a/init/Kconfig b/init/Kconfig
index 1c12059e0f7e..9e22ee8fbd75 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -16,6 +16,10 @@ config GCC_VERSION
default $(shell,$(srctree)/scripts/gcc-version.sh $(CC)) if CC_IS_GCC
default 0
+config LD_VERSION
+ int
+ default $(shell,$(LD) --version | $(srctree)/scripts/ld-version.sh)
+
config CC_IS_CLANG
def_bool $(success,$(CC) --version | head -n 1 | grep -q clang)
@@ -872,7 +876,7 @@ config BLK_CGROUP
This option only enables generic Block IO controller infrastructure.
One needs to also enable actual IO controlling logic/policy. For
enabling proportional weight division of disk bandwidth in CFQ, set
- CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set
+ CONFIG_BFQ_GROUP_IOSCHED=y; for enabling throttling policy, set
CONFIG_BLK_DEV_THROTTLING=y.
See Documentation/admin-guide/cgroup-v1/blkio-controller.rst for more information.
@@ -1538,7 +1542,6 @@ config AIO
config IO_URING
bool "Enable IO uring support" if EXPERT
- select ANON_INODES
select IO_WQ
default y
help
@@ -1556,6 +1559,11 @@ config ADVISE_SYSCALLS
applications use these syscalls, you can disable this option to save
space.
+config HAVE_ARCH_USERFAULTFD_WP
+ bool
+ help
+ Arch has userfaultfd write protection support
+
config MEMBARRIER
bool "Enable membarrier() system call" if EXPERT
default y
diff --git a/init/Makefile b/init/Makefile
index 6246a06364d0..d45e967483b2 100644
--- a/init/Makefile
+++ b/init/Makefile
@@ -35,4 +35,4 @@ include/generated/compile.h: FORCE
@$($(quiet)chk_compile.h)
$(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \
"$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" \
- "$(CONFIG_PREEMPT_RT)" "$(CC) $(KBUILD_CFLAGS)"
+ "$(CONFIG_PREEMPT_RT)" "$(CC)" "$(LD)"
diff --git a/init/main.c b/init/main.c
index e488213857e2..a48617f2e5e5 100644
--- a/init/main.c
+++ b/init/main.c
@@ -913,7 +913,6 @@ asmlinkage __visible void __init start_kernel(void)
boot_init_stack_canary();
time_init();
- printk_safe_init();
perf_event_init();
profile_init();
call_function_init();
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 49a05ba3000d..dc8307bf2d74 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -239,11 +239,10 @@ static inline void msg_tree_erase(struct posix_msg_tree_node *leaf,
info->msg_tree_rightmost = rb_prev(node);
rb_erase(node, &info->msg_tree);
- if (info->node_cache) {
+ if (info->node_cache)
kfree(leaf);
- } else {
+ else
info->node_cache = leaf;
- }
}
static inline struct msg_msg *msg_get(struct mqueue_inode_info *info)
diff --git a/ipc/shm.c b/ipc/shm.c
index ce1ca9f7c6e9..0ba6add05b35 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -1332,7 +1332,7 @@ static int copy_compat_shmid_from_user(struct shmid64_ds *out, void __user *buf,
}
}
-long compat_ksys_shmctl(int shmid, int cmd, void __user *uptr, int version)
+static long compat_ksys_shmctl(int shmid, int cmd, void __user *uptr, int version)
{
struct ipc_namespace *ns;
struct shmid64_ds sem64;
diff --git a/ipc/util.c b/ipc/util.c
index fe61df53775a..7acccfded7cb 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -764,13 +764,13 @@ static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos,
total++;
}
+ *new_pos = pos + 1;
if (total >= ids->in_use)
return NULL;
for (; pos < ipc_mni; pos++) {
ipc = idr_find(&ids->ipcs_idr, pos);
if (ipc != NULL) {
- *new_pos = pos + 1;
rcu_read_lock();
ipc_lock_object(ipc);
return ipc;
@@ -885,6 +885,7 @@ static int sysvipc_proc_release(struct inode *inode, struct file *file)
}
static const struct proc_ops sysvipc_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
.proc_open = sysvipc_proc_open,
.proc_read = seq_read,
.proc_lseek = seq_lseek,
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config
index 7fa0c4ae6394..8a44b93da0f3 100644
--- a/kernel/configs/tiny.config
+++ b/kernel/configs/tiny.config
@@ -6,7 +6,6 @@ CONFIG_CC_OPTIMIZE_FOR_SIZE=y
CONFIG_KERNEL_XZ=y
# CONFIG_KERNEL_LZO is not set
# CONFIG_KERNEL_LZ4 is not set
-CONFIG_OPTIMIZE_INLINING=y
# CONFIG_SLAB is not set
# CONFIG_SLUB is not set
CONFIG_SLOB=y
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 2031ed1ad7fa..9e1777c81f55 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -137,9 +137,12 @@ static const char *const maperr2str[] = {
[MAP_ERR_CHECKED] = "dma map error checked",
};
-static const char *type2name[5] = { "single", "page",
- "scather-gather", "coherent",
- "resource" };
+static const char *type2name[] = {
+ [dma_debug_single] = "single",
+ [dma_debug_sg] = "scather-gather",
+ [dma_debug_coherent] = "coherent",
+ [dma_debug_resource] = "resource",
+};
static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE",
"DMA_FROM_DEVICE", "DMA_NONE" };
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index a8560052a915..8f4bbdaf965e 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -39,7 +39,8 @@ static inline struct page *dma_direct_to_page(struct device *dev,
u64 dma_direct_get_required_mask(struct device *dev)
{
- u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT);
+ phys_addr_t phys = (phys_addr_t)(max_pfn - 1) << PAGE_SHIFT;
+ u64 max_dma = phys_to_dma_direct(dev, phys);
return (1ULL << (fls64(max_dma) - 1)) * 2 - 1;
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 81e6d80cb219..bc9b98a9af9a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -28,6 +28,7 @@
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
+#include <linux/hugetlb.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
@@ -982,16 +983,10 @@ perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
event->shadow_ctx_time = now - t->timestamp;
}
-/*
- * Update cpuctx->cgrp so that it is set when first cgroup event is added and
- * cleared when last cgroup event is removed.
- */
static inline void
-list_update_cgroup_event(struct perf_event *event,
- struct perf_event_context *ctx, bool add)
+perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
{
struct perf_cpu_context *cpuctx;
- struct list_head *cpuctx_entry;
if (!is_cgroup_event(event))
return;
@@ -1008,28 +1003,41 @@ list_update_cgroup_event(struct perf_event *event,
* because if the first would mismatch, the second would not try again
* and we would leave cpuctx->cgrp unset.
*/
- if (add && !cpuctx->cgrp) {
+ if (ctx->is_active && !cpuctx->cgrp) {
struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
cpuctx->cgrp = cgrp;
}
- if (add && ctx->nr_cgroups++)
+ if (ctx->nr_cgroups++)
return;
- else if (!add && --ctx->nr_cgroups)
+
+ list_add(&cpuctx->cgrp_cpuctx_entry,
+ per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
+}
+
+static inline void
+perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
+{
+ struct perf_cpu_context *cpuctx;
+
+ if (!is_cgroup_event(event))
+ return;
+
+ /*
+ * Because cgroup events are always per-cpu events,
+ * @ctx == &cpuctx->ctx.
+ */
+ cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
+
+ if (--ctx->nr_cgroups)
return;
- /* no cgroup running */
- if (!add)
+ if (ctx->is_active && cpuctx->cgrp)
cpuctx->cgrp = NULL;
- cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
- if (add)
- list_add(cpuctx_entry,
- per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
- else
- list_del(cpuctx_entry);
+ list_del(&cpuctx->cgrp_cpuctx_entry);
}
#else /* !CONFIG_CGROUP_PERF */
@@ -1095,11 +1103,14 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event)
}
static inline void
-list_update_cgroup_event(struct perf_event *event,
- struct perf_event_context *ctx, bool add)
+perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
{
}
+static inline void
+perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
+{
+}
#endif
/*
@@ -1790,13 +1801,14 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
add_event_to_groups(event, ctx);
}
- list_update_cgroup_event(event, ctx, true);
-
list_add_rcu(&event->event_entry, &ctx->event_list);
ctx->nr_events++;
if (event->attr.inherit_stat)
ctx->nr_stat++;
+ if (event->state > PERF_EVENT_STATE_OFF)
+ perf_cgroup_event_enable(event, ctx);
+
ctx->generation++;
}
@@ -1975,8 +1987,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
event->attach_state &= ~PERF_ATTACH_CONTEXT;
- list_update_cgroup_event(event, ctx, false);
-
ctx->nr_events--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
@@ -1993,8 +2003,10 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
* of error state is by explicit re-enabling
* of the event
*/
- if (event->state > PERF_EVENT_STATE_OFF)
+ if (event->state > PERF_EVENT_STATE_OFF) {
+ perf_cgroup_event_disable(event, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_OFF);
+ }
ctx->generation++;
}
@@ -2225,6 +2237,7 @@ event_sched_out(struct perf_event *event,
if (READ_ONCE(event->pending_disable) >= 0) {
WRITE_ONCE(event->pending_disable, -1);
+ perf_cgroup_event_disable(event, ctx);
state = PERF_EVENT_STATE_OFF;
}
perf_event_set_state(event, state);
@@ -2362,6 +2375,7 @@ static void __perf_event_disable(struct perf_event *event,
event_sched_out(event, cpuctx, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_OFF);
+ perf_cgroup_event_disable(event, ctx);
}
/*
@@ -2745,7 +2759,7 @@ static int __perf_install_in_context(void *info)
}
#ifdef CONFIG_CGROUP_PERF
- if (is_cgroup_event(event)) {
+ if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) {
/*
* If the current cgroup doesn't match the event's
* cgroup, we should not try to schedule it.
@@ -2905,6 +2919,7 @@ static void __perf_event_enable(struct perf_event *event,
ctx_sched_out(ctx, cpuctx, EVENT_TIME);
perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
+ perf_cgroup_event_enable(event, ctx);
if (!ctx->is_active)
return;
@@ -3507,7 +3522,8 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
static bool perf_less_group_idx(const void *l, const void *r)
{
- const struct perf_event *le = l, *re = r;
+ const struct perf_event *le = *(const struct perf_event **)l;
+ const struct perf_event *re = *(const struct perf_event **)r;
return le->group_index < re->group_index;
}
@@ -3615,8 +3631,10 @@ static int merge_sched_in(struct perf_event *event, void *data)
}
if (event->state == PERF_EVENT_STATE_INACTIVE) {
- if (event->attr.pinned)
+ if (event->attr.pinned) {
+ perf_cgroup_event_disable(event, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+ }
*can_add_hw = 0;
ctx->rotate_necessary = 1;
@@ -6916,9 +6934,12 @@ static u64 perf_virt_to_phys(u64 virt)
* Try IRQ-safe __get_user_pages_fast first.
* If failed, leave phys_addr as 0.
*/
- if ((current->mm != NULL) &&
- (__get_user_pages_fast(virt, 1, 0, &p) == 1))
- phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
+ if (current->mm != NULL) {
+ pagefault_disable();
+ if (__get_user_pages_fast(virt, 1, 0, &p) == 1)
+ phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
+ pagefault_enable();
+ }
if (p)
put_page(p);
@@ -7973,7 +7994,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
flags |= MAP_EXECUTABLE;
if (vma->vm_flags & VM_LOCKED)
flags |= MAP_LOCKED;
- if (vma->vm_flags & VM_HUGETLB)
+ if (is_vm_hugetlb_page(vma))
flags |= MAP_HUGETLB;
if (file) {
diff --git a/kernel/extable.c b/kernel/extable.c
index 7681f87e89dd..b0ea5eb0c3b4 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -34,7 +34,8 @@ u32 __initdata __visible main_extable_sort_needed = 1;
/* Sort the kernel's built-in exception table */
void __init sort_main_extable(void)
{
- if (main_extable_sort_needed && __stop___ex_table > __start___ex_table) {
+ if (main_extable_sort_needed &&
+ &__stop___ex_table > &__start___ex_table) {
pr_notice("Sorting __ex_table...\n");
sort_extable(__start___ex_table, __stop___ex_table);
}
diff --git a/kernel/fork.c b/kernel/fork.c
index d2a967bf85d5..4385f3d639f2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -361,6 +361,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
if (new) {
*new = *orig;
INIT_LIST_HEAD(&new->anon_vma_chain);
+ new->vm_next = new->vm_prev = NULL;
}
return new;
}
@@ -553,14 +554,15 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
if (retval)
goto fail_nomem_anon_vma_fork;
if (tmp->vm_flags & VM_WIPEONFORK) {
- /* VM_WIPEONFORK gets a clean slate in the child. */
+ /*
+ * VM_WIPEONFORK gets a clean slate in the child.
+ * Don't prepare anon_vma until fault since we don't
+ * copy page for current vma.
+ */
tmp->anon_vma = NULL;
- if (anon_vma_prepare(tmp))
- goto fail_nomem_anon_vma_fork;
} else if (anon_vma_fork(tmp, mpnt))
goto fail_nomem_anon_vma_fork;
tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
- tmp->vm_next = tmp->vm_prev = NULL;
file = tmp->vm_file;
if (file) {
struct inode *inode = file_inode(file);
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index e5eb5ea7ea59..82babf5aa077 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -58,7 +58,7 @@ struct gcov_node {
struct dentry *dentry;
struct dentry **links;
int num_loaded;
- char name[0];
+ char name[];
};
static const char objtree[] = OBJTREE;
@@ -108,9 +108,9 @@ static void *gcov_seq_next(struct seq_file *seq, void *data, loff_t *pos)
{
struct gcov_iterator *iter = data;
+ (*pos)++;
if (gcov_iter_next(iter))
return NULL;
- (*pos)++;
return iter;
}
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
index 801ee4b0b969..acb83558e5df 100644
--- a/kernel/gcov/gcc_3_4.c
+++ b/kernel/gcov/gcc_3_4.c
@@ -38,7 +38,7 @@ static struct gcov_info *gcov_info_head;
struct gcov_fn_info {
unsigned int ident;
unsigned int checksum;
- unsigned int n_ctrs[0];
+ unsigned int n_ctrs[];
};
/**
@@ -78,7 +78,7 @@ struct gcov_info {
unsigned int n_functions;
const struct gcov_fn_info *functions;
unsigned int ctr_mask;
- struct gcov_ctr_info counts[0];
+ struct gcov_ctr_info counts[];
};
/**
@@ -352,7 +352,7 @@ struct gcov_iterator {
unsigned int count;
int num_types;
- struct type_info type_info[0];
+ struct type_info type_info[];
};
static struct gcov_fn_info *get_func(struct gcov_iterator *iter)
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index ec37563674d6..908fdf5098c3 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -68,7 +68,7 @@ struct gcov_fn_info {
unsigned int ident;
unsigned int lineno_checksum;
unsigned int cfg_checksum;
- struct gcov_ctr_info ctrs[0];
+ struct gcov_ctr_info ctrs[];
};
/**
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index a9b3f660dee7..16c8c605f4b0 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -175,7 +175,6 @@ unsigned long kallsyms_lookup_name(const char *name)
}
return module_kallsyms_lookup_name(name);
}
-EXPORT_SYMBOL_GPL(kallsyms_lookup_name);
int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
unsigned long),
@@ -194,7 +193,6 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
}
return module_kallsyms_on_each_symbol(fn, data);
}
-EXPORT_SYMBOL_GPL(kallsyms_on_each_symbol);
static unsigned long get_symbol_pos(unsigned long addr,
unsigned long *symbolsize,
diff --git a/kernel/kmod.c b/kernel/kmod.c
index bc6addd9152b..37c3c4b97b8e 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -35,7 +35,7 @@
* (u64) THREAD_SIZE * 8UL);
*
* If you need less than 50 threads would mean we're dealing with systems
- * smaller than 3200 pages. This assuems you are capable of having ~13M memory,
+ * smaller than 3200 pages. This assumes you are capable of having ~13M memory,
* and this would only be an be an upper limit, after which the OOM killer
* would take effect. Systems like these are very unlikely if modules are
* enabled.
@@ -120,7 +120,7 @@ out:
* invoke it.
*
* If module auto-loading support is disabled then this function
- * becomes a no-operation.
+ * simply returns -ENOENT.
*/
int __request_module(bool wait, const char *fmt, ...)
{
@@ -137,7 +137,7 @@ int __request_module(bool wait, const char *fmt, ...)
WARN_ON_ONCE(wait && current_is_async());
if (!modprobe_path[0])
- return 0;
+ return -ENOENT;
va_start(args, fmt);
ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 1511690e4de7..ac10db66cc63 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3952,10 +3952,36 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
return ret;
}
+static inline short task_wait_context(struct task_struct *curr)
+{
+ /*
+ * Set appropriate wait type for the context; for IRQs we have to take
+ * into account force_irqthread as that is implied by PREEMPT_RT.
+ */
+ if (curr->hardirq_context) {
+ /*
+ * Check if force_irqthreads will run us threaded.
+ */
+ if (curr->hardirq_threaded || curr->irq_config)
+ return LD_WAIT_CONFIG;
+
+ return LD_WAIT_SPIN;
+ } else if (curr->softirq_context) {
+ /*
+ * Softirqs are always threaded.
+ */
+ return LD_WAIT_CONFIG;
+ }
+
+ return LD_WAIT_MAX;
+}
+
static int
print_lock_invalid_wait_context(struct task_struct *curr,
struct held_lock *hlock)
{
+ short curr_inner;
+
if (!debug_locks_off())
return 0;
if (debug_locks_silent)
@@ -3971,6 +3997,10 @@ print_lock_invalid_wait_context(struct task_struct *curr,
print_lock(hlock);
pr_warn("other info that might help us debug this:\n");
+
+ curr_inner = task_wait_context(curr);
+ pr_warn("context-{%d:%d}\n", curr_inner, curr_inner);
+
lockdep_print_held_locks(curr);
pr_warn("stack backtrace:\n");
@@ -4017,26 +4047,7 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
}
depth++;
- /*
- * Set appropriate wait type for the context; for IRQs we have to take
- * into account force_irqthread as that is implied by PREEMPT_RT.
- */
- if (curr->hardirq_context) {
- /*
- * Check if force_irqthreads will run us threaded.
- */
- if (curr->hardirq_threaded || curr->irq_config)
- curr_inner = LD_WAIT_CONFIG;
- else
- curr_inner = LD_WAIT_SPIN;
- } else if (curr->softirq_context) {
- /*
- * Softirqs are always threaded.
- */
- curr_inner = LD_WAIT_CONFIG;
- } else {
- curr_inner = LD_WAIT_MAX;
- }
+ curr_inner = task_wait_context(curr);
for (; depth < curr->lockdep_depth; depth++) {
struct held_lock *prev = curr->held_locks + depth;
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index a008a1ba21a7..8bbafe3e5203 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -118,14 +118,15 @@ static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
unsigned int mode, int wake_flags,
void *key)
{
- struct task_struct *p = get_task_struct(wq_entry->private);
bool reader = wq_entry->flags & WQ_FLAG_CUSTOM;
struct percpu_rw_semaphore *sem = key;
+ struct task_struct *p;
/* concurrent against percpu_down_write(), can get stolen */
if (!__percpu_rwsem_trylock(sem, reader))
return 1;
+ p = get_task_struct(wq_entry->private);
list_del_init(&wq_entry->entry);
smp_store_release(&wq_entry->private, NULL);
diff --git a/kernel/module.c b/kernel/module.c
index 33569a01d6e1..646f1e2330d2 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1515,7 +1515,7 @@ struct module_sect_attr {
struct module_sect_attrs {
struct attribute_group grp;
unsigned int nsections;
- struct module_sect_attr attrs[0];
+ struct module_sect_attr attrs[];
};
static ssize_t module_sect_show(struct module_attribute *mattr,
@@ -1608,7 +1608,7 @@ static void remove_sect_attrs(struct module *mod)
struct module_notes_attrs {
struct kobject *dir;
unsigned int notes;
- struct bin_attribute attrs[0];
+ struct bin_attribute attrs[];
};
static ssize_t module_notes_read(struct file *filp, struct kobject *kobj,
@@ -4355,6 +4355,7 @@ static int modules_open(struct inode *inode, struct file *file)
}
static const struct proc_ops modules_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
.proc_open = modules_open,
.proc_read = seq_read,
.proc_lseek = seq_lseek,
diff --git a/kernel/pid.c b/kernel/pid.c
index bc21c0fb26d8..c835b844aca7 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -256,6 +256,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
get_pid_ns(ns);
refcount_set(&pid->count, 1);
+ spin_lock_init(&pid->lock);
for (type = 0; type < PIDTYPE_MAX; ++type)
INIT_HLIST_HEAD(&pid->tasks[type]);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index ef90eb1fb86e..7959449765d9 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -196,6 +196,50 @@ unlock:
return res;
}
+struct compat_resume_swap_area {
+ compat_loff_t offset;
+ u32 dev;
+} __packed;
+
+static int snapshot_set_swap_area(struct snapshot_data *data,
+ void __user *argp)
+{
+ sector_t offset;
+ dev_t swdev;
+
+ if (swsusp_swap_in_use())
+ return -EPERM;
+
+ if (in_compat_syscall()) {
+ struct compat_resume_swap_area swap_area;
+
+ if (copy_from_user(&swap_area, argp, sizeof(swap_area)))
+ return -EFAULT;
+ swdev = new_decode_dev(swap_area.dev);
+ offset = swap_area.offset;
+ } else {
+ struct resume_swap_area swap_area;
+
+ if (copy_from_user(&swap_area, argp, sizeof(swap_area)))
+ return -EFAULT;
+ swdev = new_decode_dev(swap_area.dev);
+ offset = swap_area.offset;
+ }
+
+ /*
+ * User space encodes device types as two-byte values,
+ * so we need to recode them
+ */
+ if (!swdev) {
+ data->swap = -1;
+ return -EINVAL;
+ }
+ data->swap = swap_type_of(swdev, offset, NULL);
+ if (data->swap < 0)
+ return -ENODEV;
+ return 0;
+}
+
static long snapshot_ioctl(struct file *filp, unsigned int cmd,
unsigned long arg)
{
@@ -351,34 +395,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
break;
case SNAPSHOT_SET_SWAP_AREA:
- if (swsusp_swap_in_use()) {
- error = -EPERM;
- } else {
- struct resume_swap_area swap_area;
- dev_t swdev;
-
- error = copy_from_user(&swap_area, (void __user *)arg,
- sizeof(struct resume_swap_area));
- if (error) {
- error = -EFAULT;
- break;
- }
-
- /*
- * User space encodes device types as two-byte values,
- * so we need to recode them
- */
- swdev = new_decode_dev(swap_area.dev);
- if (swdev) {
- offset = swap_area.offset;
- data->swap = swap_type_of(swdev, offset, NULL);
- if (data->swap < 0)
- error = -ENODEV;
- } else {
- data->swap = -1;
- error = -EINVAL;
- }
- }
+ error = snapshot_set_swap_area(data, (void __user *)arg);
break;
default:
@@ -393,12 +410,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
}
#ifdef CONFIG_COMPAT
-
-struct compat_resume_swap_area {
- compat_loff_t offset;
- u32 dev;
-} __packed;
-
static long
snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
@@ -409,33 +420,13 @@ snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case SNAPSHOT_AVAIL_SWAP_SIZE:
case SNAPSHOT_ALLOC_SWAP_PAGE:
case SNAPSHOT_CREATE_IMAGE:
+ case SNAPSHOT_SET_SWAP_AREA:
return snapshot_ioctl(file, cmd,
(unsigned long) compat_ptr(arg));
-
- case SNAPSHOT_SET_SWAP_AREA: {
- struct compat_resume_swap_area __user *u_swap_area =
- compat_ptr(arg);
- struct resume_swap_area swap_area;
- mm_segment_t old_fs;
- int err;
-
- err = get_user(swap_area.offset, &u_swap_area->offset);
- err |= get_user(swap_area.dev, &u_swap_area->dev);
- if (err)
- return -EFAULT;
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- err = snapshot_ioctl(file, SNAPSHOT_SET_SWAP_AREA,
- (unsigned long) &swap_area);
- set_fs(old_fs);
- return err;
- }
-
default:
return snapshot_ioctl(file, cmd, arg);
}
}
-
#endif /* CONFIG_COMPAT */
static const struct file_operations snapshot_fops = {
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index c8e6ab689d42..b2b0f526f249 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -23,6 +23,9 @@ __printf(1, 0) int vprintk_func(const char *fmt, va_list args);
void __printk_safe_enter(void);
void __printk_safe_exit(void);
+void printk_safe_init(void);
+bool printk_percpu_data_ready(void);
+
#define printk_safe_enter_irqsave(flags) \
do { \
local_irq_save(flags); \
@@ -64,4 +67,6 @@ __printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; }
#define printk_safe_enter_irq() local_irq_disable()
#define printk_safe_exit_irq() local_irq_enable()
+static inline void printk_safe_init(void) { }
+static inline bool printk_percpu_data_ready(void) { return false; }
#endif /* CONFIG_PRINTK */
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 633f41a11d75..9a9b6156270b 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -460,6 +460,18 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
static char *log_buf = __log_buf;
static u32 log_buf_len = __LOG_BUF_LEN;
+/*
+ * We cannot access per-CPU data (e.g. per-CPU flush irq_work) before
+ * per_cpu_areas are initialised. This variable is set to true when
+ * it's safe to access per-CPU data.
+ */
+static bool __printk_percpu_data_ready __read_mostly;
+
+bool printk_percpu_data_ready(void)
+{
+ return __printk_percpu_data_ready;
+}
+
/* Return log buffer address */
char *log_buf_addr_get(void)
{
@@ -1146,12 +1158,28 @@ static void __init log_buf_add_cpu(void)
static inline void log_buf_add_cpu(void) {}
#endif /* CONFIG_SMP */
+static void __init set_percpu_data_ready(void)
+{
+ printk_safe_init();
+ /* Make sure we set this flag only after printk_safe() init is done */
+ barrier();
+ __printk_percpu_data_ready = true;
+}
+
void __init setup_log_buf(int early)
{
unsigned long flags;
char *new_log_buf;
unsigned int free;
+ /*
+ * Some archs call setup_log_buf() multiple times - first is very
+ * early, e.g. from setup_arch(), and second - when percpu_areas
+ * are initialised.
+ */
+ if (!early)
+ set_percpu_data_ready();
+
if (log_buf != __log_buf)
return;
@@ -2975,6 +3003,9 @@ static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
void wake_up_klogd(void)
{
+ if (!printk_percpu_data_ready())
+ return;
+
preempt_disable();
if (waitqueue_active(&log_wait)) {
this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
@@ -2985,6 +3016,9 @@ void wake_up_klogd(void)
void defer_console_output(void)
{
+ if (!printk_percpu_data_ready())
+ return;
+
preempt_disable();
__this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index b4045e782743..d9a659a686f3 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -27,7 +27,6 @@
* There are situations when we want to make sure that all buffers
* were handled or when IRQs are blocked.
*/
-static int printk_safe_irq_ready __read_mostly;
#define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \
sizeof(atomic_t) - \
@@ -51,7 +50,7 @@ static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq);
/* Get flushed in a more safe context. */
static void queue_flush_work(struct printk_safe_seq_buf *s)
{
- if (printk_safe_irq_ready)
+ if (printk_percpu_data_ready())
irq_work_queue(&s->work);
}
@@ -402,14 +401,6 @@ void __init printk_safe_init(void)
#endif
}
- /*
- * In the highly unlikely event that a NMI were to trigger at
- * this moment. Make sure IRQ work is set up before this
- * variable is set.
- */
- barrier();
- printk_safe_irq_ready = 1;
-
/* Flush pending messages that did not have scheduled IRQ works. */
printk_safe_flush();
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a2694ba82874..3a61a3b8eaa9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2119,12 +2119,6 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
return cpu;
}
-static void update_avg(u64 *avg, u64 sample)
-{
- s64 diff = sample - *avg;
- *avg += diff >> 3;
-}
-
void sched_set_stop_task(int cpu, struct task_struct *stop)
{
struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
@@ -4126,7 +4120,8 @@ static inline void sched_submit_work(struct task_struct *tsk)
* it wants to wake up a task to maintain concurrency.
* As this function is called inside the schedule() context,
* we disable preemption to avoid it calling schedule() again
- * in the possible wakeup of a kworker.
+ * in the possible wakeup of a kworker and because wq_worker_sleeping()
+ * requires it.
*/
if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
preempt_disable();
@@ -6699,7 +6694,6 @@ void __init sched_init(void)
rq_attach_root(rq, &def_root_domain);
#ifdef CONFIG_NO_HZ_COMMON
- rq->last_load_update_tick = jiffies;
rq->last_blocked_load_update_tick = jiffies;
atomic_set(&rq->nohz_flags, 0);
#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 8331bc04aea2..a562df57a86e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -816,10 +816,12 @@ static int __init init_sched_debug_procfs(void)
__initcall(init_sched_debug_procfs);
-#define __P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
-#define P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
-#define __PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
-#define PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
+#define __PS(S, F) SEQ_printf(m, "%-45s:%21Ld\n", S, (long long)(F))
+#define __P(F) __PS(#F, F)
+#define P(F) __PS(#F, p->F)
+#define __PSN(S, F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", S, SPLIT_NS((long long)(F)))
+#define __PN(F) __PSN(#F, F)
+#define PN(F) __PSN(#F, p->F)
#ifdef CONFIG_NUMA_BALANCING
@@ -868,18 +870,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
SEQ_printf(m,
"---------------------------------------------------------"
"----------\n");
-#define __P(F) \
- SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
-#define P(F) \
- SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
-#define P_SCHEDSTAT(F) \
- SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F))
-#define __PN(F) \
- SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
-#define PN(F) \
- SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
-#define PN_SCHEDSTAT(F) \
- SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(p->F)))
+
+#define P_SCHEDSTAT(F) __PS(#F, schedstat_val(p->F))
+#define PN_SCHEDSTAT(F) __PSN(#F, schedstat_val(p->F))
PN(se.exec_start);
PN(se.vruntime);
@@ -939,10 +932,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
}
__P(nr_switches);
- SEQ_printf(m, "%-45s:%21Ld\n",
- "nr_voluntary_switches", (long long)p->nvcsw);
- SEQ_printf(m, "%-45s:%21Ld\n",
- "nr_involuntary_switches", (long long)p->nivcsw);
+ __PS("nr_voluntary_switches", p->nvcsw);
+ __PS("nr_involuntary_switches", p->nivcsw);
P(se.load.weight);
#ifdef CONFIG_SMP
@@ -956,6 +947,12 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
P(se.avg.util_est.ewma);
P(se.avg.util_est.enqueued);
#endif
+#ifdef CONFIG_UCLAMP_TASK
+ __PS("uclamp.min", p->uclamp[UCLAMP_MIN].value);
+ __PS("uclamp.max", p->uclamp[UCLAMP_MAX].value);
+ __PS("effective uclamp.min", uclamp_eff_value(p, UCLAMP_MIN));
+ __PS("effective uclamp.max", uclamp_eff_value(p, UCLAMP_MAX));
+#endif
P(policy);
P(prio);
if (task_has_dl_policy(p)) {
@@ -963,11 +960,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
P(dl.deadline);
}
#undef PN_SCHEDSTAT
-#undef PN
-#undef __PN
#undef P_SCHEDSTAT
-#undef P
-#undef __P
{
unsigned int this_cpu = raw_smp_processor_id();
@@ -975,8 +968,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
t0 = cpu_clock(this_cpu);
t1 = cpu_clock(this_cpu);
- SEQ_printf(m, "%-45s:%21Ld\n",
- "clock-delta", (long long)(t1-t0));
+ __PS("clock-delta", t1-t0);
}
sched_show_numa(p, m);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d7fb20adabeb..02f323b85b6d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2799,7 +2799,7 @@ static void task_numa_work(struct callback_head *work)
* Skip inaccessible VMAs to avoid any confusion between
* PROT_NONE and NUMA hinting ptes
*/
- if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+ if (!vma_is_accessible(vma))
continue;
do {
@@ -4836,11 +4836,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
resched_curr(rq);
}
-static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
+static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
{
struct cfs_rq *cfs_rq;
- u64 runtime;
- u64 starting_runtime = remaining;
+ u64 runtime, remaining = 1;
rcu_read_lock();
list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
@@ -4855,10 +4854,13 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
/* By the above check, this should never be true */
SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
+ raw_spin_lock(&cfs_b->lock);
runtime = -cfs_rq->runtime_remaining + 1;
- if (runtime > remaining)
- runtime = remaining;
- remaining -= runtime;
+ if (runtime > cfs_b->runtime)
+ runtime = cfs_b->runtime;
+ cfs_b->runtime -= runtime;
+ remaining = cfs_b->runtime;
+ raw_spin_unlock(&cfs_b->lock);
cfs_rq->runtime_remaining += runtime;
@@ -4873,8 +4875,6 @@ next:
break;
}
rcu_read_unlock();
-
- return starting_runtime - remaining;
}
/*
@@ -4885,7 +4885,6 @@ next:
*/
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
{
- u64 runtime;
int throttled;
/* no need to continue the timer with no bandwidth constraint */
@@ -4914,24 +4913,17 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
cfs_b->nr_throttled += overrun;
/*
- * This check is repeated as we are holding onto the new bandwidth while
- * we unthrottle. This can potentially race with an unthrottled group
- * trying to acquire new bandwidth from the global pool. This can result
- * in us over-using our runtime if it is all used during this loop, but
- * only by limited amounts in that extreme case.
+ * This check is repeated as we release cfs_b->lock while we unthrottle.
*/
while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
- runtime = cfs_b->runtime;
cfs_b->distribute_running = 1;
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
/* we can't nest cfs_b->lock while distributing bandwidth */
- runtime = distribute_cfs_runtime(cfs_b, runtime);
+ distribute_cfs_runtime(cfs_b);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
cfs_b->distribute_running = 0;
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
-
- lsub_positive(&cfs_b->runtime, runtime);
}
/*
@@ -5065,10 +5057,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
if (!runtime)
return;
- runtime = distribute_cfs_runtime(cfs_b, runtime);
+ distribute_cfs_runtime(cfs_b);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
- lsub_positive(&cfs_b->runtime, runtime);
cfs_b->distribute_running = 0;
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
}
@@ -6080,8 +6071,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
struct sched_domain *this_sd;
u64 avg_cost, avg_idle;
- u64 time, cost;
- s64 delta;
+ u64 time;
int this = smp_processor_id();
int cpu, nr = INT_MAX;
@@ -6119,9 +6109,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
}
time = cpu_clock(this) - time;
- cost = this_sd->avg_scan_cost;
- delta = (s64)(time - cost) / 8;
- this_sd->avg_scan_cost += delta;
+ update_avg(&this_sd->avg_scan_cost, time);
return cpu;
}
@@ -9048,6 +9036,14 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
sds->total_capacity;
+ /*
+ * If the local group is more loaded than the selected
+ * busiest group don't try to pull any tasks.
+ */
+ if (local->avg_load >= busiest->avg_load) {
+ env->imbalance = 0;
+ return;
+ }
}
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0f616bf7bce3..db3a57675ccf 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -195,6 +195,12 @@ static inline int task_has_dl_policy(struct task_struct *p)
#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+static inline void update_avg(u64 *avg, u64 sample)
+{
+ s64 diff = sample - *avg;
+ *avg += diff / 8;
+}
+
/*
* !! For sched_setattr_nocheck() (kernel) only !!
*
@@ -882,7 +888,6 @@ struct rq {
#endif
#ifdef CONFIG_NO_HZ_COMMON
#ifdef CONFIG_SMP
- unsigned long last_load_update_tick;
unsigned long last_blocked_load_update_tick;
unsigned int has_blocked_load;
#endif /* CONFIG_SMP */
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 3816a18c251e..891ccad5f271 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -858,7 +858,8 @@ void wq_worker_running(struct task_struct *task)
* @task: task going to sleep
*
* This function is called from schedule() when a busy worker is
- * going to sleep.
+ * going to sleep. Preemption needs to be disabled to protect ->sleeping
+ * assignment.
*/
void wq_worker_sleeping(struct task_struct *task)
{
@@ -875,7 +876,8 @@ void wq_worker_sleeping(struct task_struct *task)
pool = worker->pool;
- if (WARN_ON_ONCE(worker->sleeping))
+ /* Return if preempted before wq_worker_running() was reached */
+ if (worker->sleeping)
return;
worker->sleeping = 1;
diff --git a/lib/Kconfig b/lib/Kconfig
index bc7e56370129..5d53f9609c25 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -615,6 +615,9 @@ config ARCH_HAS_PMEM_API
config MEMREGION
bool
+config ARCH_HAS_MEMREMAP_COMPAT_ALIGN
+ bool
+
# use memcpy to implement user copies for nommu architectures
config UACCESS_MEMCPY
bool
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index d1398cef3b18..50c1f5f08e6f 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -305,18 +305,6 @@ config HEADERS_INSTALL
user-space program samples. It is also needed by some features such
as uapi header sanity checks.
-config OPTIMIZE_INLINING
- def_bool y
- help
- This option determines if the kernel forces gcc to inline the functions
- developers have marked 'inline'. Doing so takes away freedom from gcc to
- do what it thinks is best, which is desirable for the gcc 3.x series of
- compilers. The gcc 4.x series have a rewritten inlining algorithm and
- enabling this option will generate a smaller kernel there. Hopefully
- this algorithm is so good that allowing gcc 4.x and above to make the
- decision will become the default in the future. Until then this option
- is there to test gcc for this.
-
config DEBUG_SECTION_MISMATCH
bool "Enable full Section mismatch analysis"
help
@@ -988,6 +976,18 @@ config WQ_WATCHDOG
state. This can be configured through kernel parameter
"workqueue.watchdog_thresh" and its sysfs counterpart.
+config TEST_LOCKUP
+ tristate "Test module to generate lockups"
+ help
+ This builds the "test_lockup" module that helps to make sure
+ that watchdogs and lockup detectors are working properly.
+
+ Depending on module parameters it could emulate soft or hard
+ lockup, "hung task", or locking arbitrary lock for a long time.
+ Also it could generate series of lockups with cooling-down periods.
+
+ If unsure, say N.
+
endmenu # "Debug lockups and hangs"
menu "Scheduler Debugging"
@@ -1655,7 +1655,7 @@ config FAILSLAB
Provide fault-injection capability for kmalloc.
config FAIL_PAGE_ALLOC
- bool "Fault-injection capabilitiy for alloc_pages()"
+ bool "Fault-injection capability for alloc_pages()"
depends on FAULT_INJECTION
help
Provide fault-injection capability for alloc_pages().
diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan
index 0e04fcb3ab3d..48469c95d78e 100644
--- a/lib/Kconfig.ubsan
+++ b/lib/Kconfig.ubsan
@@ -2,18 +2,50 @@
config ARCH_HAS_UBSAN_SANITIZE_ALL
bool
-config UBSAN
+menuconfig UBSAN
bool "Undefined behaviour sanity checker"
help
- This option enables undefined behaviour sanity checker
+ This option enables the Undefined Behaviour sanity checker.
Compile-time instrumentation is used to detect various undefined
- behaviours in runtime. Various types of checks may be enabled
- via boot parameter ubsan_handle
- (see: Documentation/dev-tools/ubsan.rst).
+ behaviours at runtime. For more details, see:
+ Documentation/dev-tools/ubsan.rst
+
+if UBSAN
+
+config UBSAN_TRAP
+ bool "On Sanitizer warnings, abort the running kernel code"
+ depends on $(cc-option, -fsanitize-undefined-trap-on-error)
+ help
+ Building kernels with Sanitizer features enabled tends to grow
+ the kernel size by around 5%, due to adding all the debugging
+ text on failure paths. To avoid this, Sanitizer instrumentation
+ can just issue a trap. This reduces the kernel size overhead but
+ turns all warnings (including potentially harmless conditions)
+ into full exceptions that abort the running kernel code
+ (regardless of context, locks held, etc), which may destabilize
+ the system. For some system builders this is an acceptable
+ trade-off.
+
+config UBSAN_BOUNDS
+ bool "Perform array index bounds checking"
+ default UBSAN
+ help
+ This option enables detection of directly indexed out of bounds
+ array accesses, where the array size is known at compile time.
+ Note that this does not protect array overflows via bad calls
+ to the {str,mem}*cpy() family of functions (that is addressed
+ by CONFIG_FORTIFY_SOURCE).
+
+config UBSAN_MISC
+ bool "Enable all other Undefined Behavior sanity checks"
+ default UBSAN
+ help
+ This option enables all sanity checks that don't have their
+ own Kconfig options. Disable this if you only want to have
+ individually selected checks.
config UBSAN_SANITIZE_ALL
bool "Enable instrumentation for the entire kernel"
- depends on UBSAN
depends on ARCH_HAS_UBSAN_SANITIZE_ALL
# We build with -Wno-maybe-uninitilzed, but we still want to
@@ -30,7 +62,6 @@ config UBSAN_SANITIZE_ALL
config UBSAN_NO_ALIGNMENT
bool "Disable checking of pointers alignment"
- depends on UBSAN
default y if HAVE_EFFICIENT_UNALIGNED_ACCESS
help
This option disables the check of unaligned memory accesses.
@@ -43,7 +74,9 @@ config UBSAN_ALIGNMENT
config TEST_UBSAN
tristate "Module for testing for undefined behavior detection"
- depends on m && UBSAN
+ depends on m
help
This is a test module for UBSAN.
It triggers various undefined behavior, and detect it.
+
+endif # if UBSAN
diff --git a/lib/Makefile b/lib/Makefile
index 09a8acb0cf92..685aee60de1d 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -87,9 +87,11 @@ obj-$(CONFIG_TEST_KMOD) += test_kmod.o
obj-$(CONFIG_TEST_DEBUG_VIRTUAL) += test_debug_virtual.o
obj-$(CONFIG_TEST_MEMCAT_P) += test_memcat_p.o
obj-$(CONFIG_TEST_OBJAGG) += test_objagg.o
+CFLAGS_test_stackinit.o += $(call cc-disable-warning, switch-unreachable)
obj-$(CONFIG_TEST_STACKINIT) += test_stackinit.o
obj-$(CONFIG_TEST_BLACKHOLE_DEV) += test_blackhole_dev.o
obj-$(CONFIG_TEST_MEMINIT) += test_meminit.o
+obj-$(CONFIG_TEST_LOCKUP) += test_lockup.o
obj-$(CONFIG_TEST_LIVEPATCH) += livepatch/
@@ -221,6 +223,10 @@ obj-$(CONFIG_MEMREGION) += memregion.o
obj-$(CONFIG_STMP_DEVICE) += stmp_device.o
obj-$(CONFIG_IRQ_POLL) += irq_poll.o
+# stackdepot.c should not be instrumented or call instrumented functions.
+# Prevent the compiler from calling builtins like memcmp() or bcmp() from this
+# file.
+CFLAGS_stackdepot.o += -fno-builtin
obj-$(CONFIG_STACKDEPOT) += stackdepot.o
KASAN_SANITIZE_stackdepot.o := n
KCOV_INSTRUMENT_stackdepot.o := n
@@ -280,7 +286,9 @@ quiet_cmd_build_OID_registry = GEN $@
clean-files += oid_registry_data.c
obj-$(CONFIG_UCS2_STRING) += ucs2_string.o
+ifneq ($(CONFIG_UBSAN_TRAP),y)
obj-$(CONFIG_UBSAN) += ubsan.o
+endif
UBSAN_SANITIZE_ubsan.o := n
KASAN_SANITIZE_ubsan.o := n
diff --git a/lib/bch.c b/lib/bch.c
index 5db6d3a4c8a6..052d3fb753a0 100644
--- a/lib/bch.c
+++ b/lib/bch.c
@@ -102,7 +102,7 @@
*/
struct gf_poly {
unsigned int deg; /* polynomial degree */
- unsigned int c[0]; /* polynomial terms */
+ unsigned int c[]; /* polynomial terms */
};
/* given its degree, compute a polynomial size in bytes */
diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c
index aae17d9522e5..8f199f403ab5 100644
--- a/lib/dynamic_debug.c
+++ b/lib/dynamic_debug.c
@@ -1031,7 +1031,7 @@ static int __init dynamic_debug_init(void)
int n = 0, entries = 0, modct = 0;
int verbose_bytes = 0;
- if (__start___verbose == __stop___verbose) {
+ if (&__start___verbose == &__stop___verbose) {
pr_warn("_ddebug table is empty in a CONFIG_DYNAMIC_DEBUG build\n");
return 1;
}
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index bf1b4765c8f6..6d5e5000fdd7 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -34,10 +34,8 @@ const struct raid6_calls * const raid6_algos[] = {
&raid6_avx512x2,
&raid6_avx512x1,
#endif
-#ifdef CONFIG_AS_AVX2
&raid6_avx2x2,
&raid6_avx2x1,
-#endif
&raid6_sse2x2,
&raid6_sse2x1,
&raid6_sse1x2,
@@ -51,11 +49,9 @@ const struct raid6_calls * const raid6_algos[] = {
&raid6_avx512x2,
&raid6_avx512x1,
#endif
-#ifdef CONFIG_AS_AVX2
&raid6_avx2x4,
&raid6_avx2x2,
&raid6_avx2x1,
-#endif
&raid6_sse2x4,
&raid6_sse2x2,
&raid6_sse2x1,
@@ -97,13 +93,11 @@ void (*raid6_datap_recov)(int, size_t, int, void **);
EXPORT_SYMBOL_GPL(raid6_datap_recov);
const struct raid6_recov_calls *const raid6_recov_algos[] = {
+#ifdef CONFIG_X86
#ifdef CONFIG_AS_AVX512
&raid6_recov_avx512,
#endif
-#ifdef CONFIG_AS_AVX2
&raid6_recov_avx2,
-#endif
-#ifdef CONFIG_AS_SSSE3
&raid6_recov_ssse3,
#endif
#ifdef CONFIG_S390
diff --git a/lib/raid6/avx2.c b/lib/raid6/avx2.c
index 87184b6da28a..f299476e1d76 100644
--- a/lib/raid6/avx2.c
+++ b/lib/raid6/avx2.c
@@ -13,8 +13,6 @@
*
*/
-#ifdef CONFIG_AS_AVX2
-
#include <linux/raid/pq.h>
#include "x86.h"
@@ -470,5 +468,3 @@ const struct raid6_calls raid6_avx2x4 = {
1 /* Has cache hints */
};
#endif
-
-#endif /* CONFIG_AS_AVX2 */
diff --git a/lib/raid6/recov_avx2.c b/lib/raid6/recov_avx2.c
index 7a3b5e7f66ee..4e8095403ee2 100644
--- a/lib/raid6/recov_avx2.c
+++ b/lib/raid6/recov_avx2.c
@@ -4,8 +4,6 @@
* Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
*/
-#ifdef CONFIG_AS_AVX2
-
#include <linux/raid/pq.h>
#include "x86.h"
@@ -313,7 +311,3 @@ const struct raid6_recov_calls raid6_recov_avx2 = {
#endif
.priority = 2,
};
-
-#else
-#warning "your version of binutils lacks AVX2 support"
-#endif
diff --git a/lib/raid6/recov_ssse3.c b/lib/raid6/recov_ssse3.c
index 1de97d2405d0..4bfa3c6b60de 100644
--- a/lib/raid6/recov_ssse3.c
+++ b/lib/raid6/recov_ssse3.c
@@ -3,8 +3,6 @@
* Copyright (C) 2012 Intel Corporation
*/
-#ifdef CONFIG_AS_SSSE3
-
#include <linux/raid/pq.h>
#include "x86.h"
@@ -328,7 +326,3 @@ const struct raid6_recov_calls raid6_recov_ssse3 = {
#endif
.priority = 1,
};
-
-#else
-#warning "your version of binutils lacks SSSE3 support"
-#endif
diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile
index 3ab8720aa2f8..a4c7cd74cff5 100644
--- a/lib/raid6/test/Makefile
+++ b/lib/raid6/test/Makefile
@@ -34,14 +34,9 @@ endif
ifeq ($(IS_X86),yes)
OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o avx512.o recov_avx512.o
- CFLAGS += $(shell echo "pshufb %xmm0, %xmm0" | \
- gcc -c -x assembler - >&/dev/null && \
- rm ./-.o && echo -DCONFIG_AS_SSSE3=1)
- CFLAGS += $(shell echo "vpbroadcastb %xmm0, %ymm1" | \
- gcc -c -x assembler - >&/dev/null && \
- rm ./-.o && echo -DCONFIG_AS_AVX2=1)
+ CFLAGS += -DCONFIG_X86
CFLAGS += $(shell echo "vpmovm2b %k1, %zmm5" | \
- gcc -c -x assembler - >&/dev/null && \
+ gcc -c -x assembler - >/dev/null 2>&1 && \
rm ./-.o && echo -DCONFIG_AS_AVX512=1)
else ifeq ($(HAS_NEON),yes)
OBJS += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o
diff --git a/lib/rbtree.c b/lib/rbtree.c
index abc86c6a3177..8545872e61db 100644
--- a/lib/rbtree.c
+++ b/lib/rbtree.c
@@ -503,7 +503,7 @@ struct rb_node *rb_next(const struct rb_node *node)
if (node->rb_right) {
node = node->rb_right;
while (node->rb_left)
- node=node->rb_left;
+ node = node->rb_left;
return (struct rb_node *)node;
}
@@ -535,7 +535,7 @@ struct rb_node *rb_prev(const struct rb_node *node)
if (node->rb_left) {
node = node->rb_left;
while (node->rb_right)
- node=node->rb_right;
+ node = node->rb_right;
return (struct rb_node *)node;
}
diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index 5813072bc589..5d63a8857f36 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -832,7 +832,7 @@ EXPORT_SYMBOL(sg_miter_stop);
* @buflen: The number of bytes to copy
* @skip: Number of bytes to skip before copying
* @to_buffer: transfer direction (true == from an sg list to a
- * buffer, false == from a buffer to an sg list
+ * buffer, false == from a buffer to an sg list)
*
* Returns the number of copied bytes.
*
diff --git a/lib/stackdepot.c b/lib/stackdepot.c
index 81c69c08d1d1..2caffc64e4c8 100644
--- a/lib/stackdepot.c
+++ b/lib/stackdepot.c
@@ -20,6 +20,7 @@
*/
#include <linux/gfp.h>
+#include <linux/interrupt.h>
#include <linux/jhash.h>
#include <linux/kernel.h>
#include <linux/mm.h>
@@ -202,9 +203,20 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle,
unsigned long **entries)
{
union handle_parts parts = { .handle = handle };
- void *slab = stack_slabs[parts.slabindex];
+ void *slab;
size_t offset = parts.offset << STACK_ALLOC_ALIGN;
- struct stack_record *stack = slab + offset;
+ struct stack_record *stack;
+
+ *entries = NULL;
+ if (parts.slabindex > depot_index) {
+ WARN(1, "slab index %d out of bounds (%d) for stack id %08x\n",
+ parts.slabindex, depot_index, handle);
+ return 0;
+ }
+ slab = stack_slabs[parts.slabindex];
+ if (!slab)
+ return 0;
+ stack = slab + offset;
*entries = stack->entries;
return stack->size;
@@ -305,3 +317,26 @@ fast_exit:
return retval;
}
EXPORT_SYMBOL_GPL(stack_depot_save);
+
+static inline int in_irqentry_text(unsigned long ptr)
+{
+ return (ptr >= (unsigned long)&__irqentry_text_start &&
+ ptr < (unsigned long)&__irqentry_text_end) ||
+ (ptr >= (unsigned long)&__softirqentry_text_start &&
+ ptr < (unsigned long)&__softirqentry_text_end);
+}
+
+unsigned int filter_irq_stacks(unsigned long *entries,
+ unsigned int nr_entries)
+{
+ unsigned int i;
+
+ for (i = 0; i < nr_entries; i++) {
+ if (in_irqentry_text(entries[i])) {
+ /* Include the irqentry function into the stack. */
+ return i + 1;
+ }
+ }
+ return nr_entries;
+}
+EXPORT_SYMBOL_GPL(filter_irq_stacks);
diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c
index 61ed71c1daba..6b13150667f5 100644
--- a/lib/test_bitmap.c
+++ b/lib/test_bitmap.c
@@ -278,6 +278,8 @@ static void __init test_replace(void)
unsigned int nlongs = DIV_ROUND_UP(nbits, BITS_PER_LONG);
DECLARE_BITMAP(bmap, 1024);
+ BUILD_BUG_ON(EXP2_IN_BITS < nbits * 2);
+
bitmap_zero(bmap, 1024);
bitmap_replace(bmap, &exp2[0 * nlongs], &exp2[1 * nlongs], exp2_to_exp3_mask, nbits);
expect_eq_bitmap(bmap, exp3_0_1, nbits);
diff --git a/lib/test_kmod.c b/lib/test_kmod.c
index 9cf77628fc91..e651c37d56db 100644
--- a/lib/test_kmod.c
+++ b/lib/test_kmod.c
@@ -204,7 +204,7 @@ static void test_kmod_put_module(struct kmod_test_device_info *info)
case TEST_KMOD_DRIVER:
break;
case TEST_KMOD_FS_TYPE:
- if (info && info->fs_sync && info->fs_sync->owner)
+ if (info->fs_sync && info->fs_sync->owner)
module_put(info->fs_sync->owner);
break;
default:
diff --git a/lib/test_lockup.c b/lib/test_lockup.c
new file mode 100644
index 000000000000..ea09ca335b21
--- /dev/null
+++ b/lib/test_lockup.c
@@ -0,0 +1,599 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test module to generate lockups
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/clock.h>
+#include <linux/cpu.h>
+#include <linux/nmi.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+#include <linux/file.h>
+
+static unsigned int time_secs;
+module_param(time_secs, uint, 0600);
+MODULE_PARM_DESC(time_secs, "lockup time in seconds, default 0");
+
+static unsigned int time_nsecs;
+module_param(time_nsecs, uint, 0600);
+MODULE_PARM_DESC(time_nsecs, "nanoseconds part of lockup time, default 0");
+
+static unsigned int cooldown_secs;
+module_param(cooldown_secs, uint, 0600);
+MODULE_PARM_DESC(cooldown_secs, "cooldown time between iterations in seconds, default 0");
+
+static unsigned int cooldown_nsecs;
+module_param(cooldown_nsecs, uint, 0600);
+MODULE_PARM_DESC(cooldown_nsecs, "nanoseconds part of cooldown, default 0");
+
+static unsigned int iterations = 1;
+module_param(iterations, uint, 0600);
+MODULE_PARM_DESC(iterations, "lockup iterations, default 1");
+
+static bool all_cpus;
+module_param(all_cpus, bool, 0400);
+MODULE_PARM_DESC(all_cpus, "trigger lockup at all cpus at once");
+
+static int wait_state;
+static char *state = "R";
+module_param(state, charp, 0400);
+MODULE_PARM_DESC(state, "wait in 'R' running (default), 'D' uninterruptible, 'K' killable, 'S' interruptible state");
+
+static bool use_hrtimer;
+module_param(use_hrtimer, bool, 0400);
+MODULE_PARM_DESC(use_hrtimer, "use high-resolution timer for sleeping");
+
+static bool iowait;
+module_param(iowait, bool, 0400);
+MODULE_PARM_DESC(iowait, "account sleep time as iowait");
+
+static bool lock_read;
+module_param(lock_read, bool, 0400);
+MODULE_PARM_DESC(lock_read, "lock read-write locks for read");
+
+static bool lock_single;
+module_param(lock_single, bool, 0400);
+MODULE_PARM_DESC(lock_single, "acquire locks only at one cpu");
+
+static bool reacquire_locks;
+module_param(reacquire_locks, bool, 0400);
+MODULE_PARM_DESC(reacquire_locks, "release and reacquire locks/irq/preempt between iterations");
+
+static bool touch_softlockup;
+module_param(touch_softlockup, bool, 0600);
+MODULE_PARM_DESC(touch_softlockup, "touch soft-lockup watchdog between iterations");
+
+static bool touch_hardlockup;
+module_param(touch_hardlockup, bool, 0600);
+MODULE_PARM_DESC(touch_hardlockup, "touch hard-lockup watchdog between iterations");
+
+static bool call_cond_resched;
+module_param(call_cond_resched, bool, 0600);
+MODULE_PARM_DESC(call_cond_resched, "call cond_resched() between iterations");
+
+static bool measure_lock_wait;
+module_param(measure_lock_wait, bool, 0400);
+MODULE_PARM_DESC(measure_lock_wait, "measure lock wait time");
+
+static unsigned long lock_wait_threshold = ULONG_MAX;
+module_param(lock_wait_threshold, ulong, 0400);
+MODULE_PARM_DESC(lock_wait_threshold, "print lock wait time longer than this in nanoseconds, default off");
+
+static bool test_disable_irq;
+module_param_named(disable_irq, test_disable_irq, bool, 0400);
+MODULE_PARM_DESC(disable_irq, "disable interrupts: generate hard-lockups");
+
+static bool disable_softirq;
+module_param(disable_softirq, bool, 0400);
+MODULE_PARM_DESC(disable_softirq, "disable bottom-half irq handlers");
+
+static bool disable_preempt;
+module_param(disable_preempt, bool, 0400);
+MODULE_PARM_DESC(disable_preempt, "disable preemption: generate soft-lockups");
+
+static bool lock_rcu;
+module_param(lock_rcu, bool, 0400);
+MODULE_PARM_DESC(lock_rcu, "grab rcu_read_lock: generate rcu stalls");
+
+static bool lock_mmap_sem;
+module_param(lock_mmap_sem, bool, 0400);
+MODULE_PARM_DESC(lock_mmap_sem, "lock mm->mmap_sem: block procfs interfaces");
+
+static unsigned long lock_rwsem_ptr;
+module_param_unsafe(lock_rwsem_ptr, ulong, 0400);
+MODULE_PARM_DESC(lock_rwsem_ptr, "lock rw_semaphore at address");
+
+static unsigned long lock_mutex_ptr;
+module_param_unsafe(lock_mutex_ptr, ulong, 0400);
+MODULE_PARM_DESC(lock_mutex_ptr, "lock mutex at address");
+
+static unsigned long lock_spinlock_ptr;
+module_param_unsafe(lock_spinlock_ptr, ulong, 0400);
+MODULE_PARM_DESC(lock_spinlock_ptr, "lock spinlock at address");
+
+static unsigned long lock_rwlock_ptr;
+module_param_unsafe(lock_rwlock_ptr, ulong, 0400);
+MODULE_PARM_DESC(lock_rwlock_ptr, "lock rwlock at address");
+
+static unsigned int alloc_pages_nr;
+module_param_unsafe(alloc_pages_nr, uint, 0600);
+MODULE_PARM_DESC(alloc_pages_nr, "allocate and free pages under locks");
+
+static unsigned int alloc_pages_order;
+module_param(alloc_pages_order, uint, 0400);
+MODULE_PARM_DESC(alloc_pages_order, "page order to allocate");
+
+static gfp_t alloc_pages_gfp = GFP_KERNEL;
+module_param_unsafe(alloc_pages_gfp, uint, 0400);
+MODULE_PARM_DESC(alloc_pages_gfp, "allocate pages with this gfp_mask, default GFP_KERNEL");
+
+static bool alloc_pages_atomic;
+module_param(alloc_pages_atomic, bool, 0400);
+MODULE_PARM_DESC(alloc_pages_atomic, "allocate pages with GFP_ATOMIC");
+
+static bool reallocate_pages;
+module_param(reallocate_pages, bool, 0400);
+MODULE_PARM_DESC(reallocate_pages, "free and allocate pages between iterations");
+
+struct file *test_file;
+struct inode *test_inode;
+static char test_file_path[256];
+module_param_string(file_path, test_file_path, sizeof(test_file_path), 0400);
+MODULE_PARM_DESC(file_path, "file path to test");
+
+static bool test_lock_inode;
+module_param_named(lock_inode, test_lock_inode, bool, 0400);
+MODULE_PARM_DESC(lock_inode, "lock file -> inode -> i_rwsem");
+
+static bool test_lock_mapping;
+module_param_named(lock_mapping, test_lock_mapping, bool, 0400);
+MODULE_PARM_DESC(lock_mapping, "lock file -> mapping -> i_mmap_rwsem");
+
+static bool test_lock_sb_umount;
+module_param_named(lock_sb_umount, test_lock_sb_umount, bool, 0400);
+MODULE_PARM_DESC(lock_sb_umount, "lock file -> sb -> s_umount");
+
+static atomic_t alloc_pages_failed = ATOMIC_INIT(0);
+
+static atomic64_t max_lock_wait = ATOMIC64_INIT(0);
+
+static struct task_struct *main_task;
+static int master_cpu;
+
+static void test_lock(bool master, bool verbose)
+{
+ u64 uninitialized_var(wait_start);
+
+ if (measure_lock_wait)
+ wait_start = local_clock();
+
+ if (lock_mutex_ptr && master) {
+ if (verbose)
+ pr_notice("lock mutex %ps\n", (void *)lock_mutex_ptr);
+ mutex_lock((struct mutex *)lock_mutex_ptr);
+ }
+
+ if (lock_rwsem_ptr && master) {
+ if (verbose)
+ pr_notice("lock rw_semaphore %ps\n",
+ (void *)lock_rwsem_ptr);
+ if (lock_read)
+ down_read((struct rw_semaphore *)lock_rwsem_ptr);
+ else
+ down_write((struct rw_semaphore *)lock_rwsem_ptr);
+ }
+
+ if (lock_mmap_sem && master) {
+ if (verbose)
+ pr_notice("lock mmap_sem pid=%d\n", main_task->pid);
+ if (lock_read)
+ down_read(&main_task->mm->mmap_sem);
+ else
+ down_write(&main_task->mm->mmap_sem);
+ }
+
+ if (test_disable_irq)
+ local_irq_disable();
+
+ if (disable_softirq)
+ local_bh_disable();
+
+ if (disable_preempt)
+ preempt_disable();
+
+ if (lock_rcu)
+ rcu_read_lock();
+
+ if (lock_spinlock_ptr && master) {
+ if (verbose)
+ pr_notice("lock spinlock %ps\n",
+ (void *)lock_spinlock_ptr);
+ spin_lock((spinlock_t *)lock_spinlock_ptr);
+ }
+
+ if (lock_rwlock_ptr && master) {
+ if (verbose)
+ pr_notice("lock rwlock %ps\n",
+ (void *)lock_rwlock_ptr);
+ if (lock_read)
+ read_lock((rwlock_t *)lock_rwlock_ptr);
+ else
+ write_lock((rwlock_t *)lock_rwlock_ptr);
+ }
+
+ if (measure_lock_wait) {
+ s64 cur_wait = local_clock() - wait_start;
+ s64 max_wait = atomic64_read(&max_lock_wait);
+
+ do {
+ if (cur_wait < max_wait)
+ break;
+ max_wait = atomic64_cmpxchg(&max_lock_wait,
+ max_wait, cur_wait);
+ } while (max_wait != cur_wait);
+
+ if (cur_wait > lock_wait_threshold)
+ pr_notice_ratelimited("lock wait %lld ns\n", cur_wait);
+ }
+}
+
+static void test_unlock(bool master, bool verbose)
+{
+ if (lock_rwlock_ptr && master) {
+ if (lock_read)
+ read_unlock((rwlock_t *)lock_rwlock_ptr);
+ else
+ write_unlock((rwlock_t *)lock_rwlock_ptr);
+ if (verbose)
+ pr_notice("unlock rwlock %ps\n",
+ (void *)lock_rwlock_ptr);
+ }
+
+ if (lock_spinlock_ptr && master) {
+ spin_unlock((spinlock_t *)lock_spinlock_ptr);
+ if (verbose)
+ pr_notice("unlock spinlock %ps\n",
+ (void *)lock_spinlock_ptr);
+ }
+
+ if (lock_rcu)
+ rcu_read_unlock();
+
+ if (disable_preempt)
+ preempt_enable();
+
+ if (disable_softirq)
+ local_bh_enable();
+
+ if (test_disable_irq)
+ local_irq_enable();
+
+ if (lock_mmap_sem && master) {
+ if (lock_read)
+ up_read(&main_task->mm->mmap_sem);
+ else
+ up_write(&main_task->mm->mmap_sem);
+ if (verbose)
+ pr_notice("unlock mmap_sem pid=%d\n", main_task->pid);
+ }
+
+ if (lock_rwsem_ptr && master) {
+ if (lock_read)
+ up_read((struct rw_semaphore *)lock_rwsem_ptr);
+ else
+ up_write((struct rw_semaphore *)lock_rwsem_ptr);
+ if (verbose)
+ pr_notice("unlock rw_semaphore %ps\n",
+ (void *)lock_rwsem_ptr);
+ }
+
+ if (lock_mutex_ptr && master) {
+ mutex_unlock((struct mutex *)lock_mutex_ptr);
+ if (verbose)
+ pr_notice("unlock mutex %ps\n",
+ (void *)lock_mutex_ptr);
+ }
+}
+
+static void test_alloc_pages(struct list_head *pages)
+{
+ struct page *page;
+ unsigned int i;
+
+ for (i = 0; i < alloc_pages_nr; i++) {
+ page = alloc_pages(alloc_pages_gfp, alloc_pages_order);
+ if (!page) {
+ atomic_inc(&alloc_pages_failed);
+ break;
+ }
+ list_add(&page->lru, pages);
+ }
+}
+
+static void test_free_pages(struct list_head *pages)
+{
+ struct page *page, *next;
+
+ list_for_each_entry_safe(page, next, pages, lru)
+ __free_pages(page, alloc_pages_order);
+ INIT_LIST_HEAD(pages);
+}
+
+static void test_wait(unsigned int secs, unsigned int nsecs)
+{
+ if (wait_state == TASK_RUNNING) {
+ if (secs)
+ mdelay(secs * MSEC_PER_SEC);
+ if (nsecs)
+ ndelay(nsecs);
+ return;
+ }
+
+ __set_current_state(wait_state);
+ if (use_hrtimer) {
+ ktime_t time;
+
+ time = ns_to_ktime((u64)secs * NSEC_PER_SEC + nsecs);
+ schedule_hrtimeout(&time, HRTIMER_MODE_REL);
+ } else {
+ schedule_timeout(secs * HZ + nsecs_to_jiffies(nsecs));
+ }
+}
+
+static void test_lockup(bool master)
+{
+ u64 lockup_start = local_clock();
+ unsigned int iter = 0;
+ LIST_HEAD(pages);
+
+ pr_notice("Start on CPU%d\n", raw_smp_processor_id());
+
+ test_lock(master, true);
+
+ test_alloc_pages(&pages);
+
+ while (iter++ < iterations && !signal_pending(main_task)) {
+
+ if (iowait)
+ current->in_iowait = 1;
+
+ test_wait(time_secs, time_nsecs);
+
+ if (iowait)
+ current->in_iowait = 0;
+
+ if (reallocate_pages)
+ test_free_pages(&pages);
+
+ if (reacquire_locks)
+ test_unlock(master, false);
+
+ if (touch_softlockup)
+ touch_softlockup_watchdog();
+
+ if (touch_hardlockup)
+ touch_nmi_watchdog();
+
+ if (call_cond_resched)
+ cond_resched();
+
+ test_wait(cooldown_secs, cooldown_nsecs);
+
+ if (reacquire_locks)
+ test_lock(master, false);
+
+ if (reallocate_pages)
+ test_alloc_pages(&pages);
+ }
+
+ pr_notice("Finish on CPU%d in %lld ns\n", raw_smp_processor_id(),
+ local_clock() - lockup_start);
+
+ test_free_pages(&pages);
+
+ test_unlock(master, true);
+}
+
+DEFINE_PER_CPU(struct work_struct, test_works);
+
+static void test_work_fn(struct work_struct *work)
+{
+ test_lockup(!lock_single ||
+ work == per_cpu_ptr(&test_works, master_cpu));
+}
+
+static bool test_kernel_ptr(unsigned long addr, int size)
+{
+ void *ptr = (void *)addr;
+ char buf;
+
+ if (!addr)
+ return false;
+
+ /* should be at least readable kernel address */
+ if (access_ok(ptr, 1) ||
+ access_ok(ptr + size - 1, 1) ||
+ probe_kernel_address(ptr, buf) ||
+ probe_kernel_address(ptr + size - 1, buf)) {
+ pr_err("invalid kernel ptr: %#lx\n", addr);
+ return true;
+ }
+
+ return false;
+}
+
+static bool __maybe_unused test_magic(unsigned long addr, int offset,
+ unsigned int expected)
+{
+ void *ptr = (void *)addr + offset;
+ unsigned int magic = 0;
+
+ if (!addr)
+ return false;
+
+ if (probe_kernel_address(ptr, magic) || magic != expected) {
+ pr_err("invalid magic at %#lx + %#x = %#x, expected %#x\n",
+ addr, offset, magic, expected);
+ return true;
+ }
+
+ return false;
+}
+
+static int __init test_lockup_init(void)
+{
+ u64 test_start = local_clock();
+
+ main_task = current;
+
+ switch (state[0]) {
+ case 'S':
+ wait_state = TASK_INTERRUPTIBLE;
+ break;
+ case 'D':
+ wait_state = TASK_UNINTERRUPTIBLE;
+ break;
+ case 'K':
+ wait_state = TASK_KILLABLE;
+ break;
+ case 'R':
+ wait_state = TASK_RUNNING;
+ break;
+ default:
+ pr_err("unknown state=%s\n", state);
+ return -EINVAL;
+ }
+
+ if (alloc_pages_atomic)
+ alloc_pages_gfp = GFP_ATOMIC;
+
+ if (test_kernel_ptr(lock_spinlock_ptr, sizeof(spinlock_t)) ||
+ test_kernel_ptr(lock_rwlock_ptr, sizeof(rwlock_t)) ||
+ test_kernel_ptr(lock_mutex_ptr, sizeof(struct mutex)) ||
+ test_kernel_ptr(lock_rwsem_ptr, sizeof(struct rw_semaphore)))
+ return -EINVAL;
+
+#ifdef CONFIG_DEBUG_SPINLOCK
+ if (test_magic(lock_spinlock_ptr,
+ offsetof(spinlock_t, rlock.magic),
+ SPINLOCK_MAGIC) ||
+ test_magic(lock_rwlock_ptr,
+ offsetof(rwlock_t, magic),
+ RWLOCK_MAGIC) ||
+ test_magic(lock_mutex_ptr,
+ offsetof(struct mutex, wait_lock.rlock.magic),
+ SPINLOCK_MAGIC) ||
+ test_magic(lock_rwsem_ptr,
+ offsetof(struct rw_semaphore, wait_lock.magic),
+ SPINLOCK_MAGIC))
+ return -EINVAL;
+#endif
+
+ if ((wait_state != TASK_RUNNING ||
+ (call_cond_resched && !reacquire_locks) ||
+ (alloc_pages_nr && gfpflags_allow_blocking(alloc_pages_gfp))) &&
+ (test_disable_irq || disable_softirq || disable_preempt ||
+ lock_rcu || lock_spinlock_ptr || lock_rwlock_ptr)) {
+ pr_err("refuse to sleep in atomic context\n");
+ return -EINVAL;
+ }
+
+ if (lock_mmap_sem && !main_task->mm) {
+ pr_err("no mm to lock mmap_sem\n");
+ return -EINVAL;
+ }
+
+ if (test_file_path[0]) {
+ test_file = filp_open(test_file_path, O_RDONLY, 0);
+ if (IS_ERR(test_file)) {
+ pr_err("cannot find file_path\n");
+ return -EINVAL;
+ }
+ test_inode = file_inode(test_file);
+ } else if (test_lock_inode ||
+ test_lock_mapping ||
+ test_lock_sb_umount) {
+ pr_err("no file to lock\n");
+ return -EINVAL;
+ }
+
+ if (test_lock_inode && test_inode)
+ lock_rwsem_ptr = (unsigned long)&test_inode->i_rwsem;
+
+ if (test_lock_mapping && test_file && test_file->f_mapping)
+ lock_rwsem_ptr = (unsigned long)&test_file->f_mapping->i_mmap_rwsem;
+
+ if (test_lock_sb_umount && test_inode)
+ lock_rwsem_ptr = (unsigned long)&test_inode->i_sb->s_umount;
+
+ pr_notice("START pid=%d time=%u +%u ns cooldown=%u +%u ns iterations=%u state=%s %s%s%s%s%s%s%s%s%s%s%s\n",
+ main_task->pid, time_secs, time_nsecs,
+ cooldown_secs, cooldown_nsecs, iterations, state,
+ all_cpus ? "all_cpus " : "",
+ iowait ? "iowait " : "",
+ test_disable_irq ? "disable_irq " : "",
+ disable_softirq ? "disable_softirq " : "",
+ disable_preempt ? "disable_preempt " : "",
+ lock_rcu ? "lock_rcu " : "",
+ lock_read ? "lock_read " : "",
+ touch_softlockup ? "touch_softlockup " : "",
+ touch_hardlockup ? "touch_hardlockup " : "",
+ call_cond_resched ? "call_cond_resched " : "",
+ reacquire_locks ? "reacquire_locks " : "");
+
+ if (alloc_pages_nr)
+ pr_notice("ALLOCATE PAGES nr=%u order=%u gfp=%pGg %s\n",
+ alloc_pages_nr, alloc_pages_order, &alloc_pages_gfp,
+ reallocate_pages ? "reallocate_pages " : "");
+
+ if (all_cpus) {
+ unsigned int cpu;
+
+ cpus_read_lock();
+
+ preempt_disable();
+ master_cpu = smp_processor_id();
+ for_each_online_cpu(cpu) {
+ INIT_WORK(per_cpu_ptr(&test_works, cpu), test_work_fn);
+ queue_work_on(cpu, system_highpri_wq,
+ per_cpu_ptr(&test_works, cpu));
+ }
+ preempt_enable();
+
+ for_each_online_cpu(cpu)
+ flush_work(per_cpu_ptr(&test_works, cpu));
+
+ cpus_read_unlock();
+ } else {
+ test_lockup(true);
+ }
+
+ if (measure_lock_wait)
+ pr_notice("Maximum lock wait: %lld ns\n",
+ atomic64_read(&max_lock_wait));
+
+ if (alloc_pages_nr)
+ pr_notice("Page allocation failed %u times\n",
+ atomic_read(&alloc_pages_failed));
+
+ pr_notice("FINISH in %llu ns\n", local_clock() - test_start);
+
+ if (test_file)
+ fput(test_file);
+
+ if (signal_pending(main_task))
+ return -EINTR;
+
+ return -EAGAIN;
+}
+module_init(test_lockup_init);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Konstantin Khlebnikov <khlebnikov@yandex-team.ru>");
+MODULE_DESCRIPTION("Test module to generate lockups");
diff --git a/lib/test_stackinit.c b/lib/test_stackinit.c
index 2d7d257a430e..f93b1e145ada 100644
--- a/lib/test_stackinit.c
+++ b/lib/test_stackinit.c
@@ -92,8 +92,9 @@ static bool range_contains(char *haystack_start, size_t haystack_size,
* @var_type: type to be tested for zeroing initialization
* @which: is this a SCALAR, STRING, or STRUCT type?
* @init_level: what kind of initialization is performed
+ * @xfail: is this test expected to fail?
*/
-#define DEFINE_TEST_DRIVER(name, var_type, which) \
+#define DEFINE_TEST_DRIVER(name, var_type, which, xfail) \
/* Returns 0 on success, 1 on failure. */ \
static noinline __init int test_ ## name (void) \
{ \
@@ -139,13 +140,14 @@ static noinline __init int test_ ## name (void) \
for (sum = 0, i = 0; i < target_size; i++) \
sum += (check_buf[i] == 0xFF); \
\
- if (sum == 0) \
+ if (sum == 0) { \
pr_info(#name " ok\n"); \
- else \
- pr_warn(#name " FAIL (uninit bytes: %d)\n", \
- sum); \
- \
- return (sum != 0); \
+ return 0; \
+ } else { \
+ pr_warn(#name " %sFAIL (uninit bytes: %d)\n", \
+ (xfail) ? "X" : "", sum); \
+ return (xfail) ? 0 : 1; \
+ } \
}
#define DEFINE_TEST(name, var_type, which, init_level) \
/* no-op to force compiler into ignoring "uninitialized" vars */\
@@ -189,7 +191,7 @@ static noinline __init int leaf_ ## name(unsigned long sp, \
\
return (int)buf[0] | (int)buf[sizeof(buf) - 1]; \
} \
-DEFINE_TEST_DRIVER(name, var_type, which)
+DEFINE_TEST_DRIVER(name, var_type, which, 0)
/* Structure with no padding. */
struct test_packed {
@@ -326,8 +328,14 @@ static noinline __init int leaf_switch_2_none(unsigned long sp, bool fill,
return __leaf_switch_none(2, fill);
}
-DEFINE_TEST_DRIVER(switch_1_none, uint64_t, SCALAR);
-DEFINE_TEST_DRIVER(switch_2_none, uint64_t, SCALAR);
+/*
+ * These are expected to fail for most configurations because neither
+ * GCC nor Clang have a way to perform initialization of variables in
+ * non-code areas (i.e. in a switch statement before the first "case").
+ * https://bugs.llvm.org/show_bug.cgi?id=44916
+ */
+DEFINE_TEST_DRIVER(switch_1_none, uint64_t, SCALAR, 1);
+DEFINE_TEST_DRIVER(switch_2_none, uint64_t, SCALAR, 1);
static int __init test_stackinit_init(void)
{
diff --git a/lib/ts_bm.c b/lib/ts_bm.c
index b352903c50e3..277cb4417ac2 100644
--- a/lib/ts_bm.c
+++ b/lib/ts_bm.c
@@ -52,7 +52,7 @@ struct ts_bm
u8 * pattern;
unsigned int patlen;
unsigned int bad_shift[ASIZE];
- unsigned int good_shift[0];
+ unsigned int good_shift[];
};
static unsigned int bm_find(struct ts_config *conf, struct ts_state *state)
diff --git a/lib/ts_fsm.c b/lib/ts_fsm.c
index 9c873cadab7c..ab749ec10ab5 100644
--- a/lib/ts_fsm.c
+++ b/lib/ts_fsm.c
@@ -32,7 +32,7 @@
struct ts_fsm
{
unsigned int ntokens;
- struct ts_fsm_token tokens[0];
+ struct ts_fsm_token tokens[];
};
/* other values derived from ctype.h */
diff --git a/lib/ts_kmp.c b/lib/ts_kmp.c
index 94617e014b3a..c77a3d537f24 100644
--- a/lib/ts_kmp.c
+++ b/lib/ts_kmp.c
@@ -36,7 +36,7 @@ struct ts_kmp
{
u8 * pattern;
unsigned int pattern_len;
- unsigned int prefix_tbl[0];
+ unsigned int prefix_tbl[];
};
static unsigned int kmp_find(struct ts_config *conf, struct ts_state *state)
diff --git a/lib/ubsan.c b/lib/ubsan.c
index 7b9b58aee72c..f8c0ccf35f29 100644
--- a/lib/ubsan.c
+++ b/lib/ubsan.c
@@ -45,13 +45,6 @@ static bool was_reported(struct source_location *location)
return test_and_set_bit(REPORTED_BIT, &location->reported);
}
-static void print_source_location(const char *prefix,
- struct source_location *loc)
-{
- pr_err("%s %s:%d:%d\n", prefix, loc->file_name,
- loc->line & LINE_MASK, loc->column & COLUMN_MASK);
-}
-
static bool suppress_report(struct source_location *loc)
{
return current->in_ubsan || was_reported(loc);
@@ -140,13 +133,14 @@ static void val_to_string(char *str, size_t size, struct type_descriptor *type,
}
}
-static void ubsan_prologue(struct source_location *location)
+static void ubsan_prologue(struct source_location *loc, const char *reason)
{
current->in_ubsan++;
pr_err("========================================"
"========================================\n");
- print_source_location("UBSAN: Undefined behaviour in", location);
+ pr_err("UBSAN: %s in %s:%d:%d\n", reason, loc->file_name,
+ loc->line & LINE_MASK, loc->column & COLUMN_MASK);
}
static void ubsan_epilogue(void)
@@ -156,6 +150,17 @@ static void ubsan_epilogue(void)
"========================================\n");
current->in_ubsan--;
+
+ if (panic_on_warn) {
+ /*
+ * This thread may hit another WARN() in the panic path.
+ * Resetting this prevents additional WARN() from panicking the
+ * system on this thread. Other threads are blocked by the
+ * panic_mutex in panic().
+ */
+ panic_on_warn = 0;
+ panic("panic_on_warn set ...\n");
+ }
}
static void handle_overflow(struct overflow_data *data, void *lhs,
@@ -169,12 +174,12 @@ static void handle_overflow(struct overflow_data *data, void *lhs,
if (suppress_report(&data->location))
return;
- ubsan_prologue(&data->location);
+ ubsan_prologue(&data->location, type_is_signed(type) ?
+ "signed-integer-overflow" :
+ "unsigned-integer-overflow");
val_to_string(lhs_val_str, sizeof(lhs_val_str), type, lhs);
val_to_string(rhs_val_str, sizeof(rhs_val_str), type, rhs);
- pr_err("%s integer overflow:\n",
- type_is_signed(type) ? "signed" : "unsigned");
pr_err("%s %c %s cannot be represented in type %s\n",
lhs_val_str,
op,
@@ -214,7 +219,7 @@ void __ubsan_handle_negate_overflow(struct overflow_data *data,
if (suppress_report(&data->location))
return;
- ubsan_prologue(&data->location);
+ ubsan_prologue(&data->location, "negation-overflow");
val_to_string(old_val_str, sizeof(old_val_str), data->type, old_val);
@@ -234,7 +239,7 @@ void __ubsan_handle_divrem_overflow(struct overflow_data *data,
if (suppress_report(&data->location))
return;
- ubsan_prologue(&data->location);
+ ubsan_prologue(&data->location, "division-overflow");
val_to_string(rhs_val_str, sizeof(rhs_val_str), data->type, rhs);
@@ -253,7 +258,7 @@ static void handle_null_ptr_deref(struct type_mismatch_data_common *data)
if (suppress_report(data->location))
return;
- ubsan_prologue(data->location);
+ ubsan_prologue(data->location, "null-ptr-deref");
pr_err("%s null pointer of type %s\n",
type_check_kinds[data->type_check_kind],
@@ -268,7 +273,7 @@ static void handle_misaligned_access(struct type_mismatch_data_common *data,
if (suppress_report(data->location))
return;
- ubsan_prologue(data->location);
+ ubsan_prologue(data->location, "misaligned-access");
pr_err("%s misaligned address %p for type %s\n",
type_check_kinds[data->type_check_kind],
@@ -284,7 +289,7 @@ static void handle_object_size_mismatch(struct type_mismatch_data_common *data,
if (suppress_report(data->location))
return;
- ubsan_prologue(data->location);
+ ubsan_prologue(data->location, "object-size-mismatch");
pr_err("%s address %p with insufficient space\n",
type_check_kinds[data->type_check_kind],
(void *) ptr);
@@ -343,7 +348,7 @@ void __ubsan_handle_out_of_bounds(struct out_of_bounds_data *data, void *index)
if (suppress_report(&data->location))
return;
- ubsan_prologue(&data->location);
+ ubsan_prologue(&data->location, "array-index-out-of-bounds");
val_to_string(index_str, sizeof(index_str), data->index_type, index);
pr_err("index %s is out of range for type %s\n", index_str,
@@ -364,7 +369,7 @@ void __ubsan_handle_shift_out_of_bounds(struct shift_out_of_bounds_data *data,
if (suppress_report(&data->location))
goto out;
- ubsan_prologue(&data->location);
+ ubsan_prologue(&data->location, "shift-out-of-bounds");
val_to_string(rhs_str, sizeof(rhs_str), rhs_type, rhs);
val_to_string(lhs_str, sizeof(lhs_str), lhs_type, lhs);
@@ -396,7 +401,7 @@ EXPORT_SYMBOL(__ubsan_handle_shift_out_of_bounds);
void __ubsan_handle_builtin_unreachable(struct unreachable_data *data)
{
- ubsan_prologue(&data->location);
+ ubsan_prologue(&data->location, "unreachable");
pr_err("calling __builtin_unreachable()\n");
ubsan_epilogue();
panic("can't return from __builtin_unreachable()");
@@ -411,7 +416,7 @@ void __ubsan_handle_load_invalid_value(struct invalid_value_data *data,
if (suppress_report(&data->location))
return;
- ubsan_prologue(&data->location);
+ ubsan_prologue(&data->location, "invalid-load");
val_to_string(val_str, sizeof(val_str), data->type, val);
diff --git a/mm/Kconfig b/mm/Kconfig
index ab80933be65f..c1acc34c1c35 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -139,6 +139,10 @@ config HAVE_FAST_GUP
config ARCH_KEEP_MEMBLOCK
bool
+# Keep arch NUMA mapping infrastructure post-init.
+config NUMA_KEEP_MEMINFO
+ bool
+
config MEMORY_ISOLATION
bool
@@ -154,6 +158,7 @@ config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
depends on SPARSEMEM || X86_64_ACPI_NUMA
depends on ARCH_ENABLE_MEMORY_HOTPLUG
+ select NUMA_KEEP_MEMINFO if NUMA
config MEMORY_HOTPLUG_SPARSE
def_bool y
@@ -237,6 +242,17 @@ config COMPACTION
linux-mm@kvack.org.
#
+# support for free page reporting
+config PAGE_REPORTING
+ bool "Free page reporting"
+ def_bool n
+ help
+ Free page reporting allows for the incremental acquisition of
+ free pages from the buddy allocator for the purpose of reporting
+ those pages to another entity, such as a hypervisor, so that the
+ memory can be freed within the host for other uses.
+
+#
# support for page migration
#
config MIGRATION
@@ -420,10 +436,6 @@ config THP_SWAP
For selection by architectures with reasonable THP sizes.
-config TRANSPARENT_HUGE_PAGECACHE
- def_bool y
- depends on TRANSPARENT_HUGEPAGE
-
#
# UP and nommu archs use km based percpu allocator
#
@@ -526,7 +538,6 @@ config MEM_SOFT_DIRTY
config ZSWAP
bool "Compressed cache for swap pages (EXPERIMENTAL)"
depends on FRONTSWAP && CRYPTO=y
- select CRYPTO_LZO
select ZPOOL
help
A lightweight compressed cache for swap pages. It takes
@@ -542,6 +553,123 @@ config ZSWAP
they have not be fully explored on the large set of potential
configurations and workloads that exist.
+choice
+ prompt "Compressed cache for swap pages default compressor"
+ depends on ZSWAP
+ default ZSWAP_COMPRESSOR_DEFAULT_LZO
+ help
+ Selects the default compression algorithm for the compressed cache
+ for swap pages.
+
+ For an overview what kind of performance can be expected from
+ a particular compression algorithm please refer to the benchmarks
+ available at the following LWN page:
+ https://lwn.net/Articles/751795/
+
+ If in doubt, select 'LZO'.
+
+ The selection made here can be overridden by using the kernel
+ command line 'zswap.compressor=' option.
+
+config ZSWAP_COMPRESSOR_DEFAULT_DEFLATE
+ bool "Deflate"
+ select CRYPTO_DEFLATE
+ help
+ Use the Deflate algorithm as the default compression algorithm.
+
+config ZSWAP_COMPRESSOR_DEFAULT_LZO
+ bool "LZO"
+ select CRYPTO_LZO
+ help
+ Use the LZO algorithm as the default compression algorithm.
+
+config ZSWAP_COMPRESSOR_DEFAULT_842
+ bool "842"
+ select CRYPTO_842
+ help
+ Use the 842 algorithm as the default compression algorithm.
+
+config ZSWAP_COMPRESSOR_DEFAULT_LZ4
+ bool "LZ4"
+ select CRYPTO_LZ4
+ help
+ Use the LZ4 algorithm as the default compression algorithm.
+
+config ZSWAP_COMPRESSOR_DEFAULT_LZ4HC
+ bool "LZ4HC"
+ select CRYPTO_LZ4HC
+ help
+ Use the LZ4HC algorithm as the default compression algorithm.
+
+config ZSWAP_COMPRESSOR_DEFAULT_ZSTD
+ bool "zstd"
+ select CRYPTO_ZSTD
+ help
+ Use the zstd algorithm as the default compression algorithm.
+endchoice
+
+config ZSWAP_COMPRESSOR_DEFAULT
+ string
+ depends on ZSWAP
+ default "deflate" if ZSWAP_COMPRESSOR_DEFAULT_DEFLATE
+ default "lzo" if ZSWAP_COMPRESSOR_DEFAULT_LZO
+ default "842" if ZSWAP_COMPRESSOR_DEFAULT_842
+ default "lz4" if ZSWAP_COMPRESSOR_DEFAULT_LZ4
+ default "lz4hc" if ZSWAP_COMPRESSOR_DEFAULT_LZ4HC
+ default "zstd" if ZSWAP_COMPRESSOR_DEFAULT_ZSTD
+ default ""
+
+choice
+ prompt "Compressed cache for swap pages default allocator"
+ depends on ZSWAP
+ default ZSWAP_ZPOOL_DEFAULT_ZBUD
+ help
+ Selects the default allocator for the compressed cache for
+ swap pages.
+ The default is 'zbud' for compatibility, however please do
+ read the description of each of the allocators below before
+ making a right choice.
+
+ The selection made here can be overridden by using the kernel
+ command line 'zswap.zpool=' option.
+
+config ZSWAP_ZPOOL_DEFAULT_ZBUD
+ bool "zbud"
+ select ZBUD
+ help
+ Use the zbud allocator as the default allocator.
+
+config ZSWAP_ZPOOL_DEFAULT_Z3FOLD
+ bool "z3fold"
+ select Z3FOLD
+ help
+ Use the z3fold allocator as the default allocator.
+
+config ZSWAP_ZPOOL_DEFAULT_ZSMALLOC
+ bool "zsmalloc"
+ select ZSMALLOC
+ help
+ Use the zsmalloc allocator as the default allocator.
+endchoice
+
+config ZSWAP_ZPOOL_DEFAULT
+ string
+ depends on ZSWAP
+ default "zbud" if ZSWAP_ZPOOL_DEFAULT_ZBUD
+ default "z3fold" if ZSWAP_ZPOOL_DEFAULT_Z3FOLD
+ default "zsmalloc" if ZSWAP_ZPOOL_DEFAULT_ZSMALLOC
+ default ""
+
+config ZSWAP_DEFAULT_ON
+ bool "Enable the compressed cache for swap pages by default"
+ depends on ZSWAP
+ help
+ If selected, the compressed cache for swap pages will be enabled
+ at boot, otherwise it will be disabled.
+
+ The selection made here can be overridden by using the kernel
+ command line 'zswap.enabled=' option.
+
config ZPOOL
tristate "Common API for compressed memory storage"
help
@@ -714,7 +842,7 @@ config GUP_GET_PTE_LOW_HIGH
config READ_ONLY_THP_FOR_FS
bool "Read-only THP for filesystems (EXPERIMENTAL)"
- depends on TRANSPARENT_HUGE_PAGECACHE && SHMEM
+ depends on TRANSPARENT_HUGEPAGE && SHMEM
help
Allow khugepaged to put read-only file-backed pages in THP.
diff --git a/mm/Makefile b/mm/Makefile
index dbc8346d16ca..fccd3756b25f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -111,3 +111,4 @@ obj-$(CONFIG_HMM_MIRROR) += hmm.o
obj-$(CONFIG_MEMFD_CREATE) += memfd.o
obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
+obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 62f05f605fb5..c81b4f3a7268 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -491,8 +491,8 @@ static void cgwb_release_workfn(struct work_struct *work)
css_put(wb->blkcg_css);
mutex_unlock(&wb->bdi->cgwb_release_mutex);
- /* triggers blkg destruction if cgwb_refcnt becomes zero */
- blkcg_cgwb_put(blkcg);
+ /* triggers blkg destruction if no online users left */
+ blkcg_unpin_online(blkcg);
fprop_local_destroy_percpu(&wb->memcg_completions);
percpu_ref_exit(&wb->refcnt);
@@ -592,7 +592,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
list_add(&wb->memcg_node, memcg_cgwb_list);
list_add(&wb->blkcg_node, blkcg_cgwb_list);
- blkcg_cgwb_get(blkcg);
+ blkcg_pin_online(blkcg);
css_get(memcg_css);
css_get(blkcg_css);
}
diff --git a/mm/cma.c b/mm/cma.c
index be55d1988c67..0463ad2ce06b 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -220,7 +220,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
}
/**
- * cma_declare_contiguous() - reserve custom contiguous area
+ * cma_declare_contiguous_nid() - reserve custom contiguous area
* @base: Base address of the reserved area optional, use 0 for any
* @size: Size of the reserved area (in bytes),
* @limit: End address of the reserved memory (optional, 0 for any).
@@ -229,6 +229,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
* @fixed: hint about where to place the reserved area
* @name: The name of the area. See function cma_init_reserved_mem()
* @res_cma: Pointer to store the created cma region.
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
*
* This function reserves memory from early allocator. It should be
* called by arch specific code once the early allocator (memblock or bootmem)
@@ -238,10 +239,11 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
* If @fixed is true, reserve contiguous area at exactly @base. If false,
* reserve in range from @base to @limit.
*/
-int __init cma_declare_contiguous(phys_addr_t base,
+int __init cma_declare_contiguous_nid(phys_addr_t base,
phys_addr_t size, phys_addr_t limit,
phys_addr_t alignment, unsigned int order_per_bit,
- bool fixed, const char *name, struct cma **res_cma)
+ bool fixed, const char *name, struct cma **res_cma,
+ int nid)
{
phys_addr_t memblock_end = memblock_end_of_DRAM();
phys_addr_t highmem_start;
@@ -336,14 +338,14 @@ int __init cma_declare_contiguous(phys_addr_t base,
* memory in case of failure.
*/
if (base < highmem_start && limit > highmem_start) {
- addr = memblock_phys_alloc_range(size, alignment,
- highmem_start, limit);
+ addr = memblock_alloc_range_nid(size, alignment,
+ highmem_start, limit, nid, false);
limit = highmem_start;
}
if (!addr) {
- addr = memblock_phys_alloc_range(size, alignment, base,
- limit);
+ addr = memblock_alloc_range_nid(size, alignment, base,
+ limit, nid, false);
if (!addr) {
ret = -ENOMEM;
goto err;
diff --git a/mm/compaction.c b/mm/compaction.c
index df3da2f76fdc..46f0fcc93081 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -481,6 +481,7 @@ static bool test_and_set_skip(struct compact_control *cc, struct page *page,
*/
static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
struct compact_control *cc)
+ __acquires(lock)
{
/* Track if the lock is contended in async mode */
if (cc->mode == MIGRATE_ASYNC && !cc->contended) {
@@ -989,7 +990,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/* Successfully isolated */
del_page_from_lru_list(page, lruvec, page_lru(page));
mod_node_page_state(page_pgdat(page),
- NR_ISOLATED_ANON + page_is_file_cache(page),
+ NR_ISOLATED_ANON + page_is_file_lru(page),
hpage_nr_pages(page));
isolate_success:
diff --git a/mm/dmapool.c b/mm/dmapool.c
index fe5d33060415..f9fb9bbd733e 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -144,9 +144,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
else if (size < 4)
size = 4;
- if ((size % align) != 0)
- size = ALIGN(size, align);
-
+ size = ALIGN(size, align);
allocation = max_t(size_t, size, PAGE_SIZE);
if (!boundary)
diff --git a/mm/filemap.c b/mm/filemap.c
index 0fbdc8e30dd2..23a051a7ef0f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1693,6 +1693,11 @@ EXPORT_SYMBOL(pagecache_get_page);
* Any shadow entries of evicted pages, or swap entries from
* shmem/tmpfs, are included in the returned array.
*
+ * If it finds a Transparent Huge Page, head or tail, find_get_entries()
+ * stops at that page: the caller is likely to have a better way to handle
+ * the compound page as a whole, and then skip its extent, than repeatedly
+ * calling find_get_entries() to return all its tails.
+ *
* Return: the number of pages and shadow entries which were found.
*/
unsigned find_get_entries(struct address_space *mapping,
@@ -1724,8 +1729,15 @@ unsigned find_get_entries(struct address_space *mapping,
/* Has the page moved or been split? */
if (unlikely(page != xas_reload(&xas)))
goto put_page;
- page = find_subpage(page, xas.xa_index);
+ /*
+ * Terminate early on finding a THP, to allow the caller to
+ * handle it all at once; but continue if this is hugetlbfs.
+ */
+ if (PageTransHuge(page) && !PageHuge(page)) {
+ page = find_subpage(page, xas.xa_index);
+ nr_entries = ret + 1;
+ }
export:
indices[ret] = xas.xa_index;
entries[ret] = page;
diff --git a/mm/gup.c b/mm/gup.c
index da3e03185144..6076df8e04a4 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -351,7 +351,8 @@ static struct page *no_page_table(struct vm_area_struct *vma,
* But we can only make this optimization where a hole would surely
* be zero-filled if handle_mm_fault() actually did handle it.
*/
- if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault))
+ if ((flags & FOLL_DUMP) &&
+ (vma_is_anonymous(vma) || !vma->vm_ops->fault))
return ERR_PTR(-EFAULT);
return NULL;
}
@@ -1101,7 +1102,7 @@ retry:
goto retry;
case -EBUSY:
ret = 0;
- /* FALLTHRU */
+ fallthrough;
case -EFAULT:
case -ENOMEM:
case -EHWPOISON:
@@ -1325,10 +1326,12 @@ retry:
* start trying again otherwise it can loop forever.
*/
- if (fatal_signal_pending(current))
+ if (fatal_signal_pending(current)) {
+ if (!pages_done)
+ pages_done = -EINTR;
break;
+ }
- *locked = 1;
ret = down_read_killable(&mm->mmap_sem);
if (ret) {
BUG_ON(ret > 0);
@@ -1337,6 +1340,7 @@ retry:
break;
}
+ *locked = 1;
ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED,
pages, NULL, locked);
if (!*locked) {
@@ -1416,7 +1420,7 @@ long populate_vma_page_range(struct vm_area_struct *vma,
* We want mlock to succeed for regions that have any permissions
* other than PROT_NONE.
*/
- if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
+ if (vma_is_accessible(vma))
gup_flags |= FOLL_FORCE;
/*
@@ -1676,7 +1680,7 @@ check_again:
list_add_tail(&head->lru, &cma_page_list);
mod_node_page_state(page_pgdat(head),
NR_ISOLATED_ANON +
- page_is_file_cache(head),
+ page_is_file_lru(head),
hpage_nr_pages(head));
}
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0f9389f9d1f8..6ecd1045113b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -326,7 +326,7 @@ static struct attribute *hugepage_attr[] = {
&defrag_attr.attr,
&use_zero_page_attr.attr,
&hpage_pmd_size_attr.attr,
-#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
+#ifdef CONFIG_SHMEM
&shmem_enabled_attr.attr,
#endif
#ifdef CONFIG_DEBUG_VM
@@ -597,6 +597,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) {
put_page(page);
count_vm_event(THP_FAULT_FALLBACK);
+ count_vm_event(THP_FAULT_FALLBACK_CHARGE);
return VM_FAULT_FALLBACK;
}
@@ -1043,6 +1044,14 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
ret = -EAGAIN;
pmd = *src_pmd;
+ /*
+ * Make sure the _PAGE_UFFD_WP bit is cleared if the new VMA
+ * does not have the VM_UFFD_WP, which means that the uffd
+ * fork event is not enabled.
+ */
+ if (!(vma->vm_flags & VM_UFFD_WP))
+ pmd = pmd_clear_uffd_wp(pmd);
+
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
if (unlikely(is_swap_pmd(pmd))) {
swp_entry_t entry = pmd_to_swp_entry(pmd);
@@ -1446,6 +1455,7 @@ alloc:
put_page(page);
ret |= VM_FAULT_FALLBACK;
count_vm_event(THP_FAULT_FALLBACK);
+ count_vm_event(THP_FAULT_FALLBACK_CHARGE);
goto out;
}
@@ -1977,13 +1987,16 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
* - HPAGE_PMD_NR is protections changed and TLB flush necessary
*/
int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, pgprot_t newprot, int prot_numa)
+ unsigned long addr, pgprot_t newprot, unsigned long cp_flags)
{
struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
pmd_t entry;
bool preserve_write;
int ret;
+ bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+ bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
+ bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
ptl = __pmd_trans_huge_lock(pmd, vma);
if (!ptl)
@@ -2050,6 +2063,17 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
entry = pmd_modify(entry, newprot);
if (preserve_write)
entry = pmd_mk_savedwrite(entry);
+ if (uffd_wp) {
+ entry = pmd_wrprotect(entry);
+ entry = pmd_mkuffd_wp(entry);
+ } else if (uffd_wp_resolve) {
+ /*
+ * Leave the write bit to be handled by PF interrupt
+ * handler, then things like COW could be properly
+ * handled.
+ */
+ entry = pmd_clear_uffd_wp(entry);
+ }
ret = HPAGE_PMD_NR;
set_pmd_at(mm, addr, pmd, entry);
BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
@@ -2198,7 +2222,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
struct page *page;
pgtable_t pgtable;
pmd_t old_pmd, _pmd;
- bool young, write, soft_dirty, pmd_migration = false;
+ bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
unsigned long addr;
int i;
@@ -2273,6 +2297,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
write = is_write_migration_entry(entry);
young = false;
soft_dirty = pmd_swp_soft_dirty(old_pmd);
+ uffd_wp = pmd_swp_uffd_wp(old_pmd);
} else {
page = pmd_page(old_pmd);
if (pmd_dirty(old_pmd))
@@ -2280,6 +2305,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
write = pmd_write(old_pmd);
young = pmd_young(old_pmd);
soft_dirty = pmd_soft_dirty(old_pmd);
+ uffd_wp = pmd_uffd_wp(old_pmd);
}
VM_BUG_ON_PAGE(!page_count(page), page);
page_ref_add(page, HPAGE_PMD_NR - 1);
@@ -2304,6 +2330,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
entry = swp_entry_to_pte(swp_entry);
if (soft_dirty)
entry = pte_swp_mksoft_dirty(entry);
+ if (uffd_wp)
+ entry = pte_swp_mkuffd_wp(entry);
} else {
entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
entry = maybe_mkwrite(entry, vma);
@@ -2313,6 +2341,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
entry = pte_mkold(entry);
if (soft_dirty)
entry = pte_mksoft_dirty(entry);
+ if (uffd_wp)
+ entry = pte_mkuffd_wp(entry);
}
pte = pte_offset_map(&_pmd, addr);
BUG_ON(!pte_none(*pte));
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f9ea1e5197b4..cd459155d28a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -28,6 +28,7 @@
#include <linux/jhash.h>
#include <linux/numa.h>
#include <linux/llist.h>
+#include <linux/cma.h>
#include <asm/page.h>
#include <asm/pgtable.h>
@@ -44,6 +45,9 @@
int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];
+
+static struct cma *hugetlb_cma[MAX_NUMNODES];
+
/*
* Minimum page order among possible hugepage sizes, set to a proper value
* at boot time.
@@ -1228,6 +1232,14 @@ static void destroy_compound_gigantic_page(struct page *page,
static void free_gigantic_page(struct page *page, unsigned int order)
{
+ /*
+ * If the page isn't allocated using the cma allocator,
+ * cma_release() returns false.
+ */
+ if (IS_ENABLED(CONFIG_CMA) &&
+ cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order))
+ return;
+
free_contig_range(page_to_pfn(page), 1 << order);
}
@@ -1237,6 +1249,21 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
{
unsigned long nr_pages = 1UL << huge_page_order(h);
+ if (IS_ENABLED(CONFIG_CMA)) {
+ struct page *page;
+ int node;
+
+ for_each_node_mask(node, *nodemask) {
+ if (!hugetlb_cma[node])
+ continue;
+
+ page = cma_alloc(hugetlb_cma[node], nr_pages,
+ huge_page_order(h), true);
+ if (page)
+ return page;
+ }
+ }
+
return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
}
@@ -1281,8 +1308,14 @@ static void update_and_free_page(struct hstate *h, struct page *page)
set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
set_page_refcounted(page);
if (hstate_is_gigantic(h)) {
+ /*
+ * Temporarily drop the hugetlb_lock, because
+ * we might block in free_gigantic_page().
+ */
+ spin_unlock(&hugetlb_lock);
destroy_compound_gigantic_page(page, huge_page_order(h));
free_gigantic_page(page, huge_page_order(h));
+ spin_lock(&hugetlb_lock);
} else {
__free_pages(page, huge_page_order(h));
}
@@ -2010,6 +2043,7 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
* of size 'delta'.
*/
static int gather_surplus_pages(struct hstate *h, int delta)
+ __must_hold(&hugetlb_lock)
{
struct list_head surplus_list;
struct page *page, *tmp;
@@ -2538,6 +2572,10 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
for (i = 0; i < h->max_huge_pages; ++i) {
if (hstate_is_gigantic(h)) {
+ if (IS_ENABLED(CONFIG_CMA) && hugetlb_cma[0]) {
+ pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
+ break;
+ }
if (!alloc_bootmem_huge_page(h))
break;
} else if (!alloc_pool_huge_page(h,
@@ -3193,6 +3231,7 @@ static int __init hugetlb_init(void)
default_hstate.max_huge_pages = default_hstate_max_huge_pages;
}
+ hugetlb_cma_check();
hugetlb_init_hstates();
gather_bootmem_prealloc();
report_hugepages();
@@ -5505,3 +5544,74 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
spin_unlock(&hugetlb_lock);
}
}
+
+#ifdef CONFIG_CMA
+static unsigned long hugetlb_cma_size __initdata;
+static bool cma_reserve_called __initdata;
+
+static int __init cmdline_parse_hugetlb_cma(char *p)
+{
+ hugetlb_cma_size = memparse(p, &p);
+ return 0;
+}
+
+early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);
+
+void __init hugetlb_cma_reserve(int order)
+{
+ unsigned long size, reserved, per_node;
+ int nid;
+
+ cma_reserve_called = true;
+
+ if (!hugetlb_cma_size)
+ return;
+
+ if (hugetlb_cma_size < (PAGE_SIZE << order)) {
+ pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
+ (PAGE_SIZE << order) / SZ_1M);
+ return;
+ }
+
+ /*
+ * If 3 GB area is requested on a machine with 4 numa nodes,
+ * let's allocate 1 GB on first three nodes and ignore the last one.
+ */
+ per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
+ pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
+ hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
+
+ reserved = 0;
+ for_each_node_state(nid, N_ONLINE) {
+ int res;
+
+ size = min(per_node, hugetlb_cma_size - reserved);
+ size = round_up(size, PAGE_SIZE << order);
+
+ res = cma_declare_contiguous_nid(0, size, 0, PAGE_SIZE << order,
+ 0, false, "hugetlb",
+ &hugetlb_cma[nid], nid);
+ if (res) {
+ pr_warn("hugetlb_cma: reservation failed: err %d, node %d",
+ res, nid);
+ continue;
+ }
+
+ reserved += size;
+ pr_info("hugetlb_cma: reserved %lu MiB on node %d\n",
+ size / SZ_1M, nid);
+
+ if (reserved >= hugetlb_cma_size)
+ break;
+ }
+}
+
+void __init hugetlb_cma_check(void)
+{
+ if (!hugetlb_cma_size || cma_reserve_called)
+ return;
+
+ pr_warn("hugetlb_cma: the option isn't supported by current arch\n");
+}
+
+#endif /* CONFIG_CMA */
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index c2d7ae6cabd1..aabf65d4d91b 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -467,14 +467,14 @@ static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v)
switch (MEMFILE_ATTR(cft->private)) {
case RES_RSVD_USAGE:
counter = &h_cg->rsvd_hugepage[idx];
- /* Fall through. */
+ fallthrough;
case RES_USAGE:
val = (u64)page_counter_read(counter);
seq_printf(seq, "%llu\n", val * PAGE_SIZE);
break;
case RES_RSVD_LIMIT:
counter = &h_cg->rsvd_hugepage[idx];
- /* Fall through. */
+ fallthrough;
case RES_LIMIT:
val = (u64)counter->max;
if (val == limit)
@@ -514,7 +514,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
switch (MEMFILE_ATTR(of_cft(of)->private)) {
case RES_RSVD_LIMIT:
rsvd = true;
- /* Fall through. */
+ fallthrough;
case RES_LIMIT:
mutex_lock(&hugetlb_limit_mutex);
ret = page_counter_set_max(
diff --git a/mm/internal.h b/mm/internal.h
index 2d58ae15a958..b5634e78f01d 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -180,6 +180,8 @@ static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
}
extern int __isolate_free_page(struct page *page, unsigned int order);
+extern void __putback_isolated_page(struct page *page, unsigned int order,
+ int mt);
extern void memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order);
extern void __free_pages_core(struct page *page, unsigned int order);
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index e61b4a492218..2906358e42f0 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -15,7 +15,6 @@
*/
#include <linux/export.h>
-#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/kasan.h>
#include <linux/kernel.h>
@@ -42,28 +41,6 @@
#include "kasan.h"
#include "../slab.h"
-static inline int in_irqentry_text(unsigned long ptr)
-{
- return (ptr >= (unsigned long)&__irqentry_text_start &&
- ptr < (unsigned long)&__irqentry_text_end) ||
- (ptr >= (unsigned long)&__softirqentry_text_start &&
- ptr < (unsigned long)&__softirqentry_text_end);
-}
-
-static inline unsigned int filter_irq_stacks(unsigned long *entries,
- unsigned int nr_entries)
-{
- unsigned int i;
-
- for (i = 0; i < nr_entries; i++) {
- if (in_irqentry_text(entries[i])) {
- /* Include the irqentry function into the stack. */
- return i + 1;
- }
- }
- return nr_entries;
-}
-
static inline depot_stack_handle_t save_stack(gfp_t flags)
{
unsigned long entries[KASAN_STACK_DEPTH];
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index cf5c17d5e361..80f23c9da6b0 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -92,8 +92,16 @@ static void end_report(unsigned long *flags)
pr_err("==================================================================\n");
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
spin_unlock_irqrestore(&report_lock, *flags);
- if (panic_on_warn)
+ if (panic_on_warn) {
+ /*
+ * This thread may hit another WARN() in the panic path.
+ * Resetting this prevents additional WARN() from panicking the
+ * system on this thread. Other threads are blocked by the
+ * panic_mutex in panic().
+ */
+ panic_on_warn = 0;
panic("panic_on_warn set ...\n");
+ }
kasan_enable_current();
}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index c659c68728bc..99d77ffb79c2 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -29,6 +29,7 @@ enum scan_result {
SCAN_PMD_NULL,
SCAN_EXCEED_NONE_PTE,
SCAN_PTE_NON_PRESENT,
+ SCAN_PTE_UFFD_WP,
SCAN_PAGE_RO,
SCAN_LACK_REFERENCED_PAGE,
SCAN_PAGE_NULL,
@@ -414,8 +415,6 @@ static bool hugepage_vma_check(struct vm_area_struct *vma,
(IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
vma->vm_file &&
(vm_flags & VM_DENYWRITE))) {
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
- return false;
return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
HPAGE_PMD_NR);
}
@@ -513,7 +512,7 @@ void __khugepaged_exit(struct mm_struct *mm)
static void release_pte_page(struct page *page)
{
- dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
+ dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_lru(page));
unlock_page(page);
putback_lru_page(page);
}
@@ -613,7 +612,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
goto out;
}
inc_node_page_state(page,
- NR_ISOLATED_ANON + page_is_file_cache(page));
+ NR_ISOLATED_ANON + page_is_file_lru(page));
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -1139,6 +1138,15 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
pte_t pteval = *_pte;
if (is_swap_pte(pteval)) {
if (++unmapped <= khugepaged_max_ptes_swap) {
+ /*
+ * Always be strict with uffd-wp
+ * enabled swap entries. Please see
+ * comment below for pte_uffd_wp().
+ */
+ if (pte_swp_uffd_wp(pteval)) {
+ result = SCAN_PTE_UFFD_WP;
+ goto out_unmap;
+ }
continue;
} else {
result = SCAN_EXCEED_SWAP_PTE;
@@ -1158,6 +1166,19 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
result = SCAN_PTE_NON_PRESENT;
goto out_unmap;
}
+ if (pte_uffd_wp(pteval)) {
+ /*
+ * Don't collapse the page if any of the small
+ * PTEs are armed with uffd write protection.
+ * Here we can also mark the new huge pmd as
+ * write protected if any of the small ones is
+ * marked but that could bring uknown
+ * userfault messages that falls outside of
+ * the registered range. So, just be simple.
+ */
+ result = SCAN_PTE_UFFD_WP;
+ goto out_unmap;
+ }
if (pte_write(pteval))
writable = true;
@@ -1258,7 +1279,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
}
}
-#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
+#ifdef CONFIG_SHMEM
/*
* Notify khugepaged that given addr of the mm is pte-mapped THP. Then
* khugepaged should try to collapse the page table.
@@ -1973,6 +1994,8 @@ skip:
if (khugepaged_scan.address < hstart)
khugepaged_scan.address = hstart;
VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
+ if (shmem_file(vma->vm_file) && !shmem_huge_enabled(vma))
+ goto skip;
while (khugepaged_scan.address < hend) {
int ret;
@@ -1984,14 +2007,10 @@ skip:
khugepaged_scan.address + HPAGE_PMD_SIZE >
hend);
if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
- struct file *file;
+ struct file *file = get_file(vma->vm_file);
pgoff_t pgoff = linear_page_index(vma,
khugepaged_scan.address);
- if (shmem_file(vma->vm_file)
- && !shmem_huge_enabled(vma))
- goto skip;
- file = get_file(vma->vm_file);
up_read(&mm->mmap_sem);
ret = 1;
khugepaged_scan_file(mm, file, pgoff, hpage);
diff --git a/mm/ksm.c b/mm/ksm.c
index d17c7d57d0d8..a558da9e7177 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -455,7 +455,7 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
/*
* We use break_ksm to break COW on a ksm page: it's a stripped down
*
- * if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1)
+ * if (get_user_pages(addr, 1, FOLL_WRITE, &page, NULL) == 1)
* put_page(page);
*
* but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
@@ -2813,8 +2813,7 @@ static int ksm_memory_callback(struct notifier_block *self,
*/
ksm_check_stable_tree(mn->start_pfn,
mn->start_pfn + mn->nr_pages);
- /* fallthrough */
-
+ fallthrough;
case MEM_CANCEL_OFFLINE:
mutex_lock(&ksm_thread_mutex);
ksm_run &= ~KSM_RUN_OFFLINE;
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 8de5e3784ee4..4d5294c39bba 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -223,7 +223,7 @@ restart:
switch (ret) {
case LRU_REMOVED_RETRY:
assert_spin_locked(&nlru->lock);
- /* fall through */
+ fallthrough;
case LRU_REMOVED:
isolated++;
nlru->nr_items--;
diff --git a/mm/memblock.c b/mm/memblock.c
index 4d06bbaded0f..c79ba6f9920c 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1349,7 +1349,7 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
* Return:
* Physical address of allocated memory block on success, %0 on failure.
*/
-static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
+phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
phys_addr_t align, phys_addr_t start,
phys_addr_t end, int nid,
bool exact_nid)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ca194864d802..5beea03dd58a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2254,7 +2254,8 @@ static void reclaim_high(struct mem_cgroup *memcg,
continue;
memcg_memory_event(memcg, MEMCG_HIGH);
try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
- } while ((memcg = parent_mem_cgroup(memcg)));
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
+ !mem_cgroup_is_root(memcg));
}
static void high_work_func(struct work_struct *work)
@@ -2335,6 +2336,9 @@ static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
usage = page_counter_read(&memcg->memory);
high = READ_ONCE(memcg->high);
+ if (usage <= high)
+ continue;
+
/*
* Prevent division by 0 in overage calculation by acting as if
* it was a threshold of 1 page
@@ -5812,7 +5816,7 @@ retry:
switch (get_mctgt_type(vma, addr, ptent, &target)) {
case MC_TARGET_DEVICE:
device = true;
- /* fall through */
+ fallthrough;
case MC_TARGET_PAGE:
page = target.page;
/*
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 1c961cd26c0b..a96364be8ab4 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1810,7 +1810,7 @@ static int __soft_offline_page(struct page *page, int flags)
*/
if (!__PageMovable(page))
inc_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
+ page_is_file_lru(page));
list_add(&page->lru, &pagelist);
ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
MIGRATE_SYNC, MR_MEMORY_FAILURE);
diff --git a/mm/memory.c b/mm/memory.c
index 586271f3efc6..f703fe8c8346 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -733,6 +733,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte = swp_entry_to_pte(entry);
if (pte_swp_soft_dirty(*src_pte))
pte = pte_swp_mksoft_dirty(pte);
+ if (pte_swp_uffd_wp(*src_pte))
+ pte = pte_swp_mkuffd_wp(pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
} else if (is_device_private_entry(entry)) {
@@ -762,6 +764,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
is_cow_mapping(vm_flags)) {
make_device_private_entry_read(&entry);
pte = swp_entry_to_pte(entry);
+ if (pte_swp_uffd_wp(*src_pte))
+ pte = pte_swp_mkuffd_wp(pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
}
@@ -785,6 +789,14 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte = pte_mkclean(pte);
pte = pte_mkold(pte);
+ /*
+ * Make sure the _PAGE_UFFD_WP bit is cleared if the new VMA
+ * does not have the VM_UFFD_WP, which means that the uffd
+ * fork event is not enabled.
+ */
+ if (!(vm_flags & VM_UFFD_WP))
+ pte = pte_clear_uffd_wp(pte);
+
page = vm_normal_page(vma, addr, pte);
if (page) {
get_page(page);
@@ -1407,8 +1419,7 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);
-pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
- spinlock_t **ptl)
+static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -1427,9 +1438,40 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
return NULL;
VM_BUG_ON(pmd_trans_huge(*pmd));
+ return pmd;
+}
+
+pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
+ spinlock_t **ptl)
+{
+ pmd_t *pmd = walk_to_pmd(mm, addr);
+
+ if (!pmd)
+ return NULL;
return pte_alloc_map_lock(mm, pmd, addr, ptl);
}
+static int validate_page_before_insert(struct page *page)
+{
+ if (PageAnon(page) || PageSlab(page) || page_has_type(page))
+ return -EINVAL;
+ flush_dcache_page(page);
+ return 0;
+}
+
+static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
+ unsigned long addr, struct page *page, pgprot_t prot)
+{
+ if (!pte_none(*pte))
+ return -EBUSY;
+ /* Ok, finally just insert the thing.. */
+ get_page(page);
+ inc_mm_counter_fast(mm, mm_counter_file(page));
+ page_add_file_rmap(page, false);
+ set_pte_at(mm, addr, pte, mk_pte(page, prot));
+ return 0;
+}
+
/*
* This is the old fallback for page remapping.
*
@@ -1445,31 +1487,135 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
pte_t *pte;
spinlock_t *ptl;
- retval = -EINVAL;
- if (PageAnon(page) || PageSlab(page) || page_has_type(page))
+ retval = validate_page_before_insert(page);
+ if (retval)
goto out;
retval = -ENOMEM;
- flush_dcache_page(page);
pte = get_locked_pte(mm, addr, &ptl);
if (!pte)
goto out;
- retval = -EBUSY;
- if (!pte_none(*pte))
- goto out_unlock;
-
- /* Ok, finally just insert the thing.. */
- get_page(page);
- inc_mm_counter_fast(mm, mm_counter_file(page));
- page_add_file_rmap(page, false);
- set_pte_at(mm, addr, pte, mk_pte(page, prot));
-
- retval = 0;
-out_unlock:
+ retval = insert_page_into_pte_locked(mm, pte, addr, page, prot);
pte_unmap_unlock(pte, ptl);
out:
return retval;
}
+#ifdef pte_index
+static int insert_page_in_batch_locked(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, struct page *page, pgprot_t prot)
+{
+ int err;
+
+ if (!page_count(page))
+ return -EINVAL;
+ err = validate_page_before_insert(page);
+ return err ? err : insert_page_into_pte_locked(
+ mm, pte_offset_map(pmd, addr), addr, page, prot);
+}
+
+/* insert_pages() amortizes the cost of spinlock operations
+ * when inserting pages in a loop. Arch *must* define pte_index.
+ */
+static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
+ struct page **pages, unsigned long *num, pgprot_t prot)
+{
+ pmd_t *pmd = NULL;
+ spinlock_t *pte_lock = NULL;
+ struct mm_struct *const mm = vma->vm_mm;
+ unsigned long curr_page_idx = 0;
+ unsigned long remaining_pages_total = *num;
+ unsigned long pages_to_write_in_pmd;
+ int ret;
+more:
+ ret = -EFAULT;
+ pmd = walk_to_pmd(mm, addr);
+ if (!pmd)
+ goto out;
+
+ pages_to_write_in_pmd = min_t(unsigned long,
+ remaining_pages_total, PTRS_PER_PTE - pte_index(addr));
+
+ /* Allocate the PTE if necessary; takes PMD lock once only. */
+ ret = -ENOMEM;
+ if (pte_alloc(mm, pmd))
+ goto out;
+ pte_lock = pte_lockptr(mm, pmd);
+
+ while (pages_to_write_in_pmd) {
+ int pte_idx = 0;
+ const int batch_size = min_t(int, pages_to_write_in_pmd, 8);
+
+ spin_lock(pte_lock);
+ for (; pte_idx < batch_size; ++pte_idx) {
+ int err = insert_page_in_batch_locked(mm, pmd,
+ addr, pages[curr_page_idx], prot);
+ if (unlikely(err)) {
+ spin_unlock(pte_lock);
+ ret = err;
+ remaining_pages_total -= pte_idx;
+ goto out;
+ }
+ addr += PAGE_SIZE;
+ ++curr_page_idx;
+ }
+ spin_unlock(pte_lock);
+ pages_to_write_in_pmd -= batch_size;
+ remaining_pages_total -= batch_size;
+ }
+ if (remaining_pages_total)
+ goto more;
+ ret = 0;
+out:
+ *num = remaining_pages_total;
+ return ret;
+}
+#endif /* ifdef pte_index */
+
+/**
+ * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
+ * @vma: user vma to map to
+ * @addr: target start user address of these pages
+ * @pages: source kernel pages
+ * @num: in: number of pages to map. out: number of pages that were *not*
+ * mapped. (0 means all pages were successfully mapped).
+ *
+ * Preferred over vm_insert_page() when inserting multiple pages.
+ *
+ * In case of error, we may have mapped a subset of the provided
+ * pages. It is the caller's responsibility to account for this case.
+ *
+ * The same restrictions apply as in vm_insert_page().
+ */
+int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
+ struct page **pages, unsigned long *num)
+{
+#ifdef pte_index
+ const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1;
+
+ if (addr < vma->vm_start || end_addr >= vma->vm_end)
+ return -EFAULT;
+ if (!(vma->vm_flags & VM_MIXEDMAP)) {
+ BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
+ BUG_ON(vma->vm_flags & VM_PFNMAP);
+ vma->vm_flags |= VM_MIXEDMAP;
+ }
+ /* Defer page refcount checking till we're about to map that page. */
+ return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
+#else
+ unsigned long idx = 0, pgcount = *num;
+ int err;
+
+ for (; idx < pgcount; ++idx) {
+ err = vm_insert_page(vma, addr + (PAGE_SIZE * idx), pages[idx]);
+ if (err)
+ break;
+ }
+ *num = pgcount - idx;
+ return err;
+#endif /* ifdef pte_index */
+}
+EXPORT_SYMBOL(vm_insert_pages);
+
/**
* vm_insert_page - insert single page into user vma
* @vma: user vma to map to
@@ -1940,7 +2086,7 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
* @vma: user vma to map to
* @addr: target user address to start at
* @pfn: page frame number of kernel physical memory address
- * @size: size of map area
+ * @size: size of mapping area
* @prot: page protection flags for this mapping
*
* Note: this is only safe if the mm semaphore is held when called.
@@ -2752,6 +2898,11 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
+ if (userfaultfd_pte_wp(vma, *vmf->pte)) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return handle_userfault(vmf, VM_UFFD_WP);
+ }
+
vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
if (!vmf->page) {
/*
@@ -3085,6 +3236,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
flush_icache_page(vma, page);
if (pte_swp_soft_dirty(vmf->orig_pte))
pte = pte_mksoft_dirty(pte);
+ if (pte_swp_uffd_wp(vmf->orig_pte)) {
+ pte = pte_mkuffd_wp(pte);
+ pte = pte_wrprotect(pte);
+ }
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
vmf->orig_pte = pte;
@@ -3373,7 +3528,7 @@ map_pte:
return 0;
}
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void deposit_prealloc_pte(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
@@ -3475,8 +3630,7 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
pte_t entry;
vm_fault_t ret;
- if (pmd_none(*vmf->pmd) && PageTransCompound(page) &&
- IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
+ if (pmd_none(*vmf->pmd) && PageTransCompound(page)) {
/* THP on COW? */
VM_BUG_ON_PAGE(memcg, page);
@@ -3949,8 +4103,11 @@ static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
/* `inline' is required to avoid gcc 4.1.2 build error */
static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
{
- if (vma_is_anonymous(vmf->vma))
+ if (vma_is_anonymous(vmf->vma)) {
+ if (userfaultfd_huge_pmd_wp(vmf->vma, orig_pmd))
+ return handle_userfault(vmf, VM_UFFD_WP);
return do_huge_pmd_wp_page(vmf, orig_pmd);
+ }
if (vmf->vma->vm_ops->huge_fault) {
vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
@@ -3964,11 +4121,6 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
return VM_FAULT_FALLBACK;
}
-static inline bool vma_is_accessible(struct vm_area_struct *vma)
-{
- return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE);
-}
-
static vm_fault_t create_huge_pud(struct vm_fault *vmf)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 19389cdc16a5..fc0aad0bc1f5 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -67,18 +67,17 @@ void put_online_mems(void)
bool movable_node_enabled = false;
#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
-bool memhp_auto_online;
+int memhp_default_online_type = MMOP_OFFLINE;
#else
-bool memhp_auto_online = true;
+int memhp_default_online_type = MMOP_ONLINE;
#endif
-EXPORT_SYMBOL_GPL(memhp_auto_online);
static int __init setup_memhp_default_state(char *str)
{
- if (!strcmp(str, "online"))
- memhp_auto_online = true;
- else if (!strcmp(str, "offline"))
- memhp_auto_online = false;
+ const int online_type = memhp_online_type_from_str(str);
+
+ if (online_type >= 0)
+ memhp_default_online_type = online_type;
return 1;
}
@@ -105,7 +104,13 @@ static struct resource *register_memory_resource(u64 start, u64 size)
unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
char *resource_name = "System RAM";
- if (start + size > max_mem_size)
+ /*
+ * Make sure value parsed from 'mem=' only restricts memory adding
+ * while booting, so that memory hotplug won't be impacted. Please
+ * refer to document of 'mem=' in kernel-parameters.txt for more
+ * details.
+ */
+ if (start + size > max_mem_size && system_state < SYSTEM_RUNNING)
return ERR_PTR(-E2BIG);
/*
@@ -299,11 +304,15 @@ static int check_hotplug_memory_addressable(unsigned long pfn,
* add the new pages.
*/
int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
- struct mhp_restrictions *restrictions)
+ struct mhp_params *params)
{
+ const unsigned long end_pfn = pfn + nr_pages;
+ unsigned long cur_nr_pages;
int err;
- unsigned long nr, start_sec, end_sec;
- struct vmem_altmap *altmap = restrictions->altmap;
+ struct vmem_altmap *altmap = params->altmap;
+
+ if (WARN_ON_ONCE(!params->pgprot.pgprot))
+ return -EINVAL;
err = check_hotplug_memory_addressable(pfn, nr_pages);
if (err)
@@ -325,18 +334,13 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
if (err)
return err;
- start_sec = pfn_to_section_nr(pfn);
- end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
- for (nr = start_sec; nr <= end_sec; nr++) {
- unsigned long pfns;
-
- pfns = min(nr_pages, PAGES_PER_SECTION
- - (pfn & ~PAGE_SECTION_MASK));
- err = sparse_add_section(nid, pfn, pfns, altmap);
+ for (; pfn < end_pfn; pfn += cur_nr_pages) {
+ /* Select all remaining pages up to the next section boundary */
+ cur_nr_pages = min(end_pfn - pfn,
+ SECTION_ALIGN_UP(pfn + 1) - pfn);
+ err = sparse_add_section(nid, pfn, cur_nr_pages, altmap);
if (err)
break;
- pfn += pfns;
- nr_pages -= pfns;
cond_resched();
}
vmemmap_populate_print_last();
@@ -494,7 +498,7 @@ static void __remove_section(unsigned long pfn, unsigned long nr_pages,
unsigned long map_offset,
struct vmem_altmap *altmap)
{
- struct mem_section *ms = __nr_to_section(pfn_to_section_nr(pfn));
+ struct mem_section *ms = __pfn_to_section(pfn);
if (WARN_ON_ONCE(!valid_section(ms)))
return;
@@ -528,7 +532,8 @@ void __remove_pages(unsigned long pfn, unsigned long nr_pages,
for (; pfn < end_pfn; pfn += cur_nr_pages) {
cond_resched();
/* Select all remaining pages up to the next section boundary */
- cur_nr_pages = min(end_pfn - pfn, -(pfn | PAGE_SECTION_MASK));
+ cur_nr_pages = min(end_pfn - pfn,
+ SECTION_ALIGN_UP(pfn + 1) - pfn);
__remove_section(pfn, cur_nr_pages, map_offset, altmap);
map_offset = 0;
}
@@ -988,6 +993,7 @@ static int check_hotplug_memory_range(u64 start, u64 size)
static int online_memory_block(struct memory_block *mem, void *arg)
{
+ mem->online_type = memhp_default_online_type;
return device_online(&mem->dev);
}
@@ -999,7 +1005,7 @@ static int online_memory_block(struct memory_block *mem, void *arg)
*/
int __ref add_memory_resource(int nid, struct resource *res)
{
- struct mhp_restrictions restrictions = {};
+ struct mhp_params params = { .pgprot = PAGE_KERNEL };
u64 start, size;
bool new_node = false;
int ret;
@@ -1027,7 +1033,7 @@ int __ref add_memory_resource(int nid, struct resource *res)
new_node = ret;
/* call arch's memory hotadd */
- ret = arch_add_memory(nid, start, size, &restrictions);
+ ret = arch_add_memory(nid, start, size, &params);
if (ret < 0)
goto error;
@@ -1060,7 +1066,7 @@ int __ref add_memory_resource(int nid, struct resource *res)
mem_hotplug_done();
/* online pages if requested */
- if (memhp_auto_online)
+ if (memhp_default_online_type != MMOP_OFFLINE)
walk_memory_blocks(start, size, NULL, online_memory_block);
return ret;
@@ -1317,7 +1323,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
list_add_tail(&page->lru, &source);
if (!__PageMovable(page))
inc_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
+ page_is_file_lru(page));
} else {
pr_warn("failed to isolate pfn %lx\n", pfn);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5fb427aed612..48ba9729062e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -127,6 +127,32 @@ static struct mempolicy default_policy = {
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
+/**
+ * numa_map_to_online_node - Find closest online node
+ * @nid: Node id to start the search
+ *
+ * Lookup the next closest node by distance if @nid is not online.
+ */
+int numa_map_to_online_node(int node)
+{
+ int min_dist = INT_MAX, dist, n, min_node;
+
+ if (node == NUMA_NO_NODE || node_online(node))
+ return node;
+
+ min_node = node;
+ for_each_online_node(n) {
+ dist = node_distance(node, n);
+ if (dist < min_dist) {
+ min_dist = dist;
+ min_node = n;
+ }
+ }
+
+ return min_node;
+}
+EXPORT_SYMBOL_GPL(numa_map_to_online_node);
+
struct mempolicy *get_task_policy(struct task_struct *p)
{
struct mempolicy *pol = p->mempolicy;
@@ -442,6 +468,7 @@ static inline bool queue_pages_required(struct page *page,
*/
static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
unsigned long end, struct mm_walk *walk)
+ __releases(ptl)
{
int ret = 0;
struct page *page;
@@ -627,7 +654,7 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
{
int nr_updated;
- nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
+ nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
if (nr_updated)
count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
@@ -678,8 +705,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
if (flags & MPOL_MF_LAZY) {
/* Similar to task_numa_work, skip inaccessible VMAs */
- if (!is_vm_hugetlb_page(vma) &&
- (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
+ if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
!(vma->vm_flags & VM_MIXEDMAP))
change_prot_numa(vma, start, endvma);
return 1;
@@ -881,7 +907,6 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
switch (p->mode) {
case MPOL_BIND:
- /* Fall through */
case MPOL_INTERLEAVE:
*nodes = p->v.nodes;
break;
@@ -897,12 +922,15 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
- struct page *p;
+ struct page *p = NULL;
int err;
int locked = 1;
err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
- if (err >= 0) {
+ if (err == 0) {
+ /* E.g. GUP interrupted by fatal signal */
+ err = -EFAULT;
+ } else if (err > 0) {
err = page_to_nid(p);
put_page(p);
}
@@ -1023,7 +1051,7 @@ static int migrate_page_add(struct page *page, struct list_head *pagelist,
if (!isolate_lru_page(head)) {
list_add_tail(&head->lru, pagelist);
mod_node_page_state(page_pgdat(head),
- NR_ISOLATED_ANON + page_is_file_cache(head),
+ NR_ISOLATED_ANON + page_is_file_lru(head),
hpage_nr_pages(head));
} else if (flags & MPOL_MF_STRICT) {
/*
@@ -2066,7 +2094,6 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
break;
case MPOL_BIND:
- /* Fall through */
case MPOL_INTERLEAVE:
*mask = mempolicy->v.nodes;
break;
@@ -2333,7 +2360,6 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
switch (a->mode) {
case MPOL_BIND:
- /* Fall through */
case MPOL_INTERLEAVE:
return !!nodes_equal(a->v.nodes, b->v.nodes);
case MPOL_PREFERRED:
diff --git a/mm/memremap.c b/mm/memremap.c
index 9b2c97ceb775..03e38b7a38f1 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -7,6 +7,7 @@
#include <linux/mm.h>
#include <linux/pfn_t.h>
#include <linux/swap.h>
+#include <linux/mmzone.h>
#include <linux/swapops.h>
#include <linux/types.h>
#include <linux/wait_bit.h>
@@ -14,6 +15,28 @@
static DEFINE_XARRAY(pgmap_array);
+/*
+ * The memremap() and memremap_pages() interfaces are alternately used
+ * to map persistent memory namespaces. These interfaces place different
+ * constraints on the alignment and size of the mapping (namespace).
+ * memremap() can map individual PAGE_SIZE pages. memremap_pages() can
+ * only map subsections (2MB), and at least one architecture (PowerPC)
+ * the minimum mapping granularity of memremap_pages() is 16MB.
+ *
+ * The role of memremap_compat_align() is to communicate the minimum
+ * arch supported alignment of a namespace such that it can freely
+ * switch modes without violating the arch constraint. Namely, do not
+ * allow a namespace to be PAGE_SIZE aligned since that namespace may be
+ * reconfigured into a mode that requires SUBSECTION_SIZE alignment.
+ */
+#ifndef CONFIG_ARCH_HAS_MEMREMAP_COMPAT_ALIGN
+unsigned long memremap_compat_align(void)
+{
+ return SUBSECTION_SIZE;
+}
+EXPORT_SYMBOL_GPL(memremap_compat_align);
+#endif
+
#ifdef CONFIG_DEV_PAGEMAP_OPS
DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
EXPORT_SYMBOL(devmap_managed_key);
@@ -161,13 +184,13 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
{
struct resource *res = &pgmap->res;
struct dev_pagemap *conflict_pgmap;
- struct mhp_restrictions restrictions = {
+ struct mhp_params params = {
/*
* We do not want any optional features only our own memmap
*/
.altmap = pgmap_altmap(pgmap),
+ .pgprot = PAGE_KERNEL,
};
- pgprot_t pgprot = PAGE_KERNEL;
int error, is_ram;
bool need_devmap_managed = true;
@@ -194,7 +217,10 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
}
break;
case MEMORY_DEVICE_DEVDAX:
+ need_devmap_managed = false;
+ break;
case MEMORY_DEVICE_PCI_P2PDMA:
+ params.pgprot = pgprot_noncached(params.pgprot);
need_devmap_managed = false;
break;
default:
@@ -259,8 +285,8 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
if (nid < 0)
nid = numa_mem_id();
- error = track_pfn_remap(NULL, &pgprot, PHYS_PFN(res->start), 0,
- resource_size(res));
+ error = track_pfn_remap(NULL, &params.pgprot, PHYS_PFN(res->start),
+ 0, resource_size(res));
if (error)
goto err_pfn_remap;
@@ -279,7 +305,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
*/
if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
error = add_pages(nid, PHYS_PFN(res->start),
- PHYS_PFN(resource_size(res)), &restrictions);
+ PHYS_PFN(resource_size(res)), &params);
} else {
error = kasan_add_zero_shadow(__va(res->start), resource_size(res));
if (error) {
@@ -288,7 +314,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
}
error = arch_add_memory(nid, res->start, resource_size(res),
- &restrictions);
+ &params);
}
if (!error) {
@@ -296,7 +322,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
move_pfn_range_to_zone(zone, PHYS_PFN(res->start),
- PHYS_PFN(resource_size(res)), restrictions.altmap);
+ PHYS_PFN(resource_size(res)), params.altmap);
}
mem_hotplug_done();
diff --git a/mm/migrate.c b/mm/migrate.c
index 7ded07081be9..7160c1556f79 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -193,7 +193,7 @@ void putback_movable_pages(struct list_head *l)
put_page(page);
} else {
mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
- page_is_file_cache(page), -hpage_nr_pages(page));
+ page_is_file_lru(page), -hpage_nr_pages(page));
putback_lru_page(page);
}
}
@@ -243,11 +243,15 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
entry = pte_to_swp_entry(*pvmw.pte);
if (is_write_migration_entry(entry))
pte = maybe_mkwrite(pte, vma);
+ else if (pte_swp_uffd_wp(*pvmw.pte))
+ pte = pte_mkuffd_wp(pte);
if (unlikely(is_zone_device_page(new))) {
if (is_device_private_page(new)) {
entry = make_device_private_entry(new, pte_write(pte));
pte = swp_entry_to_pte(entry);
+ if (pte_swp_uffd_wp(*pvmw.pte))
+ pte = pte_mkuffd_wp(pte);
}
}
@@ -647,6 +651,14 @@ void migrate_page_states(struct page *newpage, struct page *page)
if (PageWriteback(newpage))
end_page_writeback(newpage);
+ /*
+ * PG_readahead shares the same bit with PG_reclaim. The above
+ * end_page_writeback() may clear PG_readahead mistakenly, so set the
+ * bit after that.
+ */
+ if (PageReadahead(page))
+ SetPageReadahead(newpage);
+
copy_page_owner(page, newpage);
mem_cgroup_migrate(page, newpage);
@@ -1211,7 +1223,7 @@ out:
*/
if (likely(!__PageMovable(page)))
mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
- page_is_file_cache(page), -hpage_nr_pages(page));
+ page_is_file_lru(page), -hpage_nr_pages(page));
}
/*
@@ -1518,9 +1530,6 @@ static int do_move_pages_to_node(struct mm_struct *mm,
{
int err;
- if (list_empty(pagelist))
- return 0;
-
err = migrate_pages(pagelist, alloc_new_node_page, NULL, node,
MIGRATE_SYNC, MR_SYSCALL);
if (err)
@@ -1587,7 +1596,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
err = 1;
list_add_tail(&head->lru, pagelist);
mod_node_page_state(page_pgdat(head),
- NR_ISOLATED_ANON + page_is_file_cache(head),
+ NR_ISOLATED_ANON + page_is_file_lru(head),
hpage_nr_pages(head));
}
out_putpage:
@@ -1602,6 +1611,32 @@ out:
return err;
}
+static int move_pages_and_store_status(struct mm_struct *mm, int node,
+ struct list_head *pagelist, int __user *status,
+ int start, int i, unsigned long nr_pages)
+{
+ int err;
+
+ if (list_empty(pagelist))
+ return 0;
+
+ err = do_move_pages_to_node(mm, pagelist, node);
+ if (err) {
+ /*
+ * Positive err means the number of failed
+ * pages to migrate. Since we are going to
+ * abort and return the number of non-migrated
+ * pages, so need to incude the rest of the
+ * nr_pages that have not been attempted as
+ * well.
+ */
+ if (err > 0)
+ err += nr_pages - i - 1;
+ return err;
+ }
+ return store_status(status, start, node, i - start);
+}
+
/*
* Migrate an array of page address onto an array of nodes and fill
* the corresponding array of status.
@@ -1645,21 +1680,8 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
current_node = node;
start = i;
} else if (node != current_node) {
- err = do_move_pages_to_node(mm, &pagelist, current_node);
- if (err) {
- /*
- * Positive err means the number of failed
- * pages to migrate. Since we are going to
- * abort and return the number of non-migrated
- * pages, so need to incude the rest of the
- * nr_pages that have not been attempted as
- * well.
- */
- if (err > 0)
- err += nr_pages - i - 1;
- goto out;
- }
- err = store_status(status, start, current_node, i - start);
+ err = move_pages_and_store_status(mm, current_node,
+ &pagelist, status, start, i, nr_pages);
if (err)
goto out;
start = i;
@@ -1673,49 +1695,29 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
err = add_page_for_migration(mm, addr, current_node,
&pagelist, flags & MPOL_MF_MOVE_ALL);
- if (!err) {
- /* The page is already on the target node */
- err = store_status(status, i, current_node, 1);
- if (err)
- goto out_flush;
- continue;
- } else if (err > 0) {
+ if (err > 0) {
/* The page is successfully queued for migration */
continue;
}
- err = store_status(status, i, err, 1);
+ /*
+ * If the page is already on the target node (!err), store the
+ * node, otherwise, store the err.
+ */
+ err = store_status(status, i, err ? : current_node, 1);
if (err)
goto out_flush;
- err = do_move_pages_to_node(mm, &pagelist, current_node);
- if (err) {
- if (err > 0)
- err += nr_pages - i - 1;
+ err = move_pages_and_store_status(mm, current_node, &pagelist,
+ status, start, i, nr_pages);
+ if (err)
goto out;
- }
- if (i > start) {
- err = store_status(status, start, current_node, i - start);
- if (err)
- goto out;
- }
current_node = NUMA_NO_NODE;
}
out_flush:
- if (list_empty(&pagelist))
- return err;
-
/* Make sure we do not overwrite the existing error */
- err1 = do_move_pages_to_node(mm, &pagelist, current_node);
- /*
- * Don't have to report non-attempted pages here since:
- * - If the above loop is done gracefully all pages have been
- * attempted.
- * - If the above loop is aborted it means a fatal error
- * happened, should return ret.
- */
- if (!err1)
- err1 = store_status(status, start, current_node, i - start);
+ err1 = move_pages_and_store_status(mm, current_node, &pagelist,
+ status, start, i, nr_pages);
if (err >= 0)
err = err1;
out:
@@ -1957,7 +1959,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
return 0;
}
- page_lru = page_is_file_cache(page);
+ page_lru = page_is_file_lru(page);
mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
hpage_nr_pages(page));
@@ -1993,7 +1995,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
* Don't migrate file pages that are mapped in multiple processes
* with execute permissions as they are probably shared libraries.
*/
- if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
+ if (page_mapcount(page) != 1 && page_is_file_lru(page) &&
(vma->vm_flags & VM_EXEC))
goto out;
@@ -2001,7 +2003,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
* Also do not migrate dirty pages as not all filesystems can move
* dirty pages in MIGRATE_ASYNC mode which is a waste of cycles.
*/
- if (page_is_file_cache(page) && PageDirty(page))
+ if (page_is_file_lru(page) && PageDirty(page))
goto out;
isolated = numamigrate_isolate_page(pgdat, page);
@@ -2016,7 +2018,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
if (!list_empty(&migratepages)) {
list_del(&page->lru);
dec_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
+ page_is_file_lru(page));
putback_lru_page(page);
}
isolated = 0;
@@ -2046,7 +2048,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
pg_data_t *pgdat = NODE_DATA(node);
int isolated = 0;
struct page *new_page = NULL;
- int page_lru = page_is_file_cache(page);
+ int page_lru = page_is_file_lru(page);
unsigned long start = address & HPAGE_PMD_MASK;
new_page = alloc_pages_node(node,
@@ -2340,6 +2342,8 @@ again:
swp_pte = swp_entry_to_pte(entry);
if (pte_soft_dirty(pte))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pte))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
set_pte_at(mm, addr, ptep, swp_pte);
/*
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 5c918388de99..7da6991d9435 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -37,7 +37,7 @@ void __init mminit_verify_zonelist(void)
struct zonelist *zonelist;
int i, listid, zoneid;
- BUG_ON(MAX_ZONELISTS > 2);
+ BUILD_BUG_ON(MAX_ZONELISTS > 2);
for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) {
/* Identify the zone and nodelist */
diff --git a/mm/mmap.c b/mm/mmap.c
index 94ae18398c59..f609e9ec4a25 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1224,7 +1224,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *
return a->vm_end == b->vm_start &&
mpol_equal(vma_policy(a), vma_policy(b)) &&
a->vm_file == b->vm_file &&
- !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC|VM_SOFTDIRTY)) &&
+ !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) &&
b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
}
@@ -1460,7 +1460,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
* with MAP_SHARED to preserve backward compatibility.
*/
flags &= LEGACY_MAP_MASK;
- /* fall through */
+ fallthrough;
case MAP_SHARED_VALIDATE:
if (flags & ~flags_mask)
return -EOPNOTSUPP;
@@ -1487,8 +1487,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
vm_flags |= VM_SHARED | VM_MAYSHARE;
if (!(file->f_mode & FMODE_WRITE))
vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
-
- /* fall through */
+ fallthrough;
case MAP_PRIVATE:
if (!(file->f_mode & FMODE_READ))
return -EACCES;
@@ -2124,6 +2123,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
info.low_limit = mm->mmap_base;
info.high_limit = mmap_end;
info.align_mask = 0;
+ info.align_offset = 0;
return vm_unmapped_area(&info);
}
#endif
@@ -2165,6 +2165,7 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
info.low_limit = max(PAGE_SIZE, mmap_min_addr);
info.high_limit = arch_get_mmap_base(addr, mm->mmap_base);
info.align_mask = 0;
+ info.align_offset = 0;
addr = vm_unmapped_area(&info);
/*
@@ -2358,8 +2359,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
gap_addr = TASK_SIZE;
next = vma->vm_next;
- if (next && next->vm_start < gap_addr &&
- (next->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) {
+ if (next && next->vm_start < gap_addr && vma_is_accessible(next)) {
if (!(next->vm_flags & VM_GROWSUP))
return -ENOMEM;
/* Check that both stack segments have the same anon_vma? */
@@ -2440,7 +2440,7 @@ int expand_downwards(struct vm_area_struct *vma,
prev = vma->vm_prev;
/* Check that both stack segments have the same anon_vma? */
if (prev && !(prev->vm_flags & VM_GROWSDOWN) &&
- (prev->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) {
+ vma_is_accessible(prev)) {
if (address - prev->vm_end < stack_guard_gap)
return -ENOMEM;
}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 311c0dadf71c..494192ca954b 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -37,12 +37,16 @@
static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end, pgprot_t newprot,
- int dirty_accountable, int prot_numa)
+ unsigned long cp_flags)
{
pte_t *pte, oldpte;
spinlock_t *ptl;
unsigned long pages = 0;
int target_node = NUMA_NO_NODE;
+ bool dirty_accountable = cp_flags & MM_CP_DIRTY_ACCT;
+ bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+ bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
+ bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
/*
* Can be called with only the mmap_sem for reading by
@@ -98,7 +102,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
* it cannot move them all from MIGRATE_ASYNC
* context.
*/
- if (page_is_file_cache(page) && PageDirty(page))
+ if (page_is_file_lru(page) && PageDirty(page))
continue;
/*
@@ -114,6 +118,19 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
if (preserve_write)
ptent = pte_mk_savedwrite(ptent);
+ if (uffd_wp) {
+ ptent = pte_wrprotect(ptent);
+ ptent = pte_mkuffd_wp(ptent);
+ } else if (uffd_wp_resolve) {
+ /*
+ * Leave the write bit to be handled
+ * by PF interrupt handler, then
+ * things like COW could be properly
+ * handled.
+ */
+ ptent = pte_clear_uffd_wp(ptent);
+ }
+
/* Avoid taking write faults for known dirty pages */
if (dirty_accountable && pte_dirty(ptent) &&
(pte_soft_dirty(ptent) ||
@@ -122,11 +139,11 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
}
ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
pages++;
- } else if (IS_ENABLED(CONFIG_MIGRATION)) {
+ } else if (is_swap_pte(oldpte)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
+ pte_t newpte;
if (is_write_migration_entry(entry)) {
- pte_t newpte;
/*
* A protection check is difficult so
* just be safe and disable write
@@ -135,22 +152,28 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
newpte = swp_entry_to_pte(entry);
if (pte_swp_soft_dirty(oldpte))
newpte = pte_swp_mksoft_dirty(newpte);
- set_pte_at(vma->vm_mm, addr, pte, newpte);
-
- pages++;
- }
-
- if (is_write_device_private_entry(entry)) {
- pte_t newpte;
-
+ if (pte_swp_uffd_wp(oldpte))
+ newpte = pte_swp_mkuffd_wp(newpte);
+ } else if (is_write_device_private_entry(entry)) {
/*
* We do not preserve soft-dirtiness. See
* copy_one_pte() for explanation.
*/
make_device_private_entry_read(&entry);
newpte = swp_entry_to_pte(entry);
- set_pte_at(vma->vm_mm, addr, pte, newpte);
+ if (pte_swp_uffd_wp(oldpte))
+ newpte = pte_swp_mkuffd_wp(newpte);
+ } else {
+ newpte = oldpte;
+ }
+
+ if (uffd_wp)
+ newpte = pte_swp_mkuffd_wp(newpte);
+ else if (uffd_wp_resolve)
+ newpte = pte_swp_clear_uffd_wp(newpte);
+ if (!pte_same(oldpte, newpte)) {
+ set_pte_at(vma->vm_mm, addr, pte, newpte);
pages++;
}
}
@@ -188,7 +211,7 @@ static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd)
static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
pud_t *pud, unsigned long addr, unsigned long end,
- pgprot_t newprot, int dirty_accountable, int prot_numa)
+ pgprot_t newprot, unsigned long cp_flags)
{
pmd_t *pmd;
unsigned long next;
@@ -229,7 +252,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
__split_huge_pmd(vma, pmd, addr, false, NULL);
} else {
int nr_ptes = change_huge_pmd(vma, pmd, addr,
- newprot, prot_numa);
+ newprot, cp_flags);
if (nr_ptes) {
if (nr_ptes == HPAGE_PMD_NR) {
@@ -244,7 +267,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
/* fall through, the trans huge pmd just split */
}
this_pages = change_pte_range(vma, pmd, addr, next, newprot,
- dirty_accountable, prot_numa);
+ cp_flags);
pages += this_pages;
next:
cond_resched();
@@ -260,7 +283,7 @@ next:
static inline unsigned long change_pud_range(struct vm_area_struct *vma,
p4d_t *p4d, unsigned long addr, unsigned long end,
- pgprot_t newprot, int dirty_accountable, int prot_numa)
+ pgprot_t newprot, unsigned long cp_flags)
{
pud_t *pud;
unsigned long next;
@@ -272,7 +295,7 @@ static inline unsigned long change_pud_range(struct vm_area_struct *vma,
if (pud_none_or_clear_bad(pud))
continue;
pages += change_pmd_range(vma, pud, addr, next, newprot,
- dirty_accountable, prot_numa);
+ cp_flags);
} while (pud++, addr = next, addr != end);
return pages;
@@ -280,7 +303,7 @@ static inline unsigned long change_pud_range(struct vm_area_struct *vma,
static inline unsigned long change_p4d_range(struct vm_area_struct *vma,
pgd_t *pgd, unsigned long addr, unsigned long end,
- pgprot_t newprot, int dirty_accountable, int prot_numa)
+ pgprot_t newprot, unsigned long cp_flags)
{
p4d_t *p4d;
unsigned long next;
@@ -292,7 +315,7 @@ static inline unsigned long change_p4d_range(struct vm_area_struct *vma,
if (p4d_none_or_clear_bad(p4d))
continue;
pages += change_pud_range(vma, p4d, addr, next, newprot,
- dirty_accountable, prot_numa);
+ cp_flags);
} while (p4d++, addr = next, addr != end);
return pages;
@@ -300,7 +323,7 @@ static inline unsigned long change_p4d_range(struct vm_area_struct *vma,
static unsigned long change_protection_range(struct vm_area_struct *vma,
unsigned long addr, unsigned long end, pgprot_t newprot,
- int dirty_accountable, int prot_numa)
+ unsigned long cp_flags)
{
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
@@ -317,7 +340,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
if (pgd_none_or_clear_bad(pgd))
continue;
pages += change_p4d_range(vma, pgd, addr, next, newprot,
- dirty_accountable, prot_numa);
+ cp_flags);
} while (pgd++, addr = next, addr != end);
/* Only flush the TLB if we actually modified any entries: */
@@ -330,14 +353,17 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgprot_t newprot,
- int dirty_accountable, int prot_numa)
+ unsigned long cp_flags)
{
unsigned long pages;
+ BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL);
+
if (is_vm_hugetlb_page(vma))
pages = hugetlb_change_protection(vma, start, end, newprot);
else
- pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa);
+ pages = change_protection_range(vma, start, end, newprot,
+ cp_flags);
return pages;
}
@@ -393,7 +419,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
*/
if (arch_has_pfn_modify_check() &&
(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
- (newflags & (VM_READ|VM_WRITE|VM_EXEC)) == 0) {
+ (newflags & VM_ACCESS_FLAGS) == 0) {
pgprot_t new_pgprot = vm_get_page_prot(newflags);
error = walk_page_range(current->mm, start, end,
@@ -459,7 +485,7 @@ success:
vma_set_page_prot(vma);
change_protection(vma, start, end, vma->vm_page_prot,
- dirty_accountable, 0);
+ dirty_accountable ? MM_CP_DIRTY_ACCT : 0);
/*
* Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
@@ -572,7 +598,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
newflags |= (vma->vm_flags & ~mask_off_old_flags);
/* newflags >> 4 shift VM_MAY% in place of VM_% */
- if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
+ if ((newflags & ~(newflags >> 4)) & VM_ACCESS_FLAGS) {
error = -EACCES;
goto out;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e5f76da8cd4e..69827d4fa052 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -74,6 +74,7 @@
#include <asm/div64.h>
#include "internal.h"
#include "shuffle.h"
+#include "page_reporting.h"
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
@@ -102,8 +103,8 @@ struct pcpu_drain {
struct zone *zone;
struct work_struct work;
};
-DEFINE_MUTEX(pcpu_drain_mutex);
-DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);
+static DEFINE_MUTEX(pcpu_drain_mutex);
+static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);
#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
volatile unsigned long latent_entropy __latent_entropy;
@@ -864,6 +865,78 @@ compaction_capture(struct capture_control *capc, struct page *page,
}
#endif /* CONFIG_COMPACTION */
+/* Used for pages not on another list */
+static inline void add_to_free_list(struct page *page, struct zone *zone,
+ unsigned int order, int migratetype)
+{
+ struct free_area *area = &zone->free_area[order];
+
+ list_add(&page->lru, &area->free_list[migratetype]);
+ area->nr_free++;
+}
+
+/* Used for pages not on another list */
+static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
+ unsigned int order, int migratetype)
+{
+ struct free_area *area = &zone->free_area[order];
+
+ list_add_tail(&page->lru, &area->free_list[migratetype]);
+ area->nr_free++;
+}
+
+/* Used for pages which are on another list */
+static inline void move_to_free_list(struct page *page, struct zone *zone,
+ unsigned int order, int migratetype)
+{
+ struct free_area *area = &zone->free_area[order];
+
+ list_move(&page->lru, &area->free_list[migratetype]);
+}
+
+static inline void del_page_from_free_list(struct page *page, struct zone *zone,
+ unsigned int order)
+{
+ /* clear reported state and update reported page count */
+ if (page_reported(page))
+ __ClearPageReported(page);
+
+ list_del(&page->lru);
+ __ClearPageBuddy(page);
+ set_page_private(page, 0);
+ zone->free_area[order].nr_free--;
+}
+
+/*
+ * If this is not the largest possible page, check if the buddy
+ * of the next-highest order is free. If it is, it's possible
+ * that pages are being freed that will coalesce soon. In case,
+ * that is happening, add the free page to the tail of the list
+ * so it's less likely to be used soon and more likely to be merged
+ * as a higher order page
+ */
+static inline bool
+buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
+ struct page *page, unsigned int order)
+{
+ struct page *higher_page, *higher_buddy;
+ unsigned long combined_pfn;
+
+ if (order >= MAX_ORDER - 2)
+ return false;
+
+ if (!pfn_valid_within(buddy_pfn))
+ return false;
+
+ combined_pfn = buddy_pfn & pfn;
+ higher_page = page + (combined_pfn - pfn);
+ buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
+ higher_buddy = higher_page + (buddy_pfn - combined_pfn);
+
+ return pfn_valid_within(buddy_pfn) &&
+ page_is_buddy(higher_page, higher_buddy, order + 1);
+}
+
/*
* Freeing function for a buddy system allocator.
*
@@ -891,13 +964,14 @@ compaction_capture(struct capture_control *capc, struct page *page,
static inline void __free_one_page(struct page *page,
unsigned long pfn,
struct zone *zone, unsigned int order,
- int migratetype)
+ int migratetype, bool report)
{
- unsigned long combined_pfn;
+ struct capture_control *capc = task_capc(zone);
unsigned long uninitialized_var(buddy_pfn);
- struct page *buddy;
+ unsigned long combined_pfn;
unsigned int max_order;
- struct capture_control *capc = task_capc(zone);
+ struct page *buddy;
+ bool to_tail;
max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
@@ -932,7 +1006,7 @@ continue_merging:
if (page_is_guard(buddy))
clear_page_guard(zone, buddy, order, migratetype);
else
- del_page_from_free_area(buddy, &zone->free_area[order]);
+ del_page_from_free_list(buddy, zone, order);
combined_pfn = buddy_pfn & pfn;
page = page + (combined_pfn - pfn);
pfn = combined_pfn;
@@ -966,35 +1040,19 @@ continue_merging:
done_merging:
set_page_order(page, order);
- /*
- * If this is not the largest possible page, check if the buddy
- * of the next-highest order is free. If it is, it's possible
- * that pages are being freed that will coalesce soon. In case,
- * that is happening, add the free page to the tail of the list
- * so it's less likely to be used soon and more likely to be merged
- * as a higher order page
- */
- if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)
- && !is_shuffle_order(order)) {
- struct page *higher_page, *higher_buddy;
- combined_pfn = buddy_pfn & pfn;
- higher_page = page + (combined_pfn - pfn);
- buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
- higher_buddy = higher_page + (buddy_pfn - combined_pfn);
- if (pfn_valid_within(buddy_pfn) &&
- page_is_buddy(higher_page, higher_buddy, order + 1)) {
- add_to_free_area_tail(page, &zone->free_area[order],
- migratetype);
- return;
- }
- }
-
if (is_shuffle_order(order))
- add_to_free_area_random(page, &zone->free_area[order],
- migratetype);
+ to_tail = shuffle_pick_tail();
+ else
+ to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
+
+ if (to_tail)
+ add_to_free_list_tail(page, zone, order, migratetype);
else
- add_to_free_area(page, &zone->free_area[order], migratetype);
+ add_to_free_list(page, zone, order, migratetype);
+ /* Notify page reporting subsystem of freed page */
+ if (report)
+ page_reporting_notify_free(order);
}
/*
@@ -1311,7 +1369,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
if (unlikely(isolated_pageblocks))
mt = get_pageblock_migratetype(page);
- __free_one_page(page, page_to_pfn(page), zone, 0, mt);
+ __free_one_page(page, page_to_pfn(page), zone, 0, mt, true);
trace_mm_page_pcpu_drain(page, 0, mt);
}
spin_unlock(&zone->lock);
@@ -1327,7 +1385,7 @@ static void free_one_page(struct zone *zone,
is_migrate_isolate(migratetype))) {
migratetype = get_pfnblock_migratetype(page, pfn);
}
- __free_one_page(page, pfn, zone, order, migratetype);
+ __free_one_page(page, pfn, zone, order, migratetype, true);
spin_unlock(&zone->lock);
}
@@ -2008,13 +2066,11 @@ void __init init_cma_reserved_pageblock(struct page *page)
* -- nyc
*/
static inline void expand(struct zone *zone, struct page *page,
- int low, int high, struct free_area *area,
- int migratetype)
+ int low, int high, int migratetype)
{
unsigned long size = 1 << high;
while (high > low) {
- area--;
high--;
size >>= 1;
VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
@@ -2028,7 +2084,7 @@ static inline void expand(struct zone *zone, struct page *page,
if (set_page_guard(zone, &page[size], high, migratetype))
continue;
- add_to_free_area(&page[size], area, migratetype);
+ add_to_free_list(&page[size], zone, high, migratetype);
set_page_order(&page[size], high);
}
}
@@ -2186,8 +2242,8 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
page = get_page_from_free_area(area, migratetype);
if (!page)
continue;
- del_page_from_free_area(page, area);
- expand(zone, page, order, current_order, area, migratetype);
+ del_page_from_free_list(page, zone, current_order);
+ expand(zone, page, order, current_order, migratetype);
set_pcppage_migratetype(page, migratetype);
return page;
}
@@ -2261,7 +2317,7 @@ static int move_freepages(struct zone *zone,
VM_BUG_ON_PAGE(page_zone(page) != zone, page);
order = page_order(page);
- move_to_free_area(page, &zone->free_area[order], migratetype);
+ move_to_free_list(page, zone, order, migratetype);
page += 1 << order;
pages_moved += 1 << order;
}
@@ -2377,7 +2433,6 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
unsigned int alloc_flags, int start_type, bool whole_block)
{
unsigned int current_order = page_order(page);
- struct free_area *area;
int free_pages, movable_pages, alike_pages;
int old_block_type;
@@ -2448,8 +2503,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
return;
single_page:
- area = &zone->free_area[current_order];
- move_to_free_area(page, area, start_type);
+ move_to_free_list(page, zone, current_order, start_type);
}
/*
@@ -3120,7 +3174,6 @@ EXPORT_SYMBOL_GPL(split_page);
int __isolate_free_page(struct page *page, unsigned int order)
{
- struct free_area *area = &page_zone(page)->free_area[order];
unsigned long watermark;
struct zone *zone;
int mt;
@@ -3146,7 +3199,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
/* Remove page from free list */
- del_page_from_free_area(page, area);
+ del_page_from_free_list(page, zone, order);
/*
* Set the pageblock if the isolated page is at least half of a
@@ -3167,6 +3220,26 @@ int __isolate_free_page(struct page *page, unsigned int order)
return 1UL << order;
}
+/**
+ * __putback_isolated_page - Return a now-isolated page back where we got it
+ * @page: Page that was isolated
+ * @order: Order of the isolated page
+ * @mt: The page's pageblock's migratetype
+ *
+ * This function is meant to return a page pulled from the free lists via
+ * __isolate_free_page back to the free lists they were pulled from.
+ */
+void __putback_isolated_page(struct page *page, unsigned int order, int mt)
+{
+ struct zone *zone = page_zone(page);
+
+ /* zone lock should be held when this function is called */
+ lockdep_assert_held(&zone->lock);
+
+ /* Return isolated page to tail of freelist. */
+ __free_one_page(page, page_to_pfn(page), zone, order, mt, false);
+}
+
/*
* Update NUMA hit/miss statistics
*
@@ -8713,7 +8786,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
BUG_ON(!PageBuddy(page));
order = page_order(page);
offlined_pages += 1 << order;
- del_page_from_free_area(page, &zone->free_area[order]);
+ del_page_from_free_list(page, zone, order);
pfn += (1 << order);
}
spin_unlock_irqrestore(&zone->lock, flags);
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 08ded037f89f..a3616f7a0e9e 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -303,11 +303,8 @@ static int __meminit online_page_ext(unsigned long start_pfn,
VM_BUG_ON(!node_state(nid, N_ONLINE));
}
- for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
- if (!pfn_in_present_section(pfn))
- continue;
+ for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION)
fail = init_section_page_ext(pfn, nid);
- }
if (!fail)
return 0;
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index a9fd7c740c23..2c11a38d6e87 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -117,13 +117,11 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
__mod_zone_freepage_state(zone, nr_pages, migratetype);
}
set_pageblock_migratetype(page, migratetype);
+ if (isolated_page)
+ __putback_isolated_page(page, order, migratetype);
zone->nr_isolate_pageblock--;
out:
spin_unlock_irqrestore(&zone->lock, flags);
- if (isolated_page) {
- post_alloc_hook(page, order, __GFP_MOVABLE);
- __free_pages(page, order);
- }
}
static inline struct page *
diff --git a/mm/page_reporting.c b/mm/page_reporting.c
new file mode 100644
index 000000000000..3bbd471cfc81
--- /dev/null
+++ b/mm/page_reporting.c
@@ -0,0 +1,364 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/page_reporting.h>
+#include <linux/gfp.h>
+#include <linux/export.h>
+#include <linux/delay.h>
+#include <linux/scatterlist.h>
+
+#include "page_reporting.h"
+#include "internal.h"
+
+#define PAGE_REPORTING_DELAY (2 * HZ)
+static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;
+
+enum {
+ PAGE_REPORTING_IDLE = 0,
+ PAGE_REPORTING_REQUESTED,
+ PAGE_REPORTING_ACTIVE
+};
+
+/* request page reporting */
+static void
+__page_reporting_request(struct page_reporting_dev_info *prdev)
+{
+ unsigned int state;
+
+ /* Check to see if we are in desired state */
+ state = atomic_read(&prdev->state);
+ if (state == PAGE_REPORTING_REQUESTED)
+ return;
+
+ /*
+ * If reporting is already active there is nothing we need to do.
+ * Test against 0 as that represents PAGE_REPORTING_IDLE.
+ */
+ state = atomic_xchg(&prdev->state, PAGE_REPORTING_REQUESTED);
+ if (state != PAGE_REPORTING_IDLE)
+ return;
+
+ /*
+ * Delay the start of work to allow a sizable queue to build. For
+ * now we are limiting this to running no more than once every
+ * couple of seconds.
+ */
+ schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
+}
+
+/* notify prdev of free page reporting request */
+void __page_reporting_notify(void)
+{
+ struct page_reporting_dev_info *prdev;
+
+ /*
+ * We use RCU to protect the pr_dev_info pointer. In almost all
+ * cases this should be present, however in the unlikely case of
+ * a shutdown this will be NULL and we should exit.
+ */
+ rcu_read_lock();
+ prdev = rcu_dereference(pr_dev_info);
+ if (likely(prdev))
+ __page_reporting_request(prdev);
+
+ rcu_read_unlock();
+}
+
+static void
+page_reporting_drain(struct page_reporting_dev_info *prdev,
+ struct scatterlist *sgl, unsigned int nents, bool reported)
+{
+ struct scatterlist *sg = sgl;
+
+ /*
+ * Drain the now reported pages back into their respective
+ * free lists/areas. We assume at least one page is populated.
+ */
+ do {
+ struct page *page = sg_page(sg);
+ int mt = get_pageblock_migratetype(page);
+ unsigned int order = get_order(sg->length);
+
+ __putback_isolated_page(page, order, mt);
+
+ /* If the pages were not reported due to error skip flagging */
+ if (!reported)
+ continue;
+
+ /*
+ * If page was not comingled with another page we can
+ * consider the result to be "reported" since the page
+ * hasn't been modified, otherwise we will need to
+ * report on the new larger page when we make our way
+ * up to that higher order.
+ */
+ if (PageBuddy(page) && page_order(page) == order)
+ __SetPageReported(page);
+ } while ((sg = sg_next(sg)));
+
+ /* reinitialize scatterlist now that it is empty */
+ sg_init_table(sgl, nents);
+}
+
+/*
+ * The page reporting cycle consists of 4 stages, fill, report, drain, and
+ * idle. We will cycle through the first 3 stages until we cannot obtain a
+ * full scatterlist of pages, in that case we will switch to idle.
+ */
+static int
+page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone,
+ unsigned int order, unsigned int mt,
+ struct scatterlist *sgl, unsigned int *offset)
+{
+ struct free_area *area = &zone->free_area[order];
+ struct list_head *list = &area->free_list[mt];
+ unsigned int page_len = PAGE_SIZE << order;
+ struct page *page, *next;
+ long budget;
+ int err = 0;
+
+ /*
+ * Perform early check, if free area is empty there is
+ * nothing to process so we can skip this free_list.
+ */
+ if (list_empty(list))
+ return err;
+
+ spin_lock_irq(&zone->lock);
+
+ /*
+ * Limit how many calls we will be making to the page reporting
+ * device for this list. By doing this we avoid processing any
+ * given list for too long.
+ *
+ * The current value used allows us enough calls to process over a
+ * sixteenth of the current list plus one additional call to handle
+ * any pages that may have already been present from the previous
+ * list processed. This should result in us reporting all pages on
+ * an idle system in about 30 seconds.
+ *
+ * The division here should be cheap since PAGE_REPORTING_CAPACITY
+ * should always be a power of 2.
+ */
+ budget = DIV_ROUND_UP(area->nr_free, PAGE_REPORTING_CAPACITY * 16);
+
+ /* loop through free list adding unreported pages to sg list */
+ list_for_each_entry_safe(page, next, list, lru) {
+ /* We are going to skip over the reported pages. */
+ if (PageReported(page))
+ continue;
+
+ /*
+ * If we fully consumed our budget then update our
+ * state to indicate that we are requesting additional
+ * processing and exit this list.
+ */
+ if (budget < 0) {
+ atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED);
+ next = page;
+ break;
+ }
+
+ /* Attempt to pull page from list and place in scatterlist */
+ if (*offset) {
+ if (!__isolate_free_page(page, order)) {
+ next = page;
+ break;
+ }
+
+ /* Add page to scatter list */
+ --(*offset);
+ sg_set_page(&sgl[*offset], page, page_len, 0);
+
+ continue;
+ }
+
+ /*
+ * Make the first non-reported page in the free list
+ * the new head of the free list before we release the
+ * zone lock.
+ */
+ if (&page->lru != list && !list_is_first(&page->lru, list))
+ list_rotate_to_front(&page->lru, list);
+
+ /* release lock before waiting on report processing */
+ spin_unlock_irq(&zone->lock);
+
+ /* begin processing pages in local list */
+ err = prdev->report(prdev, sgl, PAGE_REPORTING_CAPACITY);
+
+ /* reset offset since the full list was reported */
+ *offset = PAGE_REPORTING_CAPACITY;
+
+ /* update budget to reflect call to report function */
+ budget--;
+
+ /* reacquire zone lock and resume processing */
+ spin_lock_irq(&zone->lock);
+
+ /* flush reported pages from the sg list */
+ page_reporting_drain(prdev, sgl, PAGE_REPORTING_CAPACITY, !err);
+
+ /*
+ * Reset next to first entry, the old next isn't valid
+ * since we dropped the lock to report the pages
+ */
+ next = list_first_entry(list, struct page, lru);
+
+ /* exit on error */
+ if (err)
+ break;
+ }
+
+ /* Rotate any leftover pages to the head of the freelist */
+ if (&next->lru != list && !list_is_first(&next->lru, list))
+ list_rotate_to_front(&next->lru, list);
+
+ spin_unlock_irq(&zone->lock);
+
+ return err;
+}
+
+static int
+page_reporting_process_zone(struct page_reporting_dev_info *prdev,
+ struct scatterlist *sgl, struct zone *zone)
+{
+ unsigned int order, mt, leftover, offset = PAGE_REPORTING_CAPACITY;
+ unsigned long watermark;
+ int err = 0;
+
+ /* Generate minimum watermark to be able to guarantee progress */
+ watermark = low_wmark_pages(zone) +
+ (PAGE_REPORTING_CAPACITY << PAGE_REPORTING_MIN_ORDER);
+
+ /*
+ * Cancel request if insufficient free memory or if we failed
+ * to allocate page reporting statistics for the zone.
+ */
+ if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
+ return err;
+
+ /* Process each free list starting from lowest order/mt */
+ for (order = PAGE_REPORTING_MIN_ORDER; order < MAX_ORDER; order++) {
+ for (mt = 0; mt < MIGRATE_TYPES; mt++) {
+ /* We do not pull pages from the isolate free list */
+ if (is_migrate_isolate(mt))
+ continue;
+
+ err = page_reporting_cycle(prdev, zone, order, mt,
+ sgl, &offset);
+ if (err)
+ return err;
+ }
+ }
+
+ /* report the leftover pages before going idle */
+ leftover = PAGE_REPORTING_CAPACITY - offset;
+ if (leftover) {
+ sgl = &sgl[offset];
+ err = prdev->report(prdev, sgl, leftover);
+
+ /* flush any remaining pages out from the last report */
+ spin_lock_irq(&zone->lock);
+ page_reporting_drain(prdev, sgl, leftover, !err);
+ spin_unlock_irq(&zone->lock);
+ }
+
+ return err;
+}
+
+static void page_reporting_process(struct work_struct *work)
+{
+ struct delayed_work *d_work = to_delayed_work(work);
+ struct page_reporting_dev_info *prdev =
+ container_of(d_work, struct page_reporting_dev_info, work);
+ int err = 0, state = PAGE_REPORTING_ACTIVE;
+ struct scatterlist *sgl;
+ struct zone *zone;
+
+ /*
+ * Change the state to "Active" so that we can track if there is
+ * anyone requests page reporting after we complete our pass. If
+ * the state is not altered by the end of the pass we will switch
+ * to idle and quit scheduling reporting runs.
+ */
+ atomic_set(&prdev->state, state);
+
+ /* allocate scatterlist to store pages being reported on */
+ sgl = kmalloc_array(PAGE_REPORTING_CAPACITY, sizeof(*sgl), GFP_KERNEL);
+ if (!sgl)
+ goto err_out;
+
+ sg_init_table(sgl, PAGE_REPORTING_CAPACITY);
+
+ for_each_zone(zone) {
+ err = page_reporting_process_zone(prdev, sgl, zone);
+ if (err)
+ break;
+ }
+
+ kfree(sgl);
+err_out:
+ /*
+ * If the state has reverted back to requested then there may be
+ * additional pages to be processed. We will defer for 2s to allow
+ * more pages to accumulate.
+ */
+ state = atomic_cmpxchg(&prdev->state, state, PAGE_REPORTING_IDLE);
+ if (state == PAGE_REPORTING_REQUESTED)
+ schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
+}
+
+static DEFINE_MUTEX(page_reporting_mutex);
+DEFINE_STATIC_KEY_FALSE(page_reporting_enabled);
+
+int page_reporting_register(struct page_reporting_dev_info *prdev)
+{
+ int err = 0;
+
+ mutex_lock(&page_reporting_mutex);
+
+ /* nothing to do if already in use */
+ if (rcu_access_pointer(pr_dev_info)) {
+ err = -EBUSY;
+ goto err_out;
+ }
+
+ /* initialize state and work structures */
+ atomic_set(&prdev->state, PAGE_REPORTING_IDLE);
+ INIT_DELAYED_WORK(&prdev->work, &page_reporting_process);
+
+ /* Begin initial flush of zones */
+ __page_reporting_request(prdev);
+
+ /* Assign device to allow notifications */
+ rcu_assign_pointer(pr_dev_info, prdev);
+
+ /* enable page reporting notification */
+ if (!static_key_enabled(&page_reporting_enabled)) {
+ static_branch_enable(&page_reporting_enabled);
+ pr_info("Free page reporting enabled\n");
+ }
+err_out:
+ mutex_unlock(&page_reporting_mutex);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(page_reporting_register);
+
+void page_reporting_unregister(struct page_reporting_dev_info *prdev)
+{
+ mutex_lock(&page_reporting_mutex);
+
+ if (rcu_access_pointer(pr_dev_info) == prdev) {
+ /* Disable page reporting notification */
+ RCU_INIT_POINTER(pr_dev_info, NULL);
+ synchronize_rcu();
+
+ /* Flush any existing work, and lock it out */
+ cancel_delayed_work_sync(&prdev->work);
+ }
+
+ mutex_unlock(&page_reporting_mutex);
+}
+EXPORT_SYMBOL_GPL(page_reporting_unregister);
diff --git a/mm/page_reporting.h b/mm/page_reporting.h
new file mode 100644
index 000000000000..aa6d37f4dc22
--- /dev/null
+++ b/mm/page_reporting.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _MM_PAGE_REPORTING_H
+#define _MM_PAGE_REPORTING_H
+
+#include <linux/mmzone.h>
+#include <linux/pageblock-flags.h>
+#include <linux/page-isolation.h>
+#include <linux/jump_label.h>
+#include <linux/slab.h>
+#include <asm/pgtable.h>
+#include <linux/scatterlist.h>
+
+#define PAGE_REPORTING_MIN_ORDER pageblock_order
+
+#ifdef CONFIG_PAGE_REPORTING
+DECLARE_STATIC_KEY_FALSE(page_reporting_enabled);
+void __page_reporting_notify(void);
+
+static inline bool page_reported(struct page *page)
+{
+ return static_branch_unlikely(&page_reporting_enabled) &&
+ PageReported(page);
+}
+
+/**
+ * page_reporting_notify_free - Free page notification to start page processing
+ *
+ * This function is meant to act as a screener for __page_reporting_notify
+ * which will determine if a give zone has crossed over the high-water mark
+ * that will justify us beginning page treatment. If we have crossed that
+ * threshold then it will start the process of pulling some pages and
+ * placing them in the batch list for treatment.
+ */
+static inline void page_reporting_notify_free(unsigned int order)
+{
+ /* Called from hot path in __free_one_page() */
+ if (!static_branch_unlikely(&page_reporting_enabled))
+ return;
+
+ /* Determine if we have crossed reporting threshold */
+ if (order < PAGE_REPORTING_MIN_ORDER)
+ return;
+
+ /* This will add a few cycles, but should be called infrequently */
+ __page_reporting_notify();
+}
+#else /* CONFIG_PAGE_REPORTING */
+#define page_reported(_page) false
+
+static inline void page_reporting_notify_free(unsigned int order)
+{
+}
+#endif /* CONFIG_PAGE_REPORTING */
+#endif /*_MM_PAGE_REPORTING_H */
diff --git a/mm/rmap.c b/mm/rmap.c
index 2df75a119c83..f79a206b271a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -275,19 +275,6 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
{
struct anon_vma_chain *avc, *pavc;
struct anon_vma *root = NULL;
- struct vm_area_struct *prev = dst->vm_prev, *pprev = src->vm_prev;
-
- /*
- * If parent share anon_vma with its vm_prev, keep this sharing in in
- * child.
- *
- * 1. Parent has vm_prev, which implies we have vm_prev.
- * 2. Parent and its vm_prev have the same anon_vma.
- */
- if (!dst->anon_vma && src->anon_vma &&
- pprev && pprev->anon_vma == src->anon_vma)
- dst->anon_vma = prev->anon_vma;
-
list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma;
@@ -946,7 +933,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
set_pte_at(vma->vm_mm, address, pte, entry);
ret = 1;
} else {
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
pmd_t *pmd = pvmw.pmd;
pmd_t entry;
@@ -1385,7 +1372,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
struct page *subpage;
bool ret = true;
struct mmu_notifier_range range;
- enum ttu_flags flags = (enum ttu_flags)arg;
+ enum ttu_flags flags = (enum ttu_flags)(long)arg;
/* munlock has nothing to gain from examining un-locked vmas */
if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
@@ -1515,6 +1502,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
swp_pte = swp_entry_to_pte(entry);
if (pte_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
/*
* No need to invalidate here it will synchronize on
@@ -1614,6 +1603,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
swp_pte = swp_entry_to_pte(entry);
if (pte_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
set_pte_at(mm, address, pvmw.pte, swp_pte);
/*
* No need to invalidate here it will synchronize on
@@ -1680,6 +1671,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
swp_pte = swp_entry_to_pte(entry);
if (pte_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
set_pte_at(mm, address, pvmw.pte, swp_pte);
/* Invalidate as we cleared the pte */
mmu_notifier_invalidate_range(mm, address,
diff --git a/mm/shmem.c b/mm/shmem.c
index f47347cb30f6..d722eb830317 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -410,7 +410,7 @@ static bool shmem_confirm_swap(struct address_space *mapping,
#define SHMEM_HUGE_DENY (-1)
#define SHMEM_HUGE_FORCE (-2)
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* ifdef here to avoid bloating shmem.o when not necessary */
static int shmem_huge __read_mostly;
@@ -580,7 +580,7 @@ static long shmem_unused_huge_count(struct super_block *sb,
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
return READ_ONCE(sbinfo->shrinklist_len);
}
-#else /* !CONFIG_TRANSPARENT_HUGE_PAGECACHE */
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
#define shmem_huge SHMEM_HUGE_DENY
@@ -589,11 +589,11 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
{
return 0;
}
-#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo)
{
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
(shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) &&
shmem_huge != SHMEM_HUGE_DENY)
return true;
@@ -789,6 +789,32 @@ void shmem_unlock_mapping(struct address_space *mapping)
}
/*
+ * Check whether a hole-punch or truncation needs to split a huge page,
+ * returning true if no split was required, or the split has been successful.
+ *
+ * Eviction (or truncation to 0 size) should never need to split a huge page;
+ * but in rare cases might do so, if shmem_undo_range() failed to trylock on
+ * head, and then succeeded to trylock on tail.
+ *
+ * A split can only succeed when there are no additional references on the
+ * huge page: so the split below relies upon find_get_entries() having stopped
+ * when it found a subpage of the huge page, without getting further references.
+ */
+static bool shmem_punch_compound(struct page *page, pgoff_t start, pgoff_t end)
+{
+ if (!PageTransCompound(page))
+ return true;
+
+ /* Just proceed to delete a huge page wholly within the range punched */
+ if (PageHead(page) &&
+ page->index >= start && page->index + HPAGE_PMD_NR <= end)
+ return true;
+
+ /* Try to split huge page, so we can truly punch the hole or truncate */
+ return split_huge_page(page) >= 0;
+}
+
+/*
* Remove range of pages and swap entries from page cache, and free them.
* If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
*/
@@ -838,31 +864,11 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
if (!trylock_page(page))
continue;
- if (PageTransTail(page)) {
- /* Middle of THP: zero out the page */
- clear_highpage(page);
- unlock_page(page);
- continue;
- } else if (PageTransHuge(page)) {
- if (index == round_down(end, HPAGE_PMD_NR)) {
- /*
- * Range ends in the middle of THP:
- * zero out the page
- */
- clear_highpage(page);
- unlock_page(page);
- continue;
- }
- index += HPAGE_PMD_NR - 1;
- i += HPAGE_PMD_NR - 1;
- }
-
- if (!unfalloc || !PageUptodate(page)) {
- VM_BUG_ON_PAGE(PageTail(page), page);
- if (page_mapping(page) == mapping) {
- VM_BUG_ON_PAGE(PageWriteback(page), page);
+ if ((!unfalloc || !PageUptodate(page)) &&
+ page_mapping(page) == mapping) {
+ VM_BUG_ON_PAGE(PageWriteback(page), page);
+ if (shmem_punch_compound(page, start, end))
truncate_inode_page(mapping, page);
- }
}
unlock_page(page);
}
@@ -936,43 +942,25 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
lock_page(page);
- if (PageTransTail(page)) {
- /* Middle of THP: zero out the page */
- clear_highpage(page);
- unlock_page(page);
- /*
- * Partial thp truncate due 'start' in middle
- * of THP: don't need to look on these pages
- * again on !pvec.nr restart.
- */
- if (index != round_down(end, HPAGE_PMD_NR))
- start++;
- continue;
- } else if (PageTransHuge(page)) {
- if (index == round_down(end, HPAGE_PMD_NR)) {
- /*
- * Range ends in the middle of THP:
- * zero out the page
- */
- clear_highpage(page);
- unlock_page(page);
- continue;
- }
- index += HPAGE_PMD_NR - 1;
- i += HPAGE_PMD_NR - 1;
- }
-
if (!unfalloc || !PageUptodate(page)) {
- VM_BUG_ON_PAGE(PageTail(page), page);
- if (page_mapping(page) == mapping) {
- VM_BUG_ON_PAGE(PageWriteback(page), page);
- truncate_inode_page(mapping, page);
- } else {
+ if (page_mapping(page) != mapping) {
/* Page was replaced by swap: retry */
unlock_page(page);
index--;
break;
}
+ VM_BUG_ON_PAGE(PageWriteback(page), page);
+ if (shmem_punch_compound(page, start, end))
+ truncate_inode_page(mapping, page);
+ else {
+ /* Wipe the page and don't get stuck */
+ clear_highpage(page);
+ flush_dcache_page(page);
+ set_page_dirty(page);
+ if (index <
+ round_up(start, HPAGE_PMD_NR))
+ start = index + 1;
+ }
}
unlock_page(page);
}
@@ -1059,7 +1047,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
* Part of the huge page can be beyond i_size: subject
* to shrink under memory pressure.
*/
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
spin_lock(&sbinfo->shrinklist_lock);
/*
* _careful to defend against unlocked access to
@@ -1472,9 +1460,6 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
pgoff_t hindex;
struct page *page;
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
- return NULL;
-
hindex = round_down(index, HPAGE_PMD_NR);
if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1,
XA_PRESENT))
@@ -1486,6 +1471,8 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
shmem_pseudo_vma_destroy(&pvma);
if (page)
prep_transhuge_page(page);
+ else
+ count_vm_event(THP_FILE_FALLBACK);
return page;
}
@@ -1511,7 +1498,7 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
int nr;
int err = -ENOSPC;
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
huge = false;
nr = huge ? HPAGE_PMD_NR : 1;
@@ -1813,17 +1800,20 @@ repeat:
if (shmem_huge == SHMEM_HUGE_FORCE)
goto alloc_huge;
switch (sbinfo->huge) {
- loff_t i_size;
- pgoff_t off;
case SHMEM_HUGE_NEVER:
goto alloc_nohuge;
- case SHMEM_HUGE_WITHIN_SIZE:
+ case SHMEM_HUGE_WITHIN_SIZE: {
+ loff_t i_size;
+ pgoff_t off;
+
off = round_up(index, HPAGE_PMD_NR);
i_size = round_up(i_size_read(inode), PAGE_SIZE);
if (i_size >= HPAGE_PMD_SIZE &&
i_size >> PAGE_SHIFT >= off)
goto alloc_huge;
- /* fallthrough */
+
+ fallthrough;
+ }
case SHMEM_HUGE_ADVISE:
if (sgp_huge == SGP_HUGE)
goto alloc_huge;
@@ -1871,8 +1861,13 @@ alloc_nohuge:
error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
PageTransHuge(page));
- if (error)
+ if (error) {
+ if (PageTransHuge(page)) {
+ count_vm_event(THP_FILE_FALLBACK);
+ count_vm_event(THP_FILE_FALLBACK_CHARGE);
+ }
goto unacct;
+ }
error = shmem_add_to_page_cache(page, mapping, hindex,
NULL, gfp & GFP_RECLAIM_MASK);
if (error) {
@@ -2089,7 +2084,7 @@ unsigned long shmem_get_unmapped_area(struct file *file,
get_area = current->mm->get_unmapped_area;
addr = get_area(file, uaddr, len, pgoff, flags);
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
return addr;
if (IS_ERR_VALUE(addr))
return addr;
@@ -2228,7 +2223,7 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
file_accessed(file);
vma->vm_ops = &shmem_vm_ops;
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
(vma->vm_end & HPAGE_PMD_MASK)) {
khugepaged_enter(vma, vma->vm_flags);
@@ -3113,12 +3108,9 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
error = security_inode_init_security(inode, dir, &dentry->d_name,
shmem_initxattrs, NULL);
- if (error) {
- if (error != -EOPNOTSUPP) {
- iput(inode);
- return error;
- }
- error = 0;
+ if (error && error != -EOPNOTSUPP) {
+ iput(inode);
+ return error;
}
inode->i_size = len-1;
@@ -3455,7 +3447,7 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
case Opt_huge:
ctx->huge = result.uint_32;
if (ctx->huge != SHMEM_HUGE_NEVER &&
- !(IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
+ !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
has_transparent_hugepage()))
goto unsupported_parameter;
ctx->seen |= SHMEM_SEEN_HUGE;
@@ -3601,7 +3593,7 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
seq_printf(seq, ",gid=%u",
from_kgid_munged(&init_user_ns, sbinfo->gid));
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
if (sbinfo->huge)
seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
@@ -3846,7 +3838,7 @@ static const struct super_operations shmem_ops = {
.evict_inode = shmem_evict_inode,
.drop_inode = generic_delete_inode,
.put_super = shmem_put_super,
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
.nr_cached_objects = shmem_unused_huge_count,
.free_cached_objects = shmem_unused_huge_scan,
#endif
@@ -3908,7 +3900,7 @@ int __init shmem_init(void)
goto out1;
}
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
else
@@ -3924,7 +3916,7 @@ out2:
return error;
}
-#if defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && defined(CONFIG_SYSFS)
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
static ssize_t shmem_enabled_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -3976,9 +3968,9 @@ static ssize_t shmem_enabled_store(struct kobject *kobj,
struct kobj_attribute shmem_enabled_attr =
__ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store);
-#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
bool shmem_huge_enabled(struct vm_area_struct *vma)
{
struct inode *inode = file_inode(vma->vm_file);
@@ -4004,7 +3996,7 @@ bool shmem_huge_enabled(struct vm_area_struct *vma)
if (i_size >= HPAGE_PMD_SIZE &&
i_size >> PAGE_SHIFT >= off)
return true;
- /* fall through */
+ fallthrough;
case SHMEM_HUGE_ADVISE:
/* TODO: implement fadvise() hints */
return (vma->vm_flags & VM_HUGEPAGE);
@@ -4013,7 +4005,7 @@ bool shmem_huge_enabled(struct vm_area_struct *vma)
return false;
}
}
-#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#else /* !CONFIG_SHMEM */
@@ -4182,7 +4174,7 @@ int shmem_zero_setup(struct vm_area_struct *vma)
vma->vm_file = file;
vma->vm_ops = &shmem_vm_ops;
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
(vma->vm_end & HPAGE_PMD_MASK)) {
khugepaged_enter(vma, vma->vm_flags);
diff --git a/mm/shuffle.c b/mm/shuffle.c
index c716059cbd3c..44406d9977c7 100644
--- a/mm/shuffle.c
+++ b/mm/shuffle.c
@@ -183,11 +183,11 @@ void __meminit __shuffle_free_memory(pg_data_t *pgdat)
shuffle_zone(z);
}
-void add_to_free_area_random(struct page *page, struct free_area *area,
- int migratetype)
+bool shuffle_pick_tail(void)
{
static u64 rand;
static u8 rand_bits;
+ bool ret;
/*
* The lack of locking is deliberate. If 2 threads race to
@@ -198,10 +198,10 @@ void add_to_free_area_random(struct page *page, struct free_area *area,
rand = get_random_u64();
}
- if (rand & 1)
- add_to_free_area(page, area, migratetype);
- else
- add_to_free_area_tail(page, area, migratetype);
+ ret = rand & 1;
+
rand_bits--;
rand >>= 1;
+
+ return ret;
}
diff --git a/mm/shuffle.h b/mm/shuffle.h
index 777a257a0d2f..4d79f03b6658 100644
--- a/mm/shuffle.h
+++ b/mm/shuffle.h
@@ -22,6 +22,7 @@ enum mm_shuffle_ctl {
DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
extern void page_alloc_shuffle(enum mm_shuffle_ctl ctl);
extern void __shuffle_free_memory(pg_data_t *pgdat);
+extern bool shuffle_pick_tail(void);
static inline void shuffle_free_memory(pg_data_t *pgdat)
{
if (!static_branch_unlikely(&page_alloc_shuffle_key))
@@ -44,6 +45,11 @@ static inline bool is_shuffle_order(int order)
return order >= SHUFFLE_ORDER;
}
#else
+static inline bool shuffle_pick_tail(void)
+{
+ return false;
+}
+
static inline void shuffle_free_memory(pg_data_t *pgdat)
{
}
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 5282f881d2f5..23c7500eea7d 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -731,7 +731,7 @@ static void kmemcg_rcufn(struct rcu_head *head)
/*
* We need to grab blocking locks. Bounce to ->work. The
* work item shares the space with the RCU head and can't be
- * initialized eariler.
+ * initialized earlier.
*/
INIT_WORK(&s->memcg_params.work, kmemcg_workfn);
queue_work(memcg_kmem_cache_wq, &s->memcg_params.work);
@@ -1581,6 +1581,7 @@ static int slabinfo_open(struct inode *inode, struct file *file)
}
static const struct proc_ops slabinfo_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
.proc_open = slabinfo_open,
.proc_read = seq_read,
.proc_write = slabinfo_write,
diff --git a/mm/slub.c b/mm/slub.c
index 3098e0cf2899..332d4b459a90 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -449,6 +449,7 @@ static DEFINE_SPINLOCK(object_map_lock);
* not vanish from under us.
*/
static unsigned long *get_map(struct kmem_cache *s, struct page *page)
+ __acquires(&object_map_lock)
{
void *p;
void *addr = page_address(page);
@@ -465,7 +466,7 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page)
return object_map;
}
-static void put_map(unsigned long *map)
+static void put_map(unsigned long *map) __releases(&object_map_lock)
{
VM_BUG_ON(map != object_map);
lockdep_assert_held(&object_map_lock);
diff --git a/mm/sparse.c b/mm/sparse.c
index f1af4d4ee80b..1aee5a481571 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -209,6 +209,7 @@ static inline unsigned long first_present_section_nr(void)
return next_present_section_nr(-1);
}
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
static void subsection_mask_set(unsigned long *map, unsigned long pfn,
unsigned long nr_pages)
{
@@ -243,6 +244,11 @@ void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
nr_pages -= pfns;
}
}
+#else
+void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
+{
+}
+#endif
/* Record a memory area against a node. */
void __init memory_present(int nid, unsigned long start, unsigned long end)
@@ -660,6 +666,55 @@ static void free_map_bootmem(struct page *memmap)
vmemmap_free(start, end, NULL);
}
+
+static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages)
+{
+ DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
+ DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 };
+ struct mem_section *ms = __pfn_to_section(pfn);
+ unsigned long *subsection_map = ms->usage
+ ? &ms->usage->subsection_map[0] : NULL;
+
+ subsection_mask_set(map, pfn, nr_pages);
+ if (subsection_map)
+ bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION);
+
+ if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION),
+ "section already deactivated (%#lx + %ld)\n",
+ pfn, nr_pages))
+ return -EINVAL;
+
+ bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
+ return 0;
+}
+
+static bool is_subsection_map_empty(struct mem_section *ms)
+{
+ return bitmap_empty(&ms->usage->subsection_map[0],
+ SUBSECTIONS_PER_SECTION);
+}
+
+static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages)
+{
+ struct mem_section *ms = __pfn_to_section(pfn);
+ DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
+ unsigned long *subsection_map;
+ int rc = 0;
+
+ subsection_mask_set(map, pfn, nr_pages);
+
+ subsection_map = &ms->usage->subsection_map[0];
+
+ if (bitmap_empty(map, SUBSECTIONS_PER_SECTION))
+ rc = -EINVAL;
+ else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION))
+ rc = -EEXIST;
+ else
+ bitmap_or(subsection_map, map, subsection_map,
+ SUBSECTIONS_PER_SECTION);
+
+ return rc;
+}
#else
struct page * __meminit populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
@@ -703,48 +758,51 @@ static void free_map_bootmem(struct page *memmap)
put_page_bootmem(page);
}
}
+
+static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages)
+{
+ return 0;
+}
+
+static bool is_subsection_map_empty(struct mem_section *ms)
+{
+ return true;
+}
+
+static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages)
+{
+ return 0;
+}
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+/*
+ * To deactivate a memory region, there are 3 cases to handle across
+ * two configurations (SPARSEMEM_VMEMMAP={y,n}):
+ *
+ * 1. deactivation of a partial hot-added section (only possible in
+ * the SPARSEMEM_VMEMMAP=y case).
+ * a) section was present at memory init.
+ * b) section was hot-added post memory init.
+ * 2. deactivation of a complete hot-added section.
+ * 3. deactivation of a complete section from memory init.
+ *
+ * For 1, when subsection_map does not empty we will not be freeing the
+ * usage map, but still need to free the vmemmap range.
+ *
+ * For 2 and 3, the SPARSEMEM_VMEMMAP={y,n} cases are unified
+ */
static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
struct vmem_altmap *altmap)
{
- DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
- DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 };
struct mem_section *ms = __pfn_to_section(pfn);
bool section_is_early = early_section(ms);
struct page *memmap = NULL;
bool empty;
- unsigned long *subsection_map = ms->usage
- ? &ms->usage->subsection_map[0] : NULL;
-
- subsection_mask_set(map, pfn, nr_pages);
- if (subsection_map)
- bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION);
- if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION),
- "section already deactivated (%#lx + %ld)\n",
- pfn, nr_pages))
+ if (clear_subsection_map(pfn, nr_pages))
return;
- /*
- * There are 3 cases to handle across two configurations
- * (SPARSEMEM_VMEMMAP={y,n}):
- *
- * 1/ deactivation of a partial hot-added section (only possible
- * in the SPARSEMEM_VMEMMAP=y case).
- * a/ section was present at memory init
- * b/ section was hot-added post memory init
- * 2/ deactivation of a complete hot-added section
- * 3/ deactivation of a complete section from memory init
- *
- * For 1/, when subsection_map does not empty we will not be
- * freeing the usage map, but still need to free the vmemmap
- * range.
- *
- * For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified
- */
- bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
- empty = bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION);
+ empty = is_subsection_map_empty(ms);
if (empty) {
unsigned long section_nr = pfn_to_section_nr(pfn);
@@ -780,31 +838,19 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
static struct page * __meminit section_activate(int nid, unsigned long pfn,
unsigned long nr_pages, struct vmem_altmap *altmap)
{
- DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
struct mem_section *ms = __pfn_to_section(pfn);
struct mem_section_usage *usage = NULL;
- unsigned long *subsection_map;
struct page *memmap;
int rc = 0;
- subsection_mask_set(map, pfn, nr_pages);
-
if (!ms->usage) {
usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
if (!usage)
return ERR_PTR(-ENOMEM);
ms->usage = usage;
}
- subsection_map = &ms->usage->subsection_map[0];
-
- if (bitmap_empty(map, SUBSECTIONS_PER_SECTION))
- rc = -EINVAL;
- else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION))
- rc = -EEXIST;
- else
- bitmap_or(subsection_map, map, subsection_map,
- SUBSECTIONS_PER_SECTION);
+ rc = fill_subsection_map(pfn, nr_pages);
if (rc) {
if (usage)
ms->usage = NULL;
@@ -840,6 +886,10 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn,
*
* This is only intended for hotplug.
*
+ * Note that only VMEMMAP supports sub-section aligned hotplug,
+ * the proper alignment and size are gated by check_pfn_span().
+ *
+ *
* Return:
* * 0 - On success.
* * -EEXIST - Section has been present.
diff --git a/mm/swap.c b/mm/swap.c
index a4af8c999963..bf9a79fed62d 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -276,7 +276,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec,
void *arg)
{
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
- int file = page_is_file_cache(page);
+ int file = page_is_file_lru(page);
int lru = page_lru_base_type(page);
del_page_from_lru_list(page, lruvec, lru);
@@ -394,7 +394,7 @@ void mark_page_accessed(struct page *page)
else
__lru_cache_activate_page(page);
ClearPageReferenced(page);
- if (page_is_file_cache(page))
+ if (page_is_file_lru(page))
workingset_activation(page);
}
if (page_is_idle(page))
@@ -515,7 +515,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
return;
active = PageActive(page);
- file = page_is_file_cache(page);
+ file = page_is_file_lru(page);
lru = page_lru_base_type(page);
del_page_from_lru_list(page, lruvec, lru + active);
@@ -548,7 +548,7 @@ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
void *arg)
{
if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
- int file = page_is_file_cache(page);
+ int file = page_is_file_lru(page);
int lru = page_lru_base_type(page);
del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
@@ -573,9 +573,9 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
ClearPageActive(page);
ClearPageReferenced(page);
/*
- * lazyfree pages are clean anonymous pages. They have
- * SwapBacked flag cleared to distinguish normal anonymous
- * pages
+ * Lazyfree pages are clean anonymous pages. They have
+ * PG_swapbacked flag cleared, to distinguish them from normal
+ * anonymous pages
*/
ClearPageSwapBacked(page);
add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE);
@@ -962,7 +962,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
if (page_evictable(page)) {
lru = page_lru(page);
- update_page_reclaim_stat(lruvec, page_is_file_cache(page),
+ update_page_reclaim_stat(lruvec, page_is_file_lru(page),
PageActive(page));
if (was_unevictable)
count_vm_event(UNEVICTABLE_PGRESCUED);
@@ -1004,6 +1004,10 @@ void __pagevec_lru_add(struct pagevec *pvec)
* ascending indexes. There may be holes in the indices due to
* not-present entries.
*
+ * Only one subpage of a Transparent Huge Page is returned in one call:
+ * allowing truncate_inode_pages_range() to evict the whole THP without
+ * cycling through a pagevec of extra references.
+ *
* pagevec_lookup_entries() returns the number of entries which were
* found.
*/
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 273a923c275c..5871a2aa86a5 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2797,6 +2797,7 @@ static int swaps_open(struct inode *inode, struct file *file)
}
static const struct proc_ops swaps_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
.proc_open = swaps_open,
.proc_read = seq_read,
.proc_lseek = seq_lseek,
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index bd96855f3961..512576e171ce 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -53,7 +53,8 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
- struct page **pagep)
+ struct page **pagep,
+ bool wp_copy)
{
struct mem_cgroup *memcg;
pte_t _dst_pte, *dst_pte;
@@ -99,9 +100,13 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false))
goto out_release;
- _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
- if (dst_vma->vm_flags & VM_WRITE)
- _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
+ _dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot));
+ if (dst_vma->vm_flags & VM_WRITE) {
+ if (wp_copy)
+ _dst_pte = pte_mkuffd_wp(_dst_pte);
+ else
+ _dst_pte = pte_mkwrite(_dst_pte);
+ }
dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
if (dst_vma->vm_file) {
@@ -415,7 +420,8 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
unsigned long dst_addr,
unsigned long src_addr,
struct page **page,
- bool zeropage)
+ bool zeropage,
+ bool wp_copy)
{
ssize_t err;
@@ -432,11 +438,13 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
if (!(dst_vma->vm_flags & VM_SHARED)) {
if (!zeropage)
err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
- dst_addr, src_addr, page);
+ dst_addr, src_addr, page,
+ wp_copy);
else
err = mfill_zeropage_pte(dst_mm, dst_pmd,
dst_vma, dst_addr);
} else {
+ VM_WARN_ON_ONCE(wp_copy);
if (!zeropage)
err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
dst_vma, dst_addr,
@@ -454,7 +462,8 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
unsigned long src_start,
unsigned long len,
bool zeropage,
- bool *mmap_changing)
+ bool *mmap_changing,
+ __u64 mode)
{
struct vm_area_struct *dst_vma;
ssize_t err;
@@ -462,6 +471,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
unsigned long src_addr, dst_addr;
long copied;
struct page *page;
+ bool wp_copy;
/*
* Sanitize the command parameters:
@@ -508,6 +518,14 @@ retry:
goto out_unlock;
/*
+ * validate 'mode' now that we know the dst_vma: don't allow
+ * a wrprotect copy if the userfaultfd didn't register as WP.
+ */
+ wp_copy = mode & UFFDIO_COPY_MODE_WP;
+ if (wp_copy && !(dst_vma->vm_flags & VM_UFFD_WP))
+ goto out_unlock;
+
+ /*
* If this is a HUGETLB vma, pass off to appropriate routine
*/
if (is_vm_hugetlb_page(dst_vma))
@@ -562,7 +580,7 @@ retry:
BUG_ON(pmd_trans_huge(*dst_pmd));
err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
- src_addr, &page, zeropage);
+ src_addr, &page, zeropage, wp_copy);
cond_resched();
if (unlikely(err == -ENOENT)) {
@@ -609,14 +627,68 @@ out:
ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
unsigned long src_start, unsigned long len,
- bool *mmap_changing)
+ bool *mmap_changing, __u64 mode)
{
return __mcopy_atomic(dst_mm, dst_start, src_start, len, false,
- mmap_changing);
+ mmap_changing, mode);
}
ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
unsigned long len, bool *mmap_changing)
{
- return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing);
+ return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing, 0);
+}
+
+int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
+ unsigned long len, bool enable_wp, bool *mmap_changing)
+{
+ struct vm_area_struct *dst_vma;
+ pgprot_t newprot;
+ int err;
+
+ /*
+ * Sanitize the command parameters:
+ */
+ BUG_ON(start & ~PAGE_MASK);
+ BUG_ON(len & ~PAGE_MASK);
+
+ /* Does the address range wrap, or is the span zero-sized? */
+ BUG_ON(start + len <= start);
+
+ down_read(&dst_mm->mmap_sem);
+
+ /*
+ * If memory mappings are changing because of non-cooperative
+ * operation (e.g. mremap) running in parallel, bail out and
+ * request the user to retry later
+ */
+ err = -EAGAIN;
+ if (mmap_changing && READ_ONCE(*mmap_changing))
+ goto out_unlock;
+
+ err = -ENOENT;
+ dst_vma = find_dst_vma(dst_mm, start, len);
+ /*
+ * Make sure the vma is not shared, that the dst range is
+ * both valid and fully within a single existing vma.
+ */
+ if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
+ goto out_unlock;
+ if (!userfaultfd_wp(dst_vma))
+ goto out_unlock;
+ if (!vma_is_anonymous(dst_vma))
+ goto out_unlock;
+
+ if (enable_wp)
+ newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE));
+ else
+ newprot = vm_get_page_prot(dst_vma->vm_flags);
+
+ change_protection(dst_vma, start, start + len, newprot,
+ enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE);
+
+ err = 0;
+out_unlock:
+ up_read(&dst_mm->mmap_sem);
+ return err;
}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 6b8eeb0ecee5..399f219544f7 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3368,7 +3368,7 @@ retry:
goto overflow;
/*
- * If required width exeeds current VA block, move
+ * If required width exceeds current VA block, move
* base downwards and then recheck.
*/
if (base + end > va->va_end) {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2e8e690d2813..b06868fc4926 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -919,7 +919,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
* exceptional entries and shadow exceptional entries in the
* same address_space.
*/
- if (reclaimed && page_is_file_cache(page) &&
+ if (reclaimed && page_is_file_lru(page) &&
!mapping_exiting(mapping) && !dax_mapping(mapping))
shadow = workingset_eviction(page, target_memcg);
__delete_from_page_cache(page, shadow);
@@ -1043,7 +1043,7 @@ static void page_check_dirty_writeback(struct page *page,
* Anonymous pages are not handled by flushers and must be written
* from reclaim context. Do not stall reclaim based on them
*/
- if (!page_is_file_cache(page) ||
+ if (!page_is_file_lru(page) ||
(PageAnon(page) && !PageSwapBacked(page))) {
*dirty = false;
*writeback = false;
@@ -1315,7 +1315,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* the rest of the LRU for clean pages and see
* the same dirty pages again (PageReclaim).
*/
- if (page_is_file_cache(page) &&
+ if (page_is_file_lru(page) &&
(!current_is_kswapd() || !PageReclaim(page) ||
!test_bit(PGDAT_DIRTY, &pgdat->flags))) {
/*
@@ -1459,7 +1459,7 @@ activate_locked:
try_to_free_swap(page);
VM_BUG_ON_PAGE(PageActive(page), page);
if (!PageMlocked(page)) {
- int type = page_is_file_cache(page);
+ int type = page_is_file_lru(page);
SetPageActive(page);
stat->nr_activate[type] += nr_pages;
count_memcg_page_event(page, PGACTIVATE);
@@ -1497,7 +1497,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
LIST_HEAD(clean_pages);
list_for_each_entry_safe(page, next, page_list, lru) {
- if (page_is_file_cache(page) && !PageDirty(page) &&
+ if (page_is_file_lru(page) && !PageDirty(page) &&
!__PageMovable(page) && !PageUnevictable(page)) {
ClearPageActive(page);
list_move(&page->lru, &clean_pages);
@@ -2053,7 +2053,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
* IO, plus JVM can create lots of anon VM_EXEC pages,
* so we ignore them here.
*/
- if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
+ if ((vm_flags & VM_EXEC) && page_is_file_lru(page)) {
list_add(&page->lru, &l_active);
continue;
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c9c0d71f917f..96d21a792b57 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1256,9 +1256,12 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
"thp_fault_alloc",
"thp_fault_fallback",
+ "thp_fault_fallback_charge",
"thp_collapse_alloc",
"thp_collapse_alloc_failed",
"thp_file_alloc",
+ "thp_file_fallback",
+ "thp_file_fallback_charge",
"thp_file_mapped",
"thp_split_page",
"thp_split_page_failed",
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 22d17ecfe7df..2f836a2b993f 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -424,7 +424,7 @@ static void *zs_zpool_map(void *pool, unsigned long handle,
case ZPOOL_MM_WO:
zs_mm = ZS_MM_WO;
break;
- case ZPOOL_MM_RW: /* fall through */
+ case ZPOOL_MM_RW:
default:
zs_mm = ZS_MM_RW;
break;
@@ -891,12 +891,12 @@ static inline int trypin_tag(unsigned long handle)
return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
}
-static void pin_tag(unsigned long handle)
+static void pin_tag(unsigned long handle) __acquires(bitlock)
{
bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
}
-static void unpin_tag(unsigned long handle)
+static void unpin_tag(unsigned long handle) __releases(bitlock)
{
bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
}
@@ -1833,12 +1833,12 @@ static void migrate_lock_init(struct zspage *zspage)
rwlock_init(&zspage->lock);
}
-static void migrate_read_lock(struct zspage *zspage)
+static void migrate_read_lock(struct zspage *zspage) __acquires(&zspage->lock)
{
read_lock(&zspage->lock);
}
-static void migrate_read_unlock(struct zspage *zspage)
+static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock)
{
read_unlock(&zspage->lock);
}
diff --git a/mm/zswap.c b/mm/zswap.c
index 55094e63b72d..fbb782924ccc 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -77,8 +77,8 @@ static bool zswap_pool_reached_full;
#define ZSWAP_PARAM_UNSET ""
-/* Enable/disable zswap (disabled by default) */
-static bool zswap_enabled;
+/* Enable/disable zswap */
+static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON);
static int zswap_enabled_param_set(const char *,
const struct kernel_param *);
static struct kernel_param_ops zswap_enabled_param_ops = {
@@ -88,8 +88,7 @@ static struct kernel_param_ops zswap_enabled_param_ops = {
module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644);
/* Crypto compressor to use */
-#define ZSWAP_COMPRESSOR_DEFAULT "lzo"
-static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
+static char *zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
static int zswap_compressor_param_set(const char *,
const struct kernel_param *);
static struct kernel_param_ops zswap_compressor_param_ops = {
@@ -101,8 +100,7 @@ module_param_cb(compressor, &zswap_compressor_param_ops,
&zswap_compressor, 0644);
/* Compressed storage zpool to use */
-#define ZSWAP_ZPOOL_DEFAULT "zbud"
-static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
+static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
static int zswap_zpool_param_set(const char *, const struct kernel_param *);
static struct kernel_param_ops zswap_zpool_param_ops = {
.set = zswap_zpool_param_set,
@@ -599,11 +597,12 @@ static __init struct zswap_pool *__zswap_pool_create_fallback(void)
bool has_comp, has_zpool;
has_comp = crypto_has_comp(zswap_compressor, 0, 0);
- if (!has_comp && strcmp(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT)) {
+ if (!has_comp && strcmp(zswap_compressor,
+ CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) {
pr_err("compressor %s not available, using default %s\n",
- zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT);
+ zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT);
param_free_charp(&zswap_compressor);
- zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
+ zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
has_comp = crypto_has_comp(zswap_compressor, 0, 0);
}
if (!has_comp) {
@@ -614,11 +613,12 @@ static __init struct zswap_pool *__zswap_pool_create_fallback(void)
}
has_zpool = zpool_has_pool(zswap_zpool_type);
- if (!has_zpool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
+ if (!has_zpool && strcmp(zswap_zpool_type,
+ CONFIG_ZSWAP_ZPOOL_DEFAULT)) {
pr_err("zpool %s not available, using default %s\n",
- zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT);
+ zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT);
param_free_charp(&zswap_zpool_type);
- zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
+ zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
has_zpool = zpool_has_pool(zswap_zpool_type);
}
if (!has_zpool) {
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 7cb992e55475..1344f232ecc5 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -383,11 +383,11 @@ static int client_options_show(struct seq_file *s, void *p)
return 0;
}
-CEPH_DEFINE_SHOW_FUNC(monmap_show)
-CEPH_DEFINE_SHOW_FUNC(osdmap_show)
-CEPH_DEFINE_SHOW_FUNC(monc_show)
-CEPH_DEFINE_SHOW_FUNC(osdc_show)
-CEPH_DEFINE_SHOW_FUNC(client_options_show)
+DEFINE_SHOW_ATTRIBUTE(monmap);
+DEFINE_SHOW_ATTRIBUTE(osdmap);
+DEFINE_SHOW_ATTRIBUTE(monc);
+DEFINE_SHOW_ATTRIBUTE(osdc);
+DEFINE_SHOW_ATTRIBUTE(client_options);
void __init ceph_debugfs_init(void)
{
@@ -414,31 +414,31 @@ void ceph_debugfs_client_init(struct ceph_client *client)
0400,
client->debugfs_dir,
client,
- &monc_show_fops);
+ &monc_fops);
client->osdc.debugfs_file = debugfs_create_file("osdc",
0400,
client->debugfs_dir,
client,
- &osdc_show_fops);
+ &osdc_fops);
client->debugfs_monmap = debugfs_create_file("monmap",
0400,
client->debugfs_dir,
client,
- &monmap_show_fops);
+ &monmap_fops);
client->debugfs_osdmap = debugfs_create_file("osdmap",
0400,
client->debugfs_dir,
client,
- &osdmap_show_fops);
+ &osdmap_fops);
client->debugfs_options = debugfs_create_file("client_options",
0400,
client->debugfs_dir,
client,
- &client_options_show_fops);
+ &client_options_fops);
}
void ceph_debugfs_client_cleanup(struct ceph_client *client)
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 9d9e4e4ea600..3d8c8015e976 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -467,7 +467,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
struct ceph_msg *msg)
{
struct ceph_client *client = monc->client;
- struct ceph_monmap *monmap = NULL, *old = monc->monmap;
+ struct ceph_monmap *monmap;
void *p, *end;
mutex_lock(&monc->mutex);
@@ -484,13 +484,13 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
goto out;
}
- if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
+ if (ceph_check_fsid(client, &monmap->fsid) < 0) {
kfree(monmap);
goto out;
}
- client->monc.monmap = monmap;
- kfree(old);
+ kfree(monc->monmap);
+ monc->monmap = monmap;
__ceph_monc_got_map(monc, CEPH_SUB_MONMAP, monc->monmap->epoch);
client->have_fsid = true;
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index af868d3923b9..998e26b75a78 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -3483,9 +3483,6 @@ static int ceph_redirect_decode(void **p, void *end,
goto e_inval;
}
- len = ceph_decode_32(p);
- *p += len; /* skip osd_instructions */
-
/* skip the rest */
*p = struct_end;
out:
@@ -5228,85 +5225,6 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
ceph_msgpool_destroy(&osdc->msgpool_op_reply);
}
-/*
- * Read some contiguous pages. If we cross a stripe boundary, shorten
- * *plen. Return number of bytes read, or error.
- */
-int ceph_osdc_readpages(struct ceph_osd_client *osdc,
- struct ceph_vino vino, struct ceph_file_layout *layout,
- u64 off, u64 *plen,
- u32 truncate_seq, u64 truncate_size,
- struct page **pages, int num_pages, int page_align)
-{
- struct ceph_osd_request *req;
- int rc = 0;
-
- dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
- vino.snap, off, *plen);
- req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1,
- CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
- NULL, truncate_seq, truncate_size,
- false);
- if (IS_ERR(req))
- return PTR_ERR(req);
-
- /* it may be a short read due to an object boundary */
- osd_req_op_extent_osd_data_pages(req, 0,
- pages, *plen, page_align, false, false);
-
- dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n",
- off, *plen, *plen, page_align);
-
- rc = ceph_osdc_start_request(osdc, req, false);
- if (!rc)
- rc = ceph_osdc_wait_request(osdc, req);
-
- ceph_osdc_put_request(req);
- dout("readpages result %d\n", rc);
- return rc;
-}
-EXPORT_SYMBOL(ceph_osdc_readpages);
-
-/*
- * do a synchronous write on N pages
- */
-int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
- struct ceph_file_layout *layout,
- struct ceph_snap_context *snapc,
- u64 off, u64 len,
- u32 truncate_seq, u64 truncate_size,
- struct timespec64 *mtime,
- struct page **pages, int num_pages)
-{
- struct ceph_osd_request *req;
- int rc = 0;
- int page_align = off & ~PAGE_MASK;
-
- req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
- CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
- snapc, truncate_seq, truncate_size,
- true);
- if (IS_ERR(req))
- return PTR_ERR(req);
-
- /* it may be a short write due to an object boundary */
- osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
- false, false);
- dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
-
- req->r_mtime = *mtime;
- rc = ceph_osdc_start_request(osdc, req, true);
- if (!rc)
- rc = ceph_osdc_wait_request(osdc, req);
-
- ceph_osdc_put_request(req);
- if (rc == 0)
- rc = len;
- dout("writepages result %d\n", rc);
- return rc;
-}
-EXPORT_SYMBOL(ceph_osdc_writepages);
-
static int osd_req_op_copy_from_init(struct ceph_osd_request *req,
u64 src_snapid, u64 src_version,
struct ceph_object_id *src_oid,
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 5bf8d22a47ec..39d37d0ef575 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1065,11 +1065,12 @@ static void neigh_timer_handler(struct timer_list *t)
neigh->updated = jiffies;
atomic_set(&neigh->probes, 0);
notify = 1;
- next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);
+ next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
+ HZ/100);
}
} else {
/* NUD_PROBE|NUD_INCOMPLETE */
- next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);
+ next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100);
}
if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) &&
@@ -1125,7 +1126,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
neigh->nud_state = NUD_INCOMPLETE;
neigh->updated = now;
next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
- HZ/2);
+ HZ/100);
neigh_add_timer(neigh, next);
immediate_probe = true;
} else {
@@ -1427,7 +1428,8 @@ void __neigh_set_probe_once(struct neighbour *neigh)
neigh->nud_state = NUD_INCOMPLETE;
atomic_set(&neigh->probes, neigh_max_probes(neigh));
neigh_add_timer(neigh,
- jiffies + NEIGH_VAR(neigh->parms, RETRANS_TIME));
+ jiffies + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
+ HZ/100));
}
EXPORT_SYMBOL(__neigh_set_probe_once);
diff --git a/net/core/sock.c b/net/core/sock.c
index da32d9b6d09f..ce1d8dce9b7a 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -574,7 +574,7 @@ static int sock_setbindtodevice_locked(struct sock *sk, int ifindex)
/* Sorry... */
ret = -EPERM;
- if (!ns_capable(net->user_ns, CAP_NET_RAW))
+ if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
goto out;
ret = -EINVAL;
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 5390ff541658..e94eb1aac602 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1338,7 +1338,7 @@ static void dsa_hw_port_list_free(struct list_head *hw_port_list)
}
/* Make the hardware datapath to/from @dev limited to a common MTU */
-void dsa_bridge_mtu_normalization(struct dsa_port *dp)
+static void dsa_bridge_mtu_normalization(struct dsa_port *dp)
{
struct list_head hw_port_list;
struct dsa_switch_tree *dst;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index a11fd4d67832..24e319dfb510 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1357,7 +1357,7 @@ retry:
regen_advance = idev->cnf.regen_max_retry *
idev->cnf.dad_transmits *
- NEIGH_VAR(idev->nd_parms, RETRANS_TIME) / HZ;
+ max(NEIGH_VAR(idev->nd_parms, RETRANS_TIME), HZ/100) / HZ;
/* recalculate max_desync_factor each time and update
* idev->desync_factor if it's larger
@@ -3298,6 +3298,10 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
if (netif_is_l3_master(idev->dev))
return;
+ /* no link local addresses on devices flagged as slaves */
+ if (idev->dev->flags & IFF_SLAVE)
+ return;
+
ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0);
switch (idev->cnf.addr_gen_mode) {
@@ -4117,7 +4121,8 @@ static void addrconf_dad_work(struct work_struct *w)
ifp->dad_probes--;
addrconf_mod_dad_work(ifp,
- NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME));
+ max(NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME),
+ HZ/100));
spin_unlock(&ifp->lock);
write_unlock_bh(&idev->lock);
@@ -4523,7 +4528,7 @@ restart:
!(ifp->flags&IFA_F_TENTATIVE)) {
unsigned long regen_advance = ifp->idev->cnf.regen_max_retry *
ifp->idev->cnf.dad_transmits *
- NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME) / HZ;
+ max(NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME), HZ/100) / HZ;
if (age >= ifp->prefered_lft - regen_advance) {
struct inet6_ifaddr *ifpub = ifp->ifpub;
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 6ffa153e5166..1ecd4e9b0bdf 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1359,8 +1359,8 @@ skip_defrtr:
if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/HZ) {
rtime = (rtime*HZ)/1000;
- if (rtime < HZ/10)
- rtime = HZ/10;
+ if (rtime < HZ/100)
+ rtime = HZ/100;
NEIGH_VAR_SET(in6_dev->nd_parms, RETRANS_TIME, rtime);
in6_dev->tstamp = jiffies;
send_ifinfo_notify = true;
diff --git a/net/ipv6/rpl.c b/net/ipv6/rpl.c
index dc4f20e23bf7..d38b476fc7f2 100644
--- a/net/ipv6/rpl.c
+++ b/net/ipv6/rpl.c
@@ -48,7 +48,7 @@ void ipv6_rpl_srh_decompress(struct ipv6_rpl_sr_hdr *outhdr,
outhdr->cmpri = 0;
outhdr->cmpre = 0;
- for (i = 0; i <= n; i++)
+ for (i = 0; i < n; i++)
ipv6_rpl_addr_decompress(&outhdr->rpl_segaddr[i], daddr,
ipv6_rpl_segdata_pos(inhdr, i),
inhdr->cmpri);
@@ -66,7 +66,7 @@ static unsigned char ipv6_rpl_srh_calc_cmpri(const struct ipv6_rpl_sr_hdr *inhdr
int i;
for (plen = 0; plen < sizeof(*daddr); plen++) {
- for (i = 0; i <= n; i++) {
+ for (i = 0; i < n; i++) {
if (daddr->s6_addr[plen] !=
inhdr->rpl_segaddr[i].s6_addr[plen])
return plen;
@@ -114,7 +114,7 @@ void ipv6_rpl_srh_compress(struct ipv6_rpl_sr_hdr *outhdr,
outhdr->cmpri = cmpri;
outhdr->cmpre = cmpre;
- for (i = 0; i <= n; i++)
+ for (i = 0; i < n; i++)
ipv6_rpl_addr_compress(ipv6_rpl_segdata_pos(outhdr, i),
&inhdr->rpl_segaddr[i], cmpri);
diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c
index a49ddc6cd020..c3ececd7cfc1 100644
--- a/net/ipv6/rpl_iptunnel.c
+++ b/net/ipv6/rpl_iptunnel.c
@@ -210,7 +210,7 @@ static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb)
struct dst_entry *orig_dst = skb_dst(skb);
struct dst_entry *dst = NULL;
struct rpl_lwt *rlwt;
- int err = -EINVAL;
+ int err;
rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate);
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index bd220ee4aac9..faf57585b892 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -4,6 +4,8 @@
* Copyright (c) 2017 - 2019, Intel Corporation.
*/
+#define pr_fmt(fmt) "MPTCP: " fmt
+
#include <linux/kernel.h>
#include <net/tcp.h>
#include <net/mptcp.h>
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index 064639f72487..977d9c8b1453 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -3,6 +3,8 @@
*
* Copyright (c) 2019, Intel Corporation.
*/
+#define pr_fmt(fmt) "MPTCP: " fmt
+
#include <linux/kernel.h>
#include <net/tcp.h>
#include <net/mptcp.h>
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index a0ce7f324499..86d61ab34c7c 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -4,6 +4,8 @@
* Copyright (c) 2020, Red Hat, Inc.
*/
+#define pr_fmt(fmt) "MPTCP: " fmt
+
#include <linux/inet.h>
#include <linux/kernel.h>
#include <net/tcp.h>
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 1833bc1f4a43..939a5045181a 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -57,10 +57,43 @@ static bool __mptcp_needs_tcp_fallback(const struct mptcp_sock *msk)
return msk->first && !sk_is_mptcp(msk->first);
}
+static struct socket *mptcp_is_tcpsk(struct sock *sk)
+{
+ struct socket *sock = sk->sk_socket;
+
+ if (sock->sk != sk)
+ return NULL;
+
+ if (unlikely(sk->sk_prot == &tcp_prot)) {
+ /* we are being invoked after mptcp_accept() has
+ * accepted a non-mp-capable flow: sk is a tcp_sk,
+ * not an mptcp one.
+ *
+ * Hand the socket over to tcp so all further socket ops
+ * bypass mptcp.
+ */
+ sock->ops = &inet_stream_ops;
+ return sock;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ } else if (unlikely(sk->sk_prot == &tcpv6_prot)) {
+ sock->ops = &inet6_stream_ops;
+ return sock;
+#endif
+ }
+
+ return NULL;
+}
+
static struct socket *__mptcp_tcp_fallback(struct mptcp_sock *msk)
{
+ struct socket *sock;
+
sock_owned_by_me((const struct sock *)msk);
+ sock = mptcp_is_tcpsk((struct sock *)msk);
+ if (unlikely(sock))
+ return sock;
+
if (likely(!__mptcp_needs_tcp_fallback(msk)))
return NULL;
@@ -84,6 +117,10 @@ static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state)
struct socket *ssock;
int err;
+ ssock = __mptcp_tcp_fallback(msk);
+ if (unlikely(ssock))
+ return ssock;
+
ssock = __mptcp_nmpc_socket(msk);
if (ssock)
goto set_state;
@@ -121,6 +158,27 @@ static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
MPTCP_SKB_CB(skb)->offset = offset;
}
+/* both sockets must be locked */
+static bool mptcp_subflow_dsn_valid(const struct mptcp_sock *msk,
+ struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ u64 dsn = mptcp_subflow_get_mapped_dsn(subflow);
+
+ /* revalidate data sequence number.
+ *
+ * mptcp_subflow_data_available() is usually called
+ * without msk lock. Its unlikely (but possible)
+ * that msk->ack_seq has been advanced since the last
+ * call found in-sequence data.
+ */
+ if (likely(dsn == msk->ack_seq))
+ return true;
+
+ subflow->data_avail = 0;
+ return mptcp_subflow_data_available(ssk);
+}
+
static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
struct sock *ssk,
unsigned int *bytes)
@@ -132,6 +190,11 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
struct tcp_sock *tp;
bool done = false;
+ if (!mptcp_subflow_dsn_valid(msk, ssk)) {
+ *bytes = 0;
+ return false;
+ }
+
if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
int rcvbuf = max(ssk->sk_rcvbuf, sk->sk_rcvbuf);
@@ -290,6 +353,15 @@ void mptcp_data_acked(struct sock *sk)
sock_hold(sk);
}
+void mptcp_subflow_eof(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ if (!test_and_set_bit(MPTCP_WORK_EOF, &msk->flags) &&
+ schedule_work(&msk->work))
+ sock_hold(sk);
+}
+
static void mptcp_stop_timer(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -994,6 +1066,27 @@ static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu)
return 0;
}
+static void mptcp_check_for_eof(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+ int receivers = 0;
+
+ mptcp_for_each_subflow(msk, subflow)
+ receivers += !subflow->rx_eof;
+
+ if (!receivers && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
+ /* hopefully temporary hack: propagate shutdown status
+ * to msk, when all subflows agree on it
+ */
+ sk->sk_shutdown |= RCV_SHUTDOWN;
+
+ smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
+ set_bit(MPTCP_DATA_READY, &msk->flags);
+ sk->sk_data_ready(sk);
+ }
+}
+
static void mptcp_worker(struct work_struct *work)
{
struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
@@ -1010,6 +1103,9 @@ static void mptcp_worker(struct work_struct *work)
__mptcp_flush_join_list(msk);
__mptcp_move_skbs(msk);
+ if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
+ mptcp_check_for_eof(msk);
+
if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
goto unlock;
@@ -1752,7 +1848,9 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
msk = mptcp_sk(sk);
lock_sock(sk);
- ssock = __mptcp_nmpc_socket(msk);
+ ssock = __mptcp_tcp_fallback(msk);
+ if (!ssock)
+ ssock = __mptcp_nmpc_socket(msk);
if (ssock) {
mask = ssock->ops->poll(file, ssock, wait);
release_sock(sk);
@@ -1762,9 +1860,6 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
release_sock(sk);
sock_poll_wait(file, sock, wait);
lock_sock(sk);
- ssock = __mptcp_tcp_fallback(msk);
- if (unlikely(ssock))
- return ssock->ops->poll(file, ssock, NULL);
if (test_bit(MPTCP_DATA_READY, &msk->flags))
mask = EPOLLIN | EPOLLRDNORM;
@@ -1783,11 +1878,17 @@ static int mptcp_shutdown(struct socket *sock, int how)
{
struct mptcp_sock *msk = mptcp_sk(sock->sk);
struct mptcp_subflow_context *subflow;
+ struct socket *ssock;
int ret = 0;
pr_debug("sk=%p, how=%d", msk, how);
lock_sock(sock->sk);
+ ssock = __mptcp_tcp_fallback(msk);
+ if (ssock) {
+ release_sock(sock->sk);
+ return inet_shutdown(ssock, how);
+ }
if (how == SHUT_WR || how == SHUT_RDWR)
inet_sk_state_store(sock->sk, TCP_FIN_WAIT1);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index f733c5425552..67448002a2d7 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -89,6 +89,7 @@
#define MPTCP_DATA_READY 0
#define MPTCP_SEND_SPACE 1
#define MPTCP_WORK_RTX 2
+#define MPTCP_WORK_EOF 3
static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field)
{
@@ -339,6 +340,7 @@ void mptcp_finish_connect(struct sock *sk);
void mptcp_data_ready(struct sock *sk, struct sock *ssk);
bool mptcp_finish_join(struct sock *sk);
void mptcp_data_acked(struct sock *sk);
+void mptcp_subflow_eof(struct sock *sk);
int mptcp_token_new_request(struct request_sock *req);
void mptcp_token_destroy_request(u32 token);
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index b5180c81588e..50a8bea987c6 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -994,8 +994,7 @@ static void subflow_state_change(struct sock *sk)
if (!(parent->sk_shutdown & RCV_SHUTDOWN) &&
!subflow->rx_eof && subflow_is_done(sk)) {
subflow->rx_eof = 1;
- parent->sk_shutdown |= RCV_SHUTDOWN;
- __subflow_state_change(parent);
+ mptcp_subflow_eof(parent);
}
}
diff --git a/net/mptcp/token.c b/net/mptcp/token.c
index 129a5ad1bc35..33352dd99d4d 100644
--- a/net/mptcp/token.c
+++ b/net/mptcp/token.c
@@ -40,7 +40,7 @@ static int token_used __read_mostly;
/**
* mptcp_token_new_request - create new key/idsn/token for subflow_request
- * @req - the request socket
+ * @req: the request socket
*
* This function is called when a new mptcp connection is coming in.
*
@@ -80,7 +80,7 @@ int mptcp_token_new_request(struct request_sock *req)
/**
* mptcp_token_new_connect - create new key/idsn/token for subflow
- * @sk - the socket that will initiate a connection
+ * @sk: the socket that will initiate a connection
*
* This function is called when a new outgoing mptcp connection is
* initiated.
@@ -125,6 +125,7 @@ int mptcp_token_new_connect(struct sock *sk)
/**
* mptcp_token_new_accept - insert token for later processing
* @token: the token to insert to the tree
+ * @conn: the just cloned socket linked to the new connection
*
* Called when a SYN packet creates a new logical connection, i.e.
* is not a join request.
@@ -169,7 +170,7 @@ struct mptcp_sock *mptcp_token_get_sock(u32 token)
/**
* mptcp_token_destroy_request - remove mptcp connection/token
- * @token - token of mptcp connection to remove
+ * @token: token of mptcp connection to remove
*
* Remove not-yet-fully-established incoming connection identified
* by @token.
@@ -183,7 +184,7 @@ void mptcp_token_destroy_request(u32 token)
/**
* mptcp_token_destroy - remove mptcp connection/token
- * @token - token of mptcp connection to remove
+ * @token: token of mptcp connection to remove
*
* Remove the connection identified by @token.
*/
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 292e71dc7ba4..0e0ded87e27b 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -83,7 +83,7 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \
nft_set_pipapo.o
ifdef CONFIG_X86_64
-ifneq (,$(findstring -DCONFIG_AS_AVX2=1,$(KBUILD_CFLAGS)))
+ifndef CONFIG_UML
nf_tables-objs += nft_set_pipapo_avx2.o
endif
endif
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index d0ab5ffa1e2c..4471393da6d8 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3291,7 +3291,7 @@ static const struct nft_set_type *nft_set_types[] = {
&nft_set_rhash_type,
&nft_set_bitmap_type,
&nft_set_rbtree_type,
-#if defined(CONFIG_X86_64) && defined(CONFIG_AS_AVX2)
+#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
&nft_set_pipapo_avx2_type,
#endif
&nft_set_pipapo_type,
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index 87aabf651cfe..8b5acc6910fd 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -2201,7 +2201,7 @@ const struct nft_set_type nft_set_pipapo_type = {
},
};
-#if defined(CONFIG_X86_64) && defined(CONFIG_AS_AVX2)
+#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
const struct nft_set_type nft_set_pipapo_avx2_type = {
.features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT |
NFT_SET_TIMEOUT,
diff --git a/net/netfilter/nft_set_pipapo_avx2.h b/net/netfilter/nft_set_pipapo_avx2.h
index 396caf7bfca8..394bcb704db7 100644
--- a/net/netfilter/nft_set_pipapo_avx2.h
+++ b/net/netfilter/nft_set_pipapo_avx2.h
@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _NFT_SET_PIPAPO_AVX2_H
-#ifdef CONFIG_AS_AVX2
+#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
#include <asm/fpu/xstate.h>
#define NFT_PIPAPO_ALIGN (XSAVE_YMM_SIZE / BITS_PER_BYTE)
@@ -9,6 +9,6 @@ bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
const u32 *key, const struct nft_set_ext **ext);
bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features,
struct nft_set_estimate *est);
-#endif /* CONFIG_AS_AVX2 */
+#endif /* defined(CONFIG_X86_64) && !defined(CONFIG_UML) */
#endif /* _NFT_SET_PIPAPO_AVX2_H */
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index fd8a01ca7a2d..2398d7238300 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -462,12 +462,14 @@ static void flow_table_copy_flows(struct table_instance *old,
struct hlist_head *head = &old->buckets[i];
if (ufid)
- hlist_for_each_entry(flow, head,
- ufid_table.node[old_ver])
+ hlist_for_each_entry_rcu(flow, head,
+ ufid_table.node[old_ver],
+ lockdep_ovsl_is_held())
ufid_table_instance_insert(new, flow);
else
- hlist_for_each_entry(flow, head,
- flow_table.node[old_ver])
+ hlist_for_each_entry_rcu(flow, head,
+ flow_table.node[old_ver],
+ lockdep_ovsl_is_held())
table_instance_insert(new, flow);
}
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index 9904299424a1..61e95029c18f 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -11,6 +11,7 @@
#include <linux/skbuff.h>
#include <linux/errno.h>
#include <linux/slab.h>
+#include <linux/refcount.h>
#include <net/act_api.h>
#include <net/netlink.h>
#include <net/pkt_cls.h>
@@ -26,9 +27,12 @@
#define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */
+struct tcindex_data;
+
struct tcindex_filter_result {
struct tcf_exts exts;
struct tcf_result res;
+ struct tcindex_data *p;
struct rcu_work rwork;
};
@@ -49,6 +53,7 @@ struct tcindex_data {
u32 hash; /* hash table size; 0 if undefined */
u32 alloc_hash; /* allocated size */
u32 fall_through; /* 0: only classify if explicit match */
+ refcount_t refcnt; /* a temporary refcnt for perfect hash */
struct rcu_work rwork;
};
@@ -57,6 +62,20 @@ static inline int tcindex_filter_is_set(struct tcindex_filter_result *r)
return tcf_exts_has_actions(&r->exts) || r->res.classid;
}
+static void tcindex_data_get(struct tcindex_data *p)
+{
+ refcount_inc(&p->refcnt);
+}
+
+static void tcindex_data_put(struct tcindex_data *p)
+{
+ if (refcount_dec_and_test(&p->refcnt)) {
+ kfree(p->perfect);
+ kfree(p->h);
+ kfree(p);
+ }
+}
+
static struct tcindex_filter_result *tcindex_lookup(struct tcindex_data *p,
u16 key)
{
@@ -132,6 +151,7 @@ static int tcindex_init(struct tcf_proto *tp)
p->mask = 0xffff;
p->hash = DEFAULT_HASH_SIZE;
p->fall_through = 1;
+ refcount_set(&p->refcnt, 1); /* Paired with tcindex_destroy_work() */
rcu_assign_pointer(tp->root, p);
return 0;
@@ -141,6 +161,7 @@ static void __tcindex_destroy_rexts(struct tcindex_filter_result *r)
{
tcf_exts_destroy(&r->exts);
tcf_exts_put_net(&r->exts);
+ tcindex_data_put(r->p);
}
static void tcindex_destroy_rexts_work(struct work_struct *work)
@@ -212,6 +233,8 @@ found:
else
__tcindex_destroy_fexts(f);
} else {
+ tcindex_data_get(p);
+
if (tcf_exts_get_net(&r->exts))
tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work);
else
@@ -228,9 +251,7 @@ static void tcindex_destroy_work(struct work_struct *work)
struct tcindex_data,
rwork);
- kfree(p->perfect);
- kfree(p->h);
- kfree(p);
+ tcindex_data_put(p);
}
static inline int
@@ -248,9 +269,11 @@ static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = {
};
static int tcindex_filter_result_init(struct tcindex_filter_result *r,
+ struct tcindex_data *p,
struct net *net)
{
memset(r, 0, sizeof(*r));
+ r->p = p;
return tcf_exts_init(&r->exts, net, TCA_TCINDEX_ACT,
TCA_TCINDEX_POLICE);
}
@@ -290,6 +313,7 @@ static int tcindex_alloc_perfect_hash(struct net *net, struct tcindex_data *cp)
TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
if (err < 0)
goto errout;
+ cp->perfect[i].p = cp;
}
return 0;
@@ -334,6 +358,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
cp->alloc_hash = p->alloc_hash;
cp->fall_through = p->fall_through;
cp->tp = tp;
+ refcount_set(&cp->refcnt, 1); /* Paired with tcindex_destroy_work() */
if (tb[TCA_TCINDEX_HASH])
cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]);
@@ -366,7 +391,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
}
cp->h = p->h;
- err = tcindex_filter_result_init(&new_filter_result, net);
+ err = tcindex_filter_result_init(&new_filter_result, cp, net);
if (err < 0)
goto errout_alloc;
if (old_r)
@@ -434,7 +459,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
goto errout_alloc;
f->key = handle;
f->next = NULL;
- err = tcindex_filter_result_init(&f->result, net);
+ err = tcindex_filter_result_init(&f->result, cp, net);
if (err < 0) {
kfree(f);
goto errout_alloc;
@@ -447,7 +472,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
}
if (old_r && old_r != r) {
- err = tcindex_filter_result_init(old_r, net);
+ err = tcindex_filter_result_init(old_r, cp, net);
if (err < 0) {
kfree(f);
goto errout_alloc;
@@ -571,6 +596,14 @@ static void tcindex_destroy(struct tcf_proto *tp, bool rtnl_held,
for (i = 0; i < p->hash; i++) {
struct tcindex_filter_result *r = p->perfect + i;
+ /* tcf_queue_work() does not guarantee the ordering we
+ * want, so we have to take this refcnt temporarily to
+ * ensure 'p' is freed after all tcindex_filter_result
+ * here. Imperfect hash does not need this, because it
+ * uses linked lists rather than an array.
+ */
+ tcindex_data_get(p);
+
tcf_unbind_filter(tp, &r->res);
if (tcf_exts_get_net(&r->exts))
tcf_queue_work(&r->rwork,
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index ee060d57d216..25fbd8d9de74 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -20,6 +20,7 @@
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/auth.h>
#include <linux/sunrpc/auth_gss.h>
+#include <linux/sunrpc/gss_krb5.h>
#include <linux/sunrpc/svcauth_gss.h>
#include <linux/sunrpc/gss_err.h>
#include <linux/workqueue.h>
@@ -1050,7 +1051,7 @@ gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
goto err_put_mech;
auth = &gss_auth->rpc_auth;
auth->au_cslack = GSS_CRED_SLACK >> 2;
- auth->au_rslack = GSS_VERF_SLACK >> 2;
+ auth->au_rslack = GSS_KRB5_MAX_SLACK_NEEDED >> 2;
auth->au_verfsize = GSS_VERF_SLACK >> 2;
auth->au_ralign = GSS_VERF_SLACK >> 2;
auth->au_flags = 0;
@@ -1724,8 +1725,9 @@ bad_mic:
goto out;
}
-static int gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
- struct rpc_task *task, struct xdr_stream *xdr)
+static noinline_for_stack int
+gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
+ struct rpc_task *task, struct xdr_stream *xdr)
{
struct rpc_rqst *rqstp = task->tk_rqstp;
struct xdr_buf integ_buf, *snd_buf = &rqstp->rq_snd_buf;
@@ -1816,8 +1818,9 @@ out:
return -EAGAIN;
}
-static int gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
- struct rpc_task *task, struct xdr_stream *xdr)
+static noinline_for_stack int
+gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
+ struct rpc_task *task, struct xdr_stream *xdr)
{
struct rpc_rqst *rqstp = task->tk_rqstp;
struct xdr_buf *snd_buf = &rqstp->rq_snd_buf;
@@ -1934,35 +1937,69 @@ gss_unwrap_resp_auth(struct rpc_cred *cred)
return 0;
}
-static int
+/*
+ * RFC 2203, Section 5.3.2.2
+ *
+ * struct rpc_gss_integ_data {
+ * opaque databody_integ<>;
+ * opaque checksum<>;
+ * };
+ *
+ * struct rpc_gss_data_t {
+ * unsigned int seq_num;
+ * proc_req_arg_t arg;
+ * };
+ */
+static noinline_for_stack int
gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred,
struct gss_cl_ctx *ctx, struct rpc_rqst *rqstp,
struct xdr_stream *xdr)
{
- struct xdr_buf integ_buf, *rcv_buf = &rqstp->rq_rcv_buf;
- u32 data_offset, mic_offset, integ_len, maj_stat;
+ struct xdr_buf gss_data, *rcv_buf = &rqstp->rq_rcv_buf;
struct rpc_auth *auth = cred->cr_auth;
+ u32 len, offset, seqno, maj_stat;
struct xdr_netobj mic;
- __be32 *p;
+ int ret;
- p = xdr_inline_decode(xdr, 2 * sizeof(*p));
- if (unlikely(!p))
+ ret = -EIO;
+ mic.data = NULL;
+
+ /* opaque databody_integ<>; */
+ if (xdr_stream_decode_u32(xdr, &len))
goto unwrap_failed;
- integ_len = be32_to_cpup(p++);
- if (integ_len & 3)
+ if (len & 3)
goto unwrap_failed;
- data_offset = (u8 *)(p) - (u8 *)rcv_buf->head[0].iov_base;
- mic_offset = integ_len + data_offset;
- if (mic_offset > rcv_buf->len)
+ offset = rcv_buf->len - xdr_stream_remaining(xdr);
+ if (xdr_stream_decode_u32(xdr, &seqno))
goto unwrap_failed;
- if (be32_to_cpup(p) != rqstp->rq_seqno)
+ if (seqno != rqstp->rq_seqno)
goto bad_seqno;
+ if (xdr_buf_subsegment(rcv_buf, &gss_data, offset, len))
+ goto unwrap_failed;
- if (xdr_buf_subsegment(rcv_buf, &integ_buf, data_offset, integ_len))
+ /*
+ * The xdr_stream now points to the beginning of the
+ * upper layer payload, to be passed below to
+ * rpcauth_unwrap_resp_decode(). The checksum, which
+ * follows the upper layer payload in @rcv_buf, is
+ * located and parsed without updating the xdr_stream.
+ */
+
+ /* opaque checksum<>; */
+ offset += len;
+ if (xdr_decode_word(rcv_buf, offset, &len))
+ goto unwrap_failed;
+ offset += sizeof(__be32);
+ if (offset + len > rcv_buf->len)
goto unwrap_failed;
- if (xdr_buf_read_mic(rcv_buf, &mic, mic_offset))
+ mic.len = len;
+ mic.data = kmalloc(len, GFP_NOFS);
+ if (!mic.data)
+ goto unwrap_failed;
+ if (read_bytes_from_xdr_buf(rcv_buf, offset, mic.data, mic.len))
goto unwrap_failed;
- maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, &mic);
+
+ maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &gss_data, &mic);
if (maj_stat == GSS_S_CONTEXT_EXPIRED)
clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
if (maj_stat != GSS_S_COMPLETE)
@@ -1970,19 +2007,24 @@ gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred,
auth->au_rslack = auth->au_verfsize + 2 + 1 + XDR_QUADLEN(mic.len);
auth->au_ralign = auth->au_verfsize + 2;
- return 0;
+ ret = 0;
+
+out:
+ kfree(mic.data);
+ return ret;
+
unwrap_failed:
trace_rpcgss_unwrap_failed(task);
- return -EIO;
+ goto out;
bad_seqno:
- trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, be32_to_cpup(p));
- return -EIO;
+ trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, seqno);
+ goto out;
bad_mic:
trace_rpcgss_verify_mic(task, maj_stat);
- return -EIO;
+ goto out;
}
-static int
+static noinline_for_stack int
gss_unwrap_resp_priv(struct rpc_task *task, struct rpc_cred *cred,
struct gss_cl_ctx *ctx, struct rpc_rqst *rqstp,
struct xdr_stream *xdr)
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 07992d3e8703..325a0858700f 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1099,8 +1099,9 @@ rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg)
task->tk_msg.rpc_proc = msg->rpc_proc;
task->tk_msg.rpc_argp = msg->rpc_argp;
task->tk_msg.rpc_resp = msg->rpc_resp;
- if (msg->rpc_cred != NULL)
- task->tk_msg.rpc_cred = get_cred(msg->rpc_cred);
+ task->tk_msg.rpc_cred = msg->rpc_cred;
+ if (!(task->tk_flags & RPC_TASK_CRED_NOREF))
+ get_cred(task->tk_msg.rpc_cred);
}
}
@@ -1126,6 +1127,9 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data)
task = rpc_new_task(task_setup_data);
+ if (!RPC_IS_ASYNC(task))
+ task->tk_flags |= RPC_TASK_CRED_NOREF;
+
rpc_task_set_client(task, task_setup_data->rpc_client);
rpc_task_set_rpc_message(task, task_setup_data->rpc_message);
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 55e900255b0c..7eba20a88438 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -204,10 +204,6 @@ static void __rpc_add_wait_queue(struct rpc_wait_queue *queue,
struct rpc_task *task,
unsigned char queue_priority)
{
- WARN_ON_ONCE(RPC_IS_QUEUED(task));
- if (RPC_IS_QUEUED(task))
- return;
-
INIT_LIST_HEAD(&task->u.tk_wait.timer_list);
if (RPC_IS_PRIORITY(queue))
__rpc_add_wait_queue_priority(queue, task, queue_priority);
@@ -382,7 +378,7 @@ static void rpc_make_runnable(struct workqueue_struct *wq,
* NB: An RPC task will only receive interrupt-driven events as long
* as it's on a wait queue.
*/
-static void __rpc_sleep_on_priority(struct rpc_wait_queue *q,
+static void __rpc_do_sleep_on_priority(struct rpc_wait_queue *q,
struct rpc_task *task,
unsigned char queue_priority)
{
@@ -395,12 +391,23 @@ static void __rpc_sleep_on_priority(struct rpc_wait_queue *q,
}
+static void __rpc_sleep_on_priority(struct rpc_wait_queue *q,
+ struct rpc_task *task,
+ unsigned char queue_priority)
+{
+ if (WARN_ON_ONCE(RPC_IS_QUEUED(task)))
+ return;
+ __rpc_do_sleep_on_priority(q, task, queue_priority);
+}
+
static void __rpc_sleep_on_priority_timeout(struct rpc_wait_queue *q,
struct rpc_task *task, unsigned long timeout,
unsigned char queue_priority)
{
+ if (WARN_ON_ONCE(RPC_IS_QUEUED(task)))
+ return;
if (time_is_after_jiffies(timeout)) {
- __rpc_sleep_on_priority(q, task, queue_priority);
+ __rpc_do_sleep_on_priority(q, task, queue_priority);
__rpc_add_timer(q, task, timeout);
} else
task->tk_status = -ETIMEDOUT;
@@ -1162,7 +1169,8 @@ static void rpc_release_resources_task(struct rpc_task *task)
{
xprt_release(task);
if (task->tk_msg.rpc_cred) {
- put_cred(task->tk_msg.rpc_cred);
+ if (!(task->tk_flags & RPC_TASK_CRED_NOREF))
+ put_cred(task->tk_msg.rpc_cred);
task->tk_msg.rpc_cred = NULL;
}
rpc_task_release_client(task);
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index e5497dc2475b..15b58c5144f9 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -1235,61 +1235,6 @@ xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj)
}
EXPORT_SYMBOL_GPL(xdr_encode_word);
-/**
- * xdr_buf_read_mic() - obtain the address of the GSS mic from xdr buf
- * @buf: pointer to buffer containing a mic
- * @mic: on success, returns the address of the mic
- * @offset: the offset in buf where mic may be found
- *
- * This function may modify the xdr buf if the mic is found to be straddling
- * a boundary between head, pages, and tail. On success the mic can be read
- * from the address returned. There is no need to free the mic.
- *
- * Return: Success returns 0, otherwise an integer error.
- */
-int xdr_buf_read_mic(struct xdr_buf *buf, struct xdr_netobj *mic, unsigned int offset)
-{
- struct xdr_buf subbuf;
- unsigned int boundary;
-
- if (xdr_decode_word(buf, offset, &mic->len))
- return -EFAULT;
- offset += 4;
-
- /* Is the mic partially in the head? */
- boundary = buf->head[0].iov_len;
- if (offset < boundary && (offset + mic->len) > boundary)
- xdr_shift_buf(buf, boundary - offset);
-
- /* Is the mic partially in the pages? */
- boundary += buf->page_len;
- if (offset < boundary && (offset + mic->len) > boundary)
- xdr_shrink_pagelen(buf, boundary - offset);
-
- if (xdr_buf_subsegment(buf, &subbuf, offset, mic->len))
- return -EFAULT;
-
- /* Is the mic contained entirely in the head? */
- mic->data = subbuf.head[0].iov_base;
- if (subbuf.head[0].iov_len == mic->len)
- return 0;
- /* ..or is the mic contained entirely in the tail? */
- mic->data = subbuf.tail[0].iov_base;
- if (subbuf.tail[0].iov_len == mic->len)
- return 0;
-
- /* Find a contiguous area in @buf to hold all of @mic */
- if (mic->len > buf->buflen - buf->len)
- return -ENOMEM;
- if (buf->tail[0].iov_len != 0)
- mic->data = buf->tail[0].iov_base + buf->tail[0].iov_len;
- else
- mic->data = buf->head[0].iov_base + buf->head[0].iov_len;
- __read_bytes_from_xdr_buf(&subbuf, mic->data, mic->len);
- return 0;
-}
-EXPORT_SYMBOL_GPL(xdr_buf_read_mic);
-
/* Returns 0 on success, or else a negative error code. */
static int
xdr_xcode_array2(struct xdr_buf *buf, unsigned int base,
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 1a0ae0c61353..c92c1aac270a 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -44,10 +44,10 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs)
size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
size_t maxmsg;
- maxmsg = min_t(unsigned int, ep->rep_inline_send, ep->rep_inline_recv);
+ maxmsg = min_t(unsigned int, ep->re_inline_send, ep->re_inline_recv);
maxmsg = min_t(unsigned int, maxmsg, PAGE_SIZE);
return maxmsg - RPCRDMA_HDRLEN_MIN;
}
@@ -115,7 +115,7 @@ int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst)
if (rc < 0)
goto failed_marshal;
- if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req))
+ if (rpcrdma_post_sends(r_xprt, req))
goto drop_connection;
return 0;
@@ -190,7 +190,7 @@ create_req:
if (xprt->bc_alloc_count >= RPCRDMA_BACKWARD_WRS)
return NULL;
- size = min_t(size_t, r_xprt->rx_ep.rep_inline_recv, PAGE_SIZE);
+ size = min_t(size_t, r_xprt->rx_ep->re_inline_recv, PAGE_SIZE);
req = rpcrdma_req_create(r_xprt, size, GFP_KERNEL);
if (!req)
return NULL;
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 125297c9aa3e..ef997880e17a 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -52,7 +52,7 @@
/**
* frwr_release_mr - Destroy one MR
- * @mr: MR allocated by frwr_init_mr
+ * @mr: MR allocated by frwr_mr_init
*
*/
void frwr_release_mr(struct rpcrdma_mr *mr)
@@ -74,7 +74,7 @@ static void frwr_mr_recycle(struct rpcrdma_mr *mr)
if (mr->mr_dir != DMA_NONE) {
trace_xprtrdma_mr_unmap(mr);
- ib_dma_unmap_sg(r_xprt->rx_ia.ri_id->device,
+ ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device,
mr->mr_sg, mr->mr_nents, mr->mr_dir);
mr->mr_dir = DMA_NONE;
}
@@ -106,21 +106,22 @@ void frwr_reset(struct rpcrdma_req *req)
}
/**
- * frwr_init_mr - Initialize one MR
- * @ia: interface adapter
+ * frwr_mr_init - Initialize one MR
+ * @r_xprt: controlling transport instance
* @mr: generic MR to prepare for FRWR
*
* Returns zero if successful. Otherwise a negative errno
* is returned.
*/
-int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
+int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
{
- unsigned int depth = ia->ri_max_frwr_depth;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
+ unsigned int depth = ep->re_max_fr_depth;
struct scatterlist *sg;
struct ib_mr *frmr;
int rc;
- frmr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth);
+ frmr = ib_alloc_mr(ep->re_pd, ep->re_mrtype, depth);
if (IS_ERR(frmr))
goto out_mr_err;
@@ -128,6 +129,7 @@ int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
if (!sg)
goto out_list_err;
+ mr->mr_xprt = r_xprt;
mr->frwr.fr_mr = frmr;
mr->mr_dir = DMA_NONE;
INIT_LIST_HEAD(&mr->mr_list);
@@ -149,29 +151,24 @@ out_list_err:
/**
* frwr_query_device - Prepare a transport for use with FRWR
- * @r_xprt: controlling transport instance
+ * @ep: endpoint to fill in
* @device: RDMA device to query
*
* On success, sets:
- * ep->rep_attr
- * ep->rep_max_requests
- * ia->ri_max_rdma_segs
- *
- * And these FRWR-related fields:
- * ia->ri_max_frwr_depth
- * ia->ri_mrtype
+ * ep->re_attr
+ * ep->re_max_requests
+ * ep->re_max_rdma_segs
+ * ep->re_max_fr_depth
+ * ep->re_mrtype
*
* Return values:
* On success, returns zero.
* %-EINVAL - the device does not support FRWR memory registration
* %-ENOMEM - the device is not sufficiently capable for NFS/RDMA
*/
-int frwr_query_device(struct rpcrdma_xprt *r_xprt,
- const struct ib_device *device)
+int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device)
{
const struct ib_device_attr *attrs = &device->attrs;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
int max_qp_wr, depth, delta;
unsigned int max_sge;
@@ -188,23 +185,23 @@ int frwr_query_device(struct rpcrdma_xprt *r_xprt,
pr_err("rpcrdma: HCA provides only %u send SGEs\n", max_sge);
return -ENOMEM;
}
- ep->rep_attr.cap.max_send_sge = max_sge;
- ep->rep_attr.cap.max_recv_sge = 1;
+ ep->re_attr.cap.max_send_sge = max_sge;
+ ep->re_attr.cap.max_recv_sge = 1;
- ia->ri_mrtype = IB_MR_TYPE_MEM_REG;
+ ep->re_mrtype = IB_MR_TYPE_MEM_REG;
if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG)
- ia->ri_mrtype = IB_MR_TYPE_SG_GAPS;
+ ep->re_mrtype = IB_MR_TYPE_SG_GAPS;
/* Quirk: Some devices advertise a large max_fast_reg_page_list_len
* capability, but perform optimally when the MRs are not larger
* than a page.
*/
if (attrs->max_sge_rd > RPCRDMA_MAX_HDR_SEGS)
- ia->ri_max_frwr_depth = attrs->max_sge_rd;
+ ep->re_max_fr_depth = attrs->max_sge_rd;
else
- ia->ri_max_frwr_depth = attrs->max_fast_reg_page_list_len;
- if (ia->ri_max_frwr_depth > RPCRDMA_MAX_DATA_SEGS)
- ia->ri_max_frwr_depth = RPCRDMA_MAX_DATA_SEGS;
+ ep->re_max_fr_depth = attrs->max_fast_reg_page_list_len;
+ if (ep->re_max_fr_depth > RPCRDMA_MAX_DATA_SEGS)
+ ep->re_max_fr_depth = RPCRDMA_MAX_DATA_SEGS;
/* Add room for frwr register and invalidate WRs.
* 1. FRWR reg WR for head
@@ -220,11 +217,11 @@ int frwr_query_device(struct rpcrdma_xprt *r_xprt,
/* Calculate N if the device max FRWR depth is smaller than
* RPCRDMA_MAX_DATA_SEGS.
*/
- if (ia->ri_max_frwr_depth < RPCRDMA_MAX_DATA_SEGS) {
- delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frwr_depth;
+ if (ep->re_max_fr_depth < RPCRDMA_MAX_DATA_SEGS) {
+ delta = RPCRDMA_MAX_DATA_SEGS - ep->re_max_fr_depth;
do {
depth += 2; /* FRWR reg + invalidate */
- delta -= ia->ri_max_frwr_depth;
+ delta -= ep->re_max_fr_depth;
} while (delta > 0);
}
@@ -233,34 +230,34 @@ int frwr_query_device(struct rpcrdma_xprt *r_xprt,
max_qp_wr -= 1;
if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE)
return -ENOMEM;
- if (ep->rep_max_requests > max_qp_wr)
- ep->rep_max_requests = max_qp_wr;
- ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth;
- if (ep->rep_attr.cap.max_send_wr > max_qp_wr) {
- ep->rep_max_requests = max_qp_wr / depth;
- if (!ep->rep_max_requests)
+ if (ep->re_max_requests > max_qp_wr)
+ ep->re_max_requests = max_qp_wr;
+ ep->re_attr.cap.max_send_wr = ep->re_max_requests * depth;
+ if (ep->re_attr.cap.max_send_wr > max_qp_wr) {
+ ep->re_max_requests = max_qp_wr / depth;
+ if (!ep->re_max_requests)
return -ENOMEM;
- ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth;
+ ep->re_attr.cap.max_send_wr = ep->re_max_requests * depth;
}
- ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
- ep->rep_attr.cap.max_send_wr += 1; /* for ib_drain_sq */
- ep->rep_attr.cap.max_recv_wr = ep->rep_max_requests;
- ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
- ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */
-
- ia->ri_max_rdma_segs =
- DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ia->ri_max_frwr_depth);
+ ep->re_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
+ ep->re_attr.cap.max_send_wr += 1; /* for ib_drain_sq */
+ ep->re_attr.cap.max_recv_wr = ep->re_max_requests;
+ ep->re_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
+ ep->re_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */
+
+ ep->re_max_rdma_segs =
+ DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ep->re_max_fr_depth);
/* Reply chunks require segments for head and tail buffers */
- ia->ri_max_rdma_segs += 2;
- if (ia->ri_max_rdma_segs > RPCRDMA_MAX_HDR_SEGS)
- ia->ri_max_rdma_segs = RPCRDMA_MAX_HDR_SEGS;
+ ep->re_max_rdma_segs += 2;
+ if (ep->re_max_rdma_segs > RPCRDMA_MAX_HDR_SEGS)
+ ep->re_max_rdma_segs = RPCRDMA_MAX_HDR_SEGS;
/* Ensure the underlying device is capable of conveying the
* largest r/wsize NFS will ask for. This guarantees that
* failing over from one RDMA device to another will not
* break NFS I/O.
*/
- if ((ia->ri_max_rdma_segs * ia->ri_max_frwr_depth) < RPCRDMA_MAX_SEGS)
+ if ((ep->re_max_rdma_segs * ep->re_max_fr_depth) < RPCRDMA_MAX_SEGS)
return -ENOMEM;
return 0;
@@ -286,14 +283,14 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
int nsegs, bool writing, __be32 xid,
struct rpcrdma_mr *mr)
{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
struct ib_reg_wr *reg_wr;
int i, n, dma_nents;
struct ib_mr *ibmr;
u8 key;
- if (nsegs > ia->ri_max_frwr_depth)
- nsegs = ia->ri_max_frwr_depth;
+ if (nsegs > ep->re_max_fr_depth)
+ nsegs = ep->re_max_fr_depth;
for (i = 0; i < nsegs;) {
if (seg->mr_page)
sg_set_page(&mr->mr_sg[i],
@@ -306,7 +303,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
++seg;
++i;
- if (ia->ri_mrtype == IB_MR_TYPE_SG_GAPS)
+ if (ep->re_mrtype == IB_MR_TYPE_SG_GAPS)
continue;
if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
@@ -315,7 +312,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
mr->mr_dir = rpcrdma_data_dir(writing);
mr->mr_nents = i;
- dma_nents = ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, mr->mr_nents,
+ dma_nents = ib_dma_map_sg(ep->re_id->device, mr->mr_sg, mr->mr_nents,
mr->mr_dir);
if (!dma_nents)
goto out_dmamap_err;
@@ -356,8 +353,8 @@ out_mapmr_err:
/**
* frwr_wc_fastreg - Invoked by RDMA provider for a flushed FastReg WC
- * @cq: completion queue (ignored)
- * @wc: completed WR
+ * @cq: completion queue
+ * @wc: WCE for a completed FastReg WR
*
*/
static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
@@ -369,20 +366,25 @@ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
/* WARNING: Only wr_cqe and status are reliable at this point */
trace_xprtrdma_wc_fastreg(wc, frwr);
/* The MR will get recycled when the associated req is retransmitted */
+
+ rpcrdma_flush_disconnect(cq, wc);
}
/**
- * frwr_send - post Send WR containing the RPC Call message
- * @ia: interface adapter
- * @req: Prepared RPC Call
+ * frwr_send - post Send WRs containing the RPC Call message
+ * @r_xprt: controlling transport instance
+ * @req: prepared RPC Call
*
* For FRWR, chain any FastReg WRs to the Send WR. Only a
* single ib_post_send call is needed to register memory
* and then post the Send WR.
*
- * Returns the result of ib_post_send.
+ * Returns the return code from ib_post_send.
+ *
+ * Caller must hold the transport send lock to ensure that the
+ * pointers to the transport's rdma_cm_id and QP are stable.
*/
-int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
+int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
struct ib_send_wr *post_wr;
struct rpcrdma_mr *mr;
@@ -403,7 +405,7 @@ int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
post_wr = &frwr->fr_regwr.wr;
}
- return ib_post_send(ia->ri_id->qp, post_wr, NULL);
+ return ib_post_send(r_xprt->rx_ep->re_id->qp, post_wr, NULL);
}
/**
@@ -419,7 +421,7 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
list_for_each_entry(mr, mrs, mr_list)
if (mr->mr_handle == rep->rr_inv_rkey) {
list_del_init(&mr->mr_list);
- trace_xprtrdma_mr_remoteinv(mr);
+ trace_xprtrdma_mr_reminv(mr);
rpcrdma_mr_put(mr);
break; /* only one invalidated MR per RPC */
}
@@ -435,8 +437,8 @@ static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr)
/**
* frwr_wc_localinv - Invoked by RDMA provider for a LOCAL_INV WC
- * @cq: completion queue (ignored)
- * @wc: completed WR
+ * @cq: completion queue
+ * @wc: WCE for a completed LocalInv WR
*
*/
static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
@@ -449,12 +451,14 @@ static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
/* WARNING: Only wr_cqe and status are reliable at this point */
trace_xprtrdma_wc_li(wc, frwr);
__frwr_release_mr(wc, mr);
+
+ rpcrdma_flush_disconnect(cq, wc);
}
/**
* frwr_wc_localinv_wake - Invoked by RDMA provider for a LOCAL_INV WC
- * @cq: completion queue (ignored)
- * @wc: completed WR
+ * @cq: completion queue
+ * @wc: WCE for a completed LocalInv WR
*
* Awaken anyone waiting for an MR to finish being fenced.
*/
@@ -469,6 +473,8 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
trace_xprtrdma_wc_li_wake(wc, frwr);
__frwr_release_mr(wc, mr);
complete(&frwr->fr_linv_done);
+
+ rpcrdma_flush_disconnect(cq, wc);
}
/**
@@ -526,10 +532,10 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
/* Transport disconnect drains the receive CQ before it
* replaces the QP. The RPC reply handler won't call us
- * unless ri_id->qp is a valid pointer.
+ * unless re_id->qp is a valid pointer.
*/
bad_wr = NULL;
- rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr);
+ rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr);
/* The final LOCAL_INV WR in the chain is supposed to
* do the wake. If it was never posted, the wake will
@@ -556,8 +562,8 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
/**
* frwr_wc_localinv_done - Invoked by RDMA provider for a signaled LOCAL_INV WC
- * @cq: completion queue (ignored)
- * @wc: completed WR
+ * @cq: completion queue
+ * @wc: WCE for a completed LocalInv WR
*
*/
static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc)
@@ -575,6 +581,8 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc)
/* Ensure @rep is generated before __frwr_release_mr */
smp_rmb();
rpcrdma_complete_rqst(rep);
+
+ rpcrdma_flush_disconnect(cq, wc);
}
/**
@@ -629,10 +637,10 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
/* Transport disconnect drains the receive CQ before it
* replaces the QP. The RPC reply handler won't call us
- * unless ri_id->qp is a valid pointer.
+ * unless re_id->qp is a valid pointer.
*/
bad_wr = NULL;
- rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr);
+ rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr);
if (!rc)
return;
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 577513b7642e..4a81e6995d3e 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -103,21 +103,20 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
/**
* rpcrdma_set_max_header_sizes - Initialize inline payload sizes
- * @r_xprt: transport instance to initialize
+ * @ep: endpoint to initialize
*
* The max_inline fields contain the maximum size of an RPC message
* so the marshaling code doesn't have to repeat this calculation
* for every RPC.
*/
-void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt)
+void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep)
{
- unsigned int maxsegs = r_xprt->rx_ia.ri_max_rdma_segs;
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+ unsigned int maxsegs = ep->re_max_rdma_segs;
- ep->rep_max_inline_send =
- ep->rep_inline_send - rpcrdma_max_call_header_size(maxsegs);
- ep->rep_max_inline_recv =
- ep->rep_inline_recv - rpcrdma_max_reply_header_size(maxsegs);
+ ep->re_max_inline_send =
+ ep->re_inline_send - rpcrdma_max_call_header_size(maxsegs);
+ ep->re_max_inline_recv =
+ ep->re_inline_recv - rpcrdma_max_reply_header_size(maxsegs);
}
/* The client can send a request inline as long as the RPCRDMA header
@@ -132,9 +131,10 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
struct rpc_rqst *rqst)
{
struct xdr_buf *xdr = &rqst->rq_snd_buf;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
unsigned int count, remaining, offset;
- if (xdr->len > r_xprt->rx_ep.rep_max_inline_send)
+ if (xdr->len > ep->re_max_inline_send)
return false;
if (xdr->page_len) {
@@ -145,7 +145,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
remaining -= min_t(unsigned int,
PAGE_SIZE - offset, remaining);
offset = 0;
- if (++count > r_xprt->rx_ep.rep_attr.cap.max_send_sge)
+ if (++count > ep->re_attr.cap.max_send_sge)
return false;
}
}
@@ -162,7 +162,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
struct rpc_rqst *rqst)
{
- return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep.rep_max_inline_recv;
+ return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep->re_max_inline_recv;
}
/* The client is required to provide a Reply chunk if the maximum
@@ -176,7 +176,7 @@ rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt,
const struct xdr_buf *buf = &rqst->rq_rcv_buf;
return (buf->head[0].iov_len + buf->tail[0].iov_len) <
- r_xprt->rx_ep.rep_max_inline_recv;
+ r_xprt->rx_ep->re_max_inline_recv;
}
/* Split @vec on page boundaries into SGEs. FMR registers pages, not
@@ -255,7 +255,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
/* When encoding a Read chunk, the tail iovec contains an
* XDR pad and may be omitted.
*/
- if (type == rpcrdma_readch && r_xprt->rx_ia.ri_implicit_roundup)
+ if (type == rpcrdma_readch && r_xprt->rx_ep->re_implicit_roundup)
goto out;
/* When encoding a Write chunk, some servers need to see an
@@ -263,7 +263,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
* layer provides space in the tail iovec that may be used
* for this purpose.
*/
- if (type == rpcrdma_writech && r_xprt->rx_ia.ri_implicit_roundup)
+ if (type == rpcrdma_writech && r_xprt->rx_ep->re_implicit_roundup)
goto out;
if (xdrbuf->tail[0].iov_len)
@@ -1450,8 +1450,8 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
if (credits == 0)
credits = 1; /* don't deadlock */
- else if (credits > r_xprt->rx_ep.rep_max_requests)
- credits = r_xprt->rx_ep.rep_max_requests;
+ else if (credits > r_xprt->rx_ep->re_max_requests)
+ credits = r_xprt->rx_ep->re_max_requests;
if (buf->rb_credits != credits)
rpcrdma_update_cwnd(r_xprt, credits);
rpcrdma_post_recvs(r_xprt, false);
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 3cfeba68ee9a..659da37020a4 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -240,9 +240,10 @@ xprt_rdma_connect_worker(struct work_struct *work)
struct rpc_xprt *xprt = &r_xprt->rx_xprt;
int rc;
- rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia);
+ rc = rpcrdma_xprt_connect(r_xprt);
xprt_clear_connecting(xprt);
- if (r_xprt->rx_ep.rep_connected > 0) {
+ if (r_xprt->rx_ep && r_xprt->rx_ep->re_connect_status > 0) {
+ xprt->connect_cookie++;
xprt->stat.connect_count++;
xprt->stat.connect_time += (long)jiffies -
xprt->stat.connect_start;
@@ -265,7 +266,7 @@ xprt_rdma_inject_disconnect(struct rpc_xprt *xprt)
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
trace_xprtrdma_op_inject_dsc(r_xprt);
- rdma_disconnect(r_xprt->rx_ia.ri_id);
+ rdma_disconnect(r_xprt->rx_ep->re_id);
}
/**
@@ -284,9 +285,8 @@ xprt_rdma_destroy(struct rpc_xprt *xprt)
cancel_delayed_work_sync(&r_xprt->rx_connect_worker);
- rpcrdma_ep_destroy(r_xprt);
+ rpcrdma_xprt_disconnect(r_xprt);
rpcrdma_buffer_destroy(&r_xprt->rx_buf);
- rpcrdma_ia_close(&r_xprt->rx_ia);
xprt_rdma_free_addresses(xprt);
xprt_free(xprt);
@@ -316,10 +316,15 @@ xprt_setup_rdma(struct xprt_create *args)
if (args->addrlen > sizeof(xprt->addr))
return ERR_PTR(-EBADF);
+ if (!try_module_get(THIS_MODULE))
+ return ERR_PTR(-EIO);
+
xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt), 0,
xprt_rdma_slot_table_entries);
- if (!xprt)
+ if (!xprt) {
+ module_put(THIS_MODULE);
return ERR_PTR(-ENOMEM);
+ }
xprt->timeout = &xprt_rdma_default_timeout;
xprt->connect_timeout = xprt->timeout->to_initval;
@@ -347,23 +352,17 @@ xprt_setup_rdma(struct xprt_create *args)
xprt_rdma_format_addresses(xprt, sap);
new_xprt = rpcx_to_rdmax(xprt);
- rc = rpcrdma_ia_open(new_xprt);
- if (rc)
- goto out1;
-
- rc = rpcrdma_ep_create(new_xprt);
- if (rc)
- goto out2;
-
rc = rpcrdma_buffer_create(new_xprt);
- if (rc)
- goto out3;
-
- if (!try_module_get(THIS_MODULE))
- goto out4;
+ if (rc) {
+ xprt_rdma_free_addresses(xprt);
+ xprt_free(xprt);
+ module_put(THIS_MODULE);
+ return ERR_PTR(rc);
+ }
INIT_DELAYED_WORK(&new_xprt->rx_connect_worker,
xprt_rdma_connect_worker);
+
xprt->max_payload = RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
dprintk("RPC: %s: %s:%s\n", __func__,
@@ -371,19 +370,6 @@ xprt_setup_rdma(struct xprt_create *args)
xprt->address_strings[RPC_DISPLAY_PORT]);
trace_xprtrdma_create(new_xprt);
return xprt;
-
-out4:
- rpcrdma_buffer_destroy(&new_xprt->rx_buf);
- rc = -ENODEV;
-out3:
- rpcrdma_ep_destroy(new_xprt);
-out2:
- rpcrdma_ia_close(&new_xprt->rx_ia);
-out1:
- trace_xprtrdma_op_destroy(new_xprt);
- xprt_rdma_free_addresses(xprt);
- xprt_free(xprt);
- return ERR_PTR(rc);
}
/**
@@ -398,26 +384,11 @@ out1:
void xprt_rdma_close(struct rpc_xprt *xprt)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-
- might_sleep();
trace_xprtrdma_op_close(r_xprt);
- /* Prevent marshaling and sending of new requests */
- xprt_clear_connected(xprt);
-
- if (test_and_clear_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags)) {
- rpcrdma_ia_remove(ia);
- goto out;
- }
-
- if (ep->rep_connected == -ENODEV)
- return;
- rpcrdma_ep_disconnect(ep, ia);
+ rpcrdma_xprt_disconnect(r_xprt);
-out:
xprt->reestablish_timeout = 0;
++xprt->connect_cookie;
xprt_disconnect_done(xprt);
@@ -517,10 +488,11 @@ static void
xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
unsigned long delay;
delay = 0;
- if (r_xprt->rx_ep.rep_connected != 0) {
+ if (ep && ep->re_connect_status != 0) {
delay = xprt_reconnect_delay(xprt);
xprt_reconnect_backoff(xprt, RPCRDMA_INIT_REEST_TO);
}
@@ -694,7 +666,7 @@ xprt_rdma_send_request(struct rpc_rqst *rqst)
goto drop_connection;
rqst->rq_xtime = ktime_get();
- if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req))
+ if (rpcrdma_post_sends(r_xprt, req))
goto drop_connection;
rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len;
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 353f61ac8d51..cdd84c09df10 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -84,6 +84,7 @@ static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep);
static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt);
+static int rpcrdma_ep_destroy(struct rpcrdma_ep *ep);
static struct rpcrdma_regbuf *
rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction,
gfp_t flags);
@@ -96,17 +97,17 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb);
*/
static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt)
{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rdma_cm_id *id = r_xprt->rx_ep->re_id;
/* Flush Receives, then wait for deferred Reply work
* to complete.
*/
- ib_drain_rq(ia->ri_id->qp);
+ ib_drain_rq(id->qp);
/* Deferred Reply processing might have scheduled
* local invalidations.
*/
- ib_drain_sq(ia->ri_id->qp);
+ ib_drain_sq(id->qp);
}
/**
@@ -115,26 +116,43 @@ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt)
* @context: ep that owns QP where event occurred
*
* Called from the RDMA provider (device driver) possibly in an interrupt
- * context.
+ * context. The QP is always destroyed before the ID, so the ID will be
+ * reliably available when this handler is invoked.
*/
-static void
-rpcrdma_qp_event_handler(struct ib_event *event, void *context)
+static void rpcrdma_qp_event_handler(struct ib_event *event, void *context)
{
struct rpcrdma_ep *ep = context;
- struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt,
- rx_ep);
- trace_xprtrdma_qp_event(r_xprt, event);
+ trace_xprtrdma_qp_event(ep, event);
+}
+
+/**
+ * rpcrdma_flush_disconnect - Disconnect on flushed completion
+ * @cq: completion queue
+ * @wc: work completion entry
+ *
+ * Must be called in process context.
+ */
+void rpcrdma_flush_disconnect(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct rpcrdma_xprt *r_xprt = cq->cq_context;
+ struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+
+ if (wc->status != IB_WC_SUCCESS &&
+ r_xprt->rx_ep->re_connect_status == 1) {
+ r_xprt->rx_ep->re_connect_status = -ECONNABORTED;
+ trace_xprtrdma_flush_dct(r_xprt, wc->status);
+ xprt_force_disconnect(xprt);
+ }
}
/**
* rpcrdma_wc_send - Invoked by RDMA provider for each polled Send WC
* @cq: completion queue
- * @wc: completed WR
+ * @wc: WCE for a completed Send WR
*
*/
-static void
-rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
+static void rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
{
struct ib_cqe *cqe = wc->wr_cqe;
struct rpcrdma_sendctx *sc =
@@ -143,25 +161,25 @@ rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
/* WARNING: Only wr_cqe and status are reliable at this point */
trace_xprtrdma_wc_send(sc, wc);
rpcrdma_sendctx_put_locked((struct rpcrdma_xprt *)cq->cq_context, sc);
+ rpcrdma_flush_disconnect(cq, wc);
}
/**
* rpcrdma_wc_receive - Invoked by RDMA provider for each polled Receive WC
- * @cq: completion queue (ignored)
- * @wc: completed WR
+ * @cq: completion queue
+ * @wc: WCE for a completed Receive WR
*
*/
-static void
-rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
+static void rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
{
struct ib_cqe *cqe = wc->wr_cqe;
struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep,
rr_cqe);
- struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
+ struct rpcrdma_xprt *r_xprt = cq->cq_context;
/* WARNING: Only wr_cqe and status are reliable at this point */
trace_xprtrdma_wc_receive(wc);
- --r_xprt->rx_ep.rep_receive_count;
+ --r_xprt->rx_ep->re_receive_count;
if (wc->status != IB_WC_SUCCESS)
goto out_flushed;
@@ -178,35 +196,35 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
return;
out_flushed:
+ rpcrdma_flush_disconnect(cq, wc);
rpcrdma_rep_destroy(rep);
}
-static void rpcrdma_update_cm_private(struct rpcrdma_xprt *r_xprt,
+static void rpcrdma_update_cm_private(struct rpcrdma_ep *ep,
struct rdma_conn_param *param)
{
const struct rpcrdma_connect_private *pmsg = param->private_data;
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
unsigned int rsize, wsize;
/* Default settings for RPC-over-RDMA Version One */
- r_xprt->rx_ia.ri_implicit_roundup = xprt_rdma_pad_optimize;
+ ep->re_implicit_roundup = xprt_rdma_pad_optimize;
rsize = RPCRDMA_V1_DEF_INLINE_SIZE;
wsize = RPCRDMA_V1_DEF_INLINE_SIZE;
if (pmsg &&
pmsg->cp_magic == rpcrdma_cmp_magic &&
pmsg->cp_version == RPCRDMA_CMP_VERSION) {
- r_xprt->rx_ia.ri_implicit_roundup = true;
+ ep->re_implicit_roundup = true;
rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size);
wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size);
}
- if (rsize < ep->rep_inline_recv)
- ep->rep_inline_recv = rsize;
- if (wsize < ep->rep_inline_send)
- ep->rep_inline_send = wsize;
+ if (rsize < ep->re_inline_recv)
+ ep->re_inline_recv = rsize;
+ if (wsize < ep->re_inline_send)
+ ep->re_inline_send = wsize;
- rpcrdma_set_max_header_sizes(r_xprt);
+ rpcrdma_set_max_header_sizes(ep);
}
/**
@@ -220,116 +238,103 @@ static void rpcrdma_update_cm_private(struct rpcrdma_xprt *r_xprt,
static int
rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
- struct rpcrdma_xprt *r_xprt = id->context;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
- struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+ struct sockaddr *sap = (struct sockaddr *)&id->route.addr.dst_addr;
+ struct rpcrdma_ep *ep = id->context;
+ struct rpc_xprt *xprt = ep->re_xprt;
might_sleep();
- trace_xprtrdma_cm_event(r_xprt, event);
switch (event->event) {
case RDMA_CM_EVENT_ADDR_RESOLVED:
case RDMA_CM_EVENT_ROUTE_RESOLVED:
- ia->ri_async_rc = 0;
- complete(&ia->ri_done);
+ ep->re_async_rc = 0;
+ complete(&ep->re_done);
return 0;
case RDMA_CM_EVENT_ADDR_ERROR:
- ia->ri_async_rc = -EPROTO;
- complete(&ia->ri_done);
+ ep->re_async_rc = -EPROTO;
+ complete(&ep->re_done);
return 0;
case RDMA_CM_EVENT_ROUTE_ERROR:
- ia->ri_async_rc = -ENETUNREACH;
- complete(&ia->ri_done);
+ ep->re_async_rc = -ENETUNREACH;
+ complete(&ep->re_done);
return 0;
case RDMA_CM_EVENT_DEVICE_REMOVAL:
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
- pr_info("rpcrdma: removing device %s for %s:%s\n",
- ia->ri_id->device->name,
- rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt));
-#endif
- init_completion(&ia->ri_remove_done);
- set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags);
- ep->rep_connected = -ENODEV;
+ pr_info("rpcrdma: removing device %s for %pISpc\n",
+ ep->re_id->device->name, sap);
+ /* fall through */
+ case RDMA_CM_EVENT_ADDR_CHANGE:
+ ep->re_connect_status = -ENODEV;
xprt_force_disconnect(xprt);
- wait_for_completion(&ia->ri_remove_done);
-
- ia->ri_id = NULL;
- /* Return 1 to ensure the core destroys the id. */
- return 1;
+ goto disconnected;
case RDMA_CM_EVENT_ESTABLISHED:
- ++xprt->connect_cookie;
- ep->rep_connected = 1;
- rpcrdma_update_cm_private(r_xprt, &event->param.conn);
- trace_xprtrdma_inline_thresh(r_xprt);
- wake_up_all(&ep->rep_connect_wait);
+ kref_get(&ep->re_kref);
+ ep->re_connect_status = 1;
+ rpcrdma_update_cm_private(ep, &event->param.conn);
+ trace_xprtrdma_inline_thresh(ep);
+ wake_up_all(&ep->re_connect_wait);
break;
case RDMA_CM_EVENT_CONNECT_ERROR:
- ep->rep_connected = -ENOTCONN;
+ ep->re_connect_status = -ENOTCONN;
goto disconnected;
case RDMA_CM_EVENT_UNREACHABLE:
- ep->rep_connected = -ENETUNREACH;
+ ep->re_connect_status = -ENETUNREACH;
goto disconnected;
case RDMA_CM_EVENT_REJECTED:
- dprintk("rpcrdma: connection to %s:%s rejected: %s\n",
- rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt),
- rdma_reject_msg(id, event->status));
- ep->rep_connected = -ECONNREFUSED;
+ dprintk("rpcrdma: connection to %pISpc rejected: %s\n",
+ sap, rdma_reject_msg(id, event->status));
+ ep->re_connect_status = -ECONNREFUSED;
if (event->status == IB_CM_REJ_STALE_CONN)
- ep->rep_connected = -EAGAIN;
+ ep->re_connect_status = -EAGAIN;
goto disconnected;
case RDMA_CM_EVENT_DISCONNECTED:
- ep->rep_connected = -ECONNABORTED;
+ ep->re_connect_status = -ECONNABORTED;
disconnected:
- xprt_force_disconnect(xprt);
- wake_up_all(&ep->rep_connect_wait);
- break;
+ return rpcrdma_ep_destroy(ep);
default:
break;
}
- dprintk("RPC: %s: %s:%s on %s/frwr: %s\n", __func__,
- rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt),
- ia->ri_id->device->name, rdma_event_msg(event->event));
+ dprintk("RPC: %s: %pISpc on %s/frwr: %s\n", __func__, sap,
+ ep->re_id->device->name, rdma_event_msg(event->event));
return 0;
}
-static struct rdma_cm_id *
-rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia)
+static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_ep *ep)
{
unsigned long wtimeout = msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1;
+ struct rpc_xprt *xprt = &r_xprt->rx_xprt;
struct rdma_cm_id *id;
int rc;
- init_completion(&ia->ri_done);
+ init_completion(&ep->re_done);
- id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_cm_event_handler,
- xprt, RDMA_PS_TCP, IB_QPT_RC);
+ id = rdma_create_id(xprt->xprt_net, rpcrdma_cm_event_handler, ep,
+ RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(id))
return id;
- ia->ri_async_rc = -ETIMEDOUT;
- rc = rdma_resolve_addr(id, NULL,
- (struct sockaddr *)&xprt->rx_xprt.addr,
+ ep->re_async_rc = -ETIMEDOUT;
+ rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)&xprt->addr,
RDMA_RESOLVE_TIMEOUT);
if (rc)
goto out;
- rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
+ rc = wait_for_completion_interruptible_timeout(&ep->re_done, wtimeout);
if (rc < 0)
goto out;
- rc = ia->ri_async_rc;
+ rc = ep->re_async_rc;
if (rc)
goto out;
- ia->ri_async_rc = -ETIMEDOUT;
+ ep->re_async_rc = -ETIMEDOUT;
rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
if (rc)
goto out;
- rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
+ rc = wait_for_completion_interruptible_timeout(&ep->re_done, wtimeout);
if (rc < 0)
goto out;
- rc = ia->ri_async_rc;
+ rc = ep->re_async_rc;
if (rc)
goto out;
@@ -340,356 +345,181 @@ out:
return ERR_PTR(rc);
}
-/*
- * Exported functions.
- */
-
-/**
- * rpcrdma_ia_open - Open and initialize an Interface Adapter.
- * @xprt: transport with IA to (re)initialize
- *
- * Returns 0 on success, negative errno if an appropriate
- * Interface Adapter could not be found and opened.
- */
-int
-rpcrdma_ia_open(struct rpcrdma_xprt *xprt)
+static void rpcrdma_ep_put(struct kref *kref)
{
- struct rpcrdma_ia *ia = &xprt->rx_ia;
- int rc;
+ struct rpcrdma_ep *ep = container_of(kref, struct rpcrdma_ep, re_kref);
- ia->ri_id = rpcrdma_create_id(xprt, ia);
- if (IS_ERR(ia->ri_id)) {
- rc = PTR_ERR(ia->ri_id);
- goto out_err;
+ if (ep->re_id->qp) {
+ rdma_destroy_qp(ep->re_id);
+ ep->re_id->qp = NULL;
}
- ia->ri_pd = ib_alloc_pd(ia->ri_id->device, 0);
- if (IS_ERR(ia->ri_pd)) {
- rc = PTR_ERR(ia->ri_pd);
- pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc);
- goto out_err;
- }
+ if (ep->re_attr.recv_cq)
+ ib_free_cq(ep->re_attr.recv_cq);
+ ep->re_attr.recv_cq = NULL;
+ if (ep->re_attr.send_cq)
+ ib_free_cq(ep->re_attr.send_cq);
+ ep->re_attr.send_cq = NULL;
- return 0;
+ if (ep->re_pd)
+ ib_dealloc_pd(ep->re_pd);
+ ep->re_pd = NULL;
-out_err:
- rpcrdma_ia_close(ia);
- return rc;
+ kfree(ep);
+ module_put(THIS_MODULE);
}
-/**
- * rpcrdma_ia_remove - Handle device driver unload
- * @ia: interface adapter being removed
- *
- * Divest transport H/W resources associated with this adapter,
- * but allow it to be restored later.
- *
- * Caller must hold the transport send lock.
+/* Returns:
+ * %0 if @ep still has a positive kref count, or
+ * %1 if @ep was destroyed successfully.
*/
-void
-rpcrdma_ia_remove(struct rpcrdma_ia *ia)
+static int rpcrdma_ep_destroy(struct rpcrdma_ep *ep)
{
- struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
- rx_ia);
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
-
- /* This is similar to rpcrdma_ep_destroy, but:
- * - Don't cancel the connect worker.
- * - Don't call rpcrdma_ep_disconnect, which waits
- * for another conn upcall, which will deadlock.
- * - rdma_disconnect is unneeded, the underlying
- * connection is already gone.
- */
- if (ia->ri_id->qp) {
- rpcrdma_xprt_drain(r_xprt);
- rdma_destroy_qp(ia->ri_id);
- ia->ri_id->qp = NULL;
- }
- ib_free_cq(ep->rep_attr.recv_cq);
- ep->rep_attr.recv_cq = NULL;
- ib_free_cq(ep->rep_attr.send_cq);
- ep->rep_attr.send_cq = NULL;
-
- /* The ULP is responsible for ensuring all DMA
- * mappings and MRs are gone.
- */
- rpcrdma_reps_unmap(r_xprt);
- rpcrdma_reqs_reset(r_xprt);
- rpcrdma_mrs_destroy(r_xprt);
- rpcrdma_sendctxs_destroy(r_xprt);
- ib_dealloc_pd(ia->ri_pd);
- ia->ri_pd = NULL;
-
- /* Allow waiters to continue */
- complete(&ia->ri_remove_done);
-
- trace_xprtrdma_remove(r_xprt);
-}
-
-/**
- * rpcrdma_ia_close - Clean up/close an IA.
- * @ia: interface adapter to close
- *
- */
-void
-rpcrdma_ia_close(struct rpcrdma_ia *ia)
-{
- if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
- if (ia->ri_id->qp)
- rdma_destroy_qp(ia->ri_id);
- rdma_destroy_id(ia->ri_id);
- }
- ia->ri_id = NULL;
-
- /* If the pd is still busy, xprtrdma missed freeing a resource */
- if (ia->ri_pd && !IS_ERR(ia->ri_pd))
- ib_dealloc_pd(ia->ri_pd);
- ia->ri_pd = NULL;
+ return kref_put(&ep->re_kref, rpcrdma_ep_put);
}
-/**
- * rpcrdma_ep_create - Create unconnected endpoint
- * @r_xprt: transport to instantiate
- *
- * Returns zero on success, or a negative errno.
- */
-int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
+static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
{
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private;
- struct ib_cq *sendcq, *recvcq;
+ struct rpcrdma_connect_private *pmsg;
+ struct ib_device *device;
+ struct rdma_cm_id *id;
+ struct rpcrdma_ep *ep;
int rc;
- ep->rep_max_requests = r_xprt->rx_xprt.max_reqs;
- ep->rep_inline_send = xprt_rdma_max_inline_write;
- ep->rep_inline_recv = xprt_rdma_max_inline_read;
+ ep = kzalloc(sizeof(*ep), GFP_NOFS);
+ if (!ep)
+ return -EAGAIN;
+ ep->re_xprt = &r_xprt->rx_xprt;
+ kref_init(&ep->re_kref);
- rc = frwr_query_device(r_xprt, ia->ri_id->device);
+ id = rpcrdma_create_id(r_xprt, ep);
+ if (IS_ERR(id)) {
+ rc = PTR_ERR(id);
+ goto out_free;
+ }
+ __module_get(THIS_MODULE);
+ device = id->device;
+ ep->re_id = id;
+
+ ep->re_max_requests = r_xprt->rx_xprt.max_reqs;
+ ep->re_inline_send = xprt_rdma_max_inline_write;
+ ep->re_inline_recv = xprt_rdma_max_inline_read;
+ rc = frwr_query_device(ep, device);
if (rc)
- return rc;
- r_xprt->rx_buf.rb_max_requests = cpu_to_be32(ep->rep_max_requests);
+ goto out_destroy;
+
+ r_xprt->rx_buf.rb_max_requests = cpu_to_be32(ep->re_max_requests);
- ep->rep_attr.event_handler = rpcrdma_qp_event_handler;
- ep->rep_attr.qp_context = ep;
- ep->rep_attr.srq = NULL;
- ep->rep_attr.cap.max_inline_data = 0;
- ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
- ep->rep_attr.qp_type = IB_QPT_RC;
- ep->rep_attr.port_num = ~0;
+ ep->re_attr.event_handler = rpcrdma_qp_event_handler;
+ ep->re_attr.qp_context = ep;
+ ep->re_attr.srq = NULL;
+ ep->re_attr.cap.max_inline_data = 0;
+ ep->re_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+ ep->re_attr.qp_type = IB_QPT_RC;
+ ep->re_attr.port_num = ~0;
dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
"iovs: send %d recv %d\n",
__func__,
- ep->rep_attr.cap.max_send_wr,
- ep->rep_attr.cap.max_recv_wr,
- ep->rep_attr.cap.max_send_sge,
- ep->rep_attr.cap.max_recv_sge);
-
- ep->rep_send_batch = ep->rep_max_requests >> 3;
- ep->rep_send_count = ep->rep_send_batch;
- init_waitqueue_head(&ep->rep_connect_wait);
- ep->rep_receive_count = 0;
-
- sendcq = ib_alloc_cq_any(ia->ri_id->device, r_xprt,
- ep->rep_attr.cap.max_send_wr + 1,
- IB_POLL_WORKQUEUE);
- if (IS_ERR(sendcq)) {
- rc = PTR_ERR(sendcq);
- goto out1;
+ ep->re_attr.cap.max_send_wr,
+ ep->re_attr.cap.max_recv_wr,
+ ep->re_attr.cap.max_send_sge,
+ ep->re_attr.cap.max_recv_sge);
+
+ ep->re_send_batch = ep->re_max_requests >> 3;
+ ep->re_send_count = ep->re_send_batch;
+ init_waitqueue_head(&ep->re_connect_wait);
+
+ ep->re_attr.send_cq = ib_alloc_cq_any(device, r_xprt,
+ ep->re_attr.cap.max_send_wr,
+ IB_POLL_WORKQUEUE);
+ if (IS_ERR(ep->re_attr.send_cq)) {
+ rc = PTR_ERR(ep->re_attr.send_cq);
+ goto out_destroy;
}
- recvcq = ib_alloc_cq_any(ia->ri_id->device, NULL,
- ep->rep_attr.cap.max_recv_wr + 1,
- IB_POLL_WORKQUEUE);
- if (IS_ERR(recvcq)) {
- rc = PTR_ERR(recvcq);
- goto out2;
+ ep->re_attr.recv_cq = ib_alloc_cq_any(device, r_xprt,
+ ep->re_attr.cap.max_recv_wr,
+ IB_POLL_WORKQUEUE);
+ if (IS_ERR(ep->re_attr.recv_cq)) {
+ rc = PTR_ERR(ep->re_attr.recv_cq);
+ goto out_destroy;
}
-
- ep->rep_attr.send_cq = sendcq;
- ep->rep_attr.recv_cq = recvcq;
+ ep->re_receive_count = 0;
/* Initialize cma parameters */
- memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma));
+ memset(&ep->re_remote_cma, 0, sizeof(ep->re_remote_cma));
/* Prepare RDMA-CM private message */
+ pmsg = &ep->re_cm_private;
pmsg->cp_magic = rpcrdma_cmp_magic;
pmsg->cp_version = RPCRDMA_CMP_VERSION;
pmsg->cp_flags |= RPCRDMA_CMP_F_SND_W_INV_OK;
- pmsg->cp_send_size = rpcrdma_encode_buffer_size(ep->rep_inline_send);
- pmsg->cp_recv_size = rpcrdma_encode_buffer_size(ep->rep_inline_recv);
- ep->rep_remote_cma.private_data = pmsg;
- ep->rep_remote_cma.private_data_len = sizeof(*pmsg);
+ pmsg->cp_send_size = rpcrdma_encode_buffer_size(ep->re_inline_send);
+ pmsg->cp_recv_size = rpcrdma_encode_buffer_size(ep->re_inline_recv);
+ ep->re_remote_cma.private_data = pmsg;
+ ep->re_remote_cma.private_data_len = sizeof(*pmsg);
/* Client offers RDMA Read but does not initiate */
- ep->rep_remote_cma.initiator_depth = 0;
- ep->rep_remote_cma.responder_resources =
- min_t(int, U8_MAX, ia->ri_id->device->attrs.max_qp_rd_atom);
+ ep->re_remote_cma.initiator_depth = 0;
+ ep->re_remote_cma.responder_resources =
+ min_t(int, U8_MAX, device->attrs.max_qp_rd_atom);
/* Limit transport retries so client can detect server
* GID changes quickly. RPC layer handles re-establishing
* transport connection and retransmission.
*/
- ep->rep_remote_cma.retry_count = 6;
+ ep->re_remote_cma.retry_count = 6;
/* RPC-over-RDMA handles its own flow control. In addition,
* make all RNR NAKs visible so we know that RPC-over-RDMA
* flow control is working correctly (no NAKs should be seen).
*/
- ep->rep_remote_cma.flow_control = 0;
- ep->rep_remote_cma.rnr_retry_count = 0;
+ ep->re_remote_cma.flow_control = 0;
+ ep->re_remote_cma.rnr_retry_count = 0;
- return 0;
-
-out2:
- ib_free_cq(sendcq);
-out1:
- return rc;
-}
-
-/**
- * rpcrdma_ep_destroy - Disconnect and destroy endpoint.
- * @r_xprt: transport instance to shut down
- *
- */
-void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt)
-{
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-
- if (ia->ri_id && ia->ri_id->qp) {
- rpcrdma_ep_disconnect(ep, ia);
- rdma_destroy_qp(ia->ri_id);
- ia->ri_id->qp = NULL;
- }
-
- if (ep->rep_attr.recv_cq)
- ib_free_cq(ep->rep_attr.recv_cq);
- if (ep->rep_attr.send_cq)
- ib_free_cq(ep->rep_attr.send_cq);
-}
-
-/* Re-establish a connection after a device removal event.
- * Unlike a normal reconnection, a fresh PD and a new set
- * of MRs and buffers is needed.
- */
-static int rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt,
- struct ib_qp_init_attr *qp_init_attr)
-{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
- int rc, err;
-
- trace_xprtrdma_reinsert(r_xprt);
-
- rc = -EHOSTUNREACH;
- if (rpcrdma_ia_open(r_xprt))
- goto out1;
-
- rc = -ENOMEM;
- err = rpcrdma_ep_create(r_xprt);
- if (err) {
- pr_err("rpcrdma: rpcrdma_ep_create returned %d\n", err);
- goto out2;
- }
- memcpy(qp_init_attr, &ep->rep_attr, sizeof(*qp_init_attr));
-
- rc = -ENETUNREACH;
- err = rdma_create_qp(ia->ri_id, ia->ri_pd, qp_init_attr);
- if (err) {
- pr_err("rpcrdma: rdma_create_qp returned %d\n", err);
- goto out3;
- }
- return 0;
-
-out3:
- rpcrdma_ep_destroy(r_xprt);
-out2:
- rpcrdma_ia_close(ia);
-out1:
- return rc;
-}
-
-static int rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt,
- struct ib_qp_init_attr *qp_init_attr)
-{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rdma_cm_id *id, *old;
- int err, rc;
-
- rpcrdma_ep_disconnect(&r_xprt->rx_ep, ia);
-
- rc = -EHOSTUNREACH;
- id = rpcrdma_create_id(r_xprt, ia);
- if (IS_ERR(id))
- goto out;
-
- /* As long as the new ID points to the same device as the
- * old ID, we can reuse the transport's existing PD and all
- * previously allocated MRs. Also, the same device means
- * the transport's previous DMA mappings are still valid.
- *
- * This is a sanity check only. There should be no way these
- * point to two different devices here.
- */
- old = id;
- rc = -ENETUNREACH;
- if (ia->ri_id->device != id->device) {
- pr_err("rpcrdma: can't reconnect on different device!\n");
+ ep->re_pd = ib_alloc_pd(device, 0);
+ if (IS_ERR(ep->re_pd)) {
+ rc = PTR_ERR(ep->re_pd);
goto out_destroy;
}
- err = rdma_create_qp(id, ia->ri_pd, qp_init_attr);
- if (err)
+ rc = rdma_create_qp(id, ep->re_pd, &ep->re_attr);
+ if (rc)
goto out_destroy;
- /* Atomically replace the transport's ID and QP. */
- rc = 0;
- old = ia->ri_id;
- ia->ri_id = id;
- rdma_destroy_qp(old);
+ r_xprt->rx_ep = ep;
+ return 0;
out_destroy:
- rdma_destroy_id(old);
-out:
+ rpcrdma_ep_destroy(ep);
+ rdma_destroy_id(id);
+out_free:
+ kfree(ep);
+ r_xprt->rx_ep = NULL;
return rc;
}
-/*
- * Connect unconnected endpoint.
+/**
+ * rpcrdma_xprt_connect - Connect an unconnected transport
+ * @r_xprt: controlling transport instance
+ *
+ * Returns 0 on success or a negative errno.
*/
-int
-rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt)
{
- struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
- rx_ia);
struct rpc_xprt *xprt = &r_xprt->rx_xprt;
- struct ib_qp_init_attr qp_init_attr;
+ struct rpcrdma_ep *ep;
int rc;
retry:
- memcpy(&qp_init_attr, &ep->rep_attr, sizeof(qp_init_attr));
- switch (ep->rep_connected) {
- case 0:
- rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &qp_init_attr);
- if (rc) {
- rc = -ENETUNREACH;
- goto out_noupdate;
- }
- break;
- case -ENODEV:
- rc = rpcrdma_ep_recreate_xprt(r_xprt, &qp_init_attr);
- if (rc)
- goto out_noupdate;
- break;
- default:
- rc = rpcrdma_ep_reconnect(r_xprt, &qp_init_attr);
- if (rc)
- goto out;
- }
+ rpcrdma_xprt_disconnect(r_xprt);
+ rc = rpcrdma_ep_create(r_xprt);
+ if (rc)
+ return rc;
+ ep = r_xprt->rx_ep;
- ep->rep_connected = 0;
+ ep->re_connect_status = 0;
xprt_clear_connected(xprt);
rpcrdma_reset_cwnd(r_xprt);
@@ -699,64 +529,68 @@ retry:
if (rc)
goto out;
- rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
+ rc = rdma_connect(ep->re_id, &ep->re_remote_cma);
if (rc)
goto out;
if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO)
xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
- wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
- if (ep->rep_connected <= 0) {
- if (ep->rep_connected == -EAGAIN)
+ wait_event_interruptible(ep->re_connect_wait,
+ ep->re_connect_status != 0);
+ if (ep->re_connect_status <= 0) {
+ if (ep->re_connect_status == -EAGAIN)
goto retry;
- rc = ep->rep_connected;
+ rc = ep->re_connect_status;
goto out;
}
rc = rpcrdma_reqs_setup(r_xprt);
if (rc) {
- rpcrdma_ep_disconnect(ep, ia);
+ rpcrdma_xprt_disconnect(r_xprt);
goto out;
}
rpcrdma_mrs_create(r_xprt);
out:
if (rc)
- ep->rep_connected = rc;
-
-out_noupdate:
+ ep->re_connect_status = rc;
trace_xprtrdma_connect(r_xprt, rc);
return rc;
}
/**
- * rpcrdma_ep_disconnect - Disconnect underlying transport
- * @ep: endpoint to disconnect
- * @ia: associated interface adapter
+ * rpcrdma_xprt_disconnect - Disconnect underlying transport
+ * @r_xprt: controlling transport instance
*
* Caller serializes. Either the transport send lock is held,
* or we're being called to destroy the transport.
+ *
+ * On return, @r_xprt is completely divested of all hardware
+ * resources and prepared for the next ->connect operation.
*/
-void
-rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt)
{
- struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt,
- rx_ep);
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
+ struct rdma_cm_id *id;
int rc;
- /* returns without wait if ID is not connected */
- rc = rdma_disconnect(ia->ri_id);
- if (!rc)
- wait_event_interruptible(ep->rep_connect_wait,
- ep->rep_connected != 1);
- else
- ep->rep_connected = rc;
+ if (!ep)
+ return;
+
+ id = ep->re_id;
+ rc = rdma_disconnect(id);
trace_xprtrdma_disconnect(r_xprt, rc);
rpcrdma_xprt_drain(r_xprt);
+ rpcrdma_reps_unmap(r_xprt);
rpcrdma_reqs_reset(r_xprt);
rpcrdma_mrs_destroy(r_xprt);
rpcrdma_sendctxs_destroy(r_xprt);
+
+ if (rpcrdma_ep_destroy(ep))
+ rdma_destroy_id(id);
+
+ r_xprt->rx_ep = NULL;
}
/* Fixed-size circular FIFO queue. This implementation is wait-free and
@@ -793,7 +627,7 @@ static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ep *ep)
{
struct rpcrdma_sendctx *sc;
- sc = kzalloc(struct_size(sc, sc_sges, ep->rep_attr.cap.max_send_sge),
+ sc = kzalloc(struct_size(sc, sc_sges, ep->re_attr.cap.max_send_sge),
GFP_KERNEL);
if (!sc)
return NULL;
@@ -813,14 +647,14 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt)
* the ->send_request call to fail temporarily before too many
* Sends are posted.
*/
- i = r_xprt->rx_ep.rep_max_requests + RPCRDMA_MAX_BC_REQUESTS;
+ i = r_xprt->rx_ep->re_max_requests + RPCRDMA_MAX_BC_REQUESTS;
buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), GFP_KERNEL);
if (!buf->rb_sc_ctxs)
return -ENOMEM;
buf->rb_sc_last = i - 1;
for (i = 0; i <= buf->rb_sc_last; i++) {
- sc = rpcrdma_sendctx_create(&r_xprt->rx_ep);
+ sc = rpcrdma_sendctx_create(r_xprt->rx_ep);
if (!sc)
return -ENOMEM;
@@ -924,10 +758,10 @@ static void
rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
unsigned int count;
- for (count = 0; count < ia->ri_max_rdma_segs; count++) {
+ for (count = 0; count < ep->re_max_rdma_segs; count++) {
struct rpcrdma_mr *mr;
int rc;
@@ -935,14 +769,12 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
if (!mr)
break;
- rc = frwr_init_mr(ia, mr);
+ rc = frwr_mr_init(r_xprt, mr);
if (rc) {
kfree(mr);
break;
}
- mr->mr_xprt = r_xprt;
-
spin_lock(&buf->rb_lock);
rpcrdma_mr_push(mr, &buf->rb_mrs);
list_add(&mr->mr_all, &buf->rb_all_mrs);
@@ -973,12 +805,12 @@ rpcrdma_mr_refresh_worker(struct work_struct *work)
void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
- /* If there is no underlying device, it's no use to
- * wake the refresh worker.
+ /* If there is no underlying connection, it's no use
+ * to wake the refresh worker.
*/
- if (ep->rep_connected != -ENODEV) {
+ if (ep->re_connect_status == 1) {
/* The work is scheduled on a WQ_MEM_RECLAIM
* workqueue in order to prevent MR allocation
* from recursing into NFS during direct reclaim.
@@ -1042,7 +874,7 @@ int rpcrdma_req_setup(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
/* Compute maximum header buffer size in bytes */
maxhdrsize = rpcrdma_fixed_maxsz + 3 +
- r_xprt->rx_ia.ri_max_rdma_segs * rpcrdma_readchunk_maxsz;
+ r_xprt->rx_ep->re_max_rdma_segs * rpcrdma_readchunk_maxsz;
maxhdrsize *= sizeof(__be32);
rb = rpcrdma_regbuf_alloc(__roundup_pow_of_two(maxhdrsize),
DMA_TO_DEVICE, GFP_KERNEL);
@@ -1120,7 +952,7 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
if (rep == NULL)
goto out;
- rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep.rep_inline_recv,
+ rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep->re_inline_recv,
DMA_FROM_DEVICE, GFP_KERNEL);
if (!rep->rr_rdmabuf)
goto out_free;
@@ -1345,7 +1177,7 @@ void rpcrdma_mr_put(struct rpcrdma_mr *mr)
if (mr->mr_dir != DMA_NONE) {
trace_xprtrdma_mr_unmap(mr);
- ib_dma_unmap_sg(r_xprt->rx_ia.ri_id->device,
+ ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device,
mr->mr_sg, mr->mr_nents, mr->mr_dir);
mr->mr_dir = DMA_NONE;
}
@@ -1463,7 +1295,7 @@ bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size, gfp_t flags)
bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_regbuf *rb)
{
- struct ib_device *device = r_xprt->rx_ia.ri_id->device;
+ struct ib_device *device = r_xprt->rx_ep->re_id->device;
if (rb->rg_direction == DMA_NONE)
return false;
@@ -1476,7 +1308,7 @@ bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt,
}
rb->rg_device = device;
- rb->rg_iov.lkey = r_xprt->rx_ia.ri_pd->local_dma_lkey;
+ rb->rg_iov.lkey = r_xprt->rx_ep->re_pd->local_dma_lkey;
return true;
}
@@ -1502,31 +1334,28 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb)
}
/**
- * rpcrdma_ep_post - Post WRs to a transport's Send Queue
- * @ia: transport's device information
- * @ep: transport's RDMA endpoint information
+ * rpcrdma_post_sends - Post WRs to a transport's Send Queue
+ * @r_xprt: controlling transport instance
* @req: rpcrdma_req containing the Send WR to post
*
* Returns 0 if the post was successful, otherwise -ENOTCONN
* is returned.
*/
-int
-rpcrdma_ep_post(struct rpcrdma_ia *ia,
- struct rpcrdma_ep *ep,
- struct rpcrdma_req *req)
+int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
struct ib_send_wr *send_wr = &req->rl_wr;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
int rc;
- if (!ep->rep_send_count || kref_read(&req->rl_kref) > 1) {
+ if (!ep->re_send_count || kref_read(&req->rl_kref) > 1) {
send_wr->send_flags |= IB_SEND_SIGNALED;
- ep->rep_send_count = ep->rep_send_batch;
+ ep->re_send_count = ep->re_send_batch;
} else {
send_wr->send_flags &= ~IB_SEND_SIGNALED;
- --ep->rep_send_count;
+ --ep->re_send_count;
}
- rc = frwr_send(ia, req);
+ rc = frwr_send(r_xprt, req);
trace_xprtrdma_post_send(req, rc);
if (rc)
return -ENOTCONN;
@@ -1542,7 +1371,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
struct ib_recv_wr *wr, *bad_wr;
struct rpcrdma_rep *rep;
int needed, count, rc;
@@ -1551,9 +1380,9 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
count = 0;
needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1);
- if (likely(ep->rep_receive_count > needed))
+ if (likely(ep->re_receive_count > needed))
goto out;
- needed -= ep->rep_receive_count;
+ needed -= ep->re_receive_count;
if (!temp)
needed += RPCRDMA_MAX_RECV_BATCH;
@@ -1579,7 +1408,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
if (!wr)
goto out;
- rc = ib_post_recv(r_xprt->rx_ia.ri_id->qp, wr,
+ rc = ib_post_recv(ep->re_id->qp, wr,
(const struct ib_recv_wr **)&bad_wr);
out:
trace_xprtrdma_post_recvs(r_xprt, count, rc);
@@ -1593,6 +1422,6 @@ out:
--count;
}
}
- ep->rep_receive_count += count;
+ ep->re_receive_count += count;
return;
}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 37d5080c250b..0a16fdb09b2c 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -65,43 +65,33 @@
#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ)
/*
- * Interface Adapter -- one per transport instance
+ * RDMA Endpoint -- connection endpoint details
*/
-struct rpcrdma_ia {
- struct rdma_cm_id *ri_id;
- struct ib_pd *ri_pd;
- int ri_async_rc;
- unsigned int ri_max_rdma_segs;
- unsigned int ri_max_frwr_depth;
- bool ri_implicit_roundup;
- enum ib_mr_type ri_mrtype;
- unsigned long ri_flags;
- struct completion ri_done;
- struct completion ri_remove_done;
-};
-
-enum {
- RPCRDMA_IAF_REMOVING = 0,
-};
-
-/*
- * RDMA Endpoint -- one per transport instance
- */
-
struct rpcrdma_ep {
- unsigned int rep_send_count;
- unsigned int rep_send_batch;
- unsigned int rep_max_inline_send;
- unsigned int rep_max_inline_recv;
- int rep_connected;
- struct ib_qp_init_attr rep_attr;
- wait_queue_head_t rep_connect_wait;
- struct rpcrdma_connect_private rep_cm_private;
- struct rdma_conn_param rep_remote_cma;
- unsigned int rep_max_requests; /* depends on device */
- unsigned int rep_inline_send; /* negotiated */
- unsigned int rep_inline_recv; /* negotiated */
- int rep_receive_count;
+ struct kref re_kref;
+ struct rdma_cm_id *re_id;
+ struct ib_pd *re_pd;
+ unsigned int re_max_rdma_segs;
+ unsigned int re_max_fr_depth;
+ bool re_implicit_roundup;
+ enum ib_mr_type re_mrtype;
+ struct completion re_done;
+ unsigned int re_send_count;
+ unsigned int re_send_batch;
+ unsigned int re_max_inline_send;
+ unsigned int re_max_inline_recv;
+ int re_async_rc;
+ int re_connect_status;
+ struct ib_qp_init_attr re_attr;
+ wait_queue_head_t re_connect_wait;
+ struct rpc_xprt *re_xprt;
+ struct rpcrdma_connect_private
+ re_cm_private;
+ struct rdma_conn_param re_remote_cma;
+ int re_receive_count;
+ unsigned int re_max_requests; /* depends on device */
+ unsigned int re_inline_send; /* negotiated */
+ unsigned int re_inline_recv; /* negotiated */
};
/* Pre-allocate extra Work Requests for handling backward receives
@@ -422,8 +412,7 @@ struct rpcrdma_stats {
*/
struct rpcrdma_xprt {
struct rpc_xprt rx_xprt;
- struct rpcrdma_ia rx_ia;
- struct rpcrdma_ep rx_ep;
+ struct rpcrdma_ep *rx_ep;
struct rpcrdma_buffer rx_buf;
struct delayed_work rx_connect_worker;
struct rpc_timeout rx_timeout;
@@ -455,22 +444,13 @@ extern int xprt_rdma_pad_optimize;
extern unsigned int xprt_rdma_memreg_strategy;
/*
- * Interface Adapter calls - xprtrdma/verbs.c
- */
-int rpcrdma_ia_open(struct rpcrdma_xprt *xprt);
-void rpcrdma_ia_remove(struct rpcrdma_ia *ia);
-void rpcrdma_ia_close(struct rpcrdma_ia *);
-
-/*
* Endpoint calls - xprtrdma/verbs.c
*/
-int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt);
-void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt);
-int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
-void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
+void rpcrdma_flush_disconnect(struct ib_cq *cq, struct ib_wc *wc);
+int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt);
+void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt);
-int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
- struct rpcrdma_req *);
+int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp);
/*
@@ -536,15 +516,14 @@ rpcrdma_data_dir(bool writing)
/* Memory registration calls xprtrdma/frwr_ops.c
*/
void frwr_reset(struct rpcrdma_req *req);
-int frwr_query_device(struct rpcrdma_xprt *r_xprt,
- const struct ib_device *device);
-int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr);
+int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device);
+int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr);
void frwr_release_mr(struct rpcrdma_mr *mr);
struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_mr_seg *seg,
int nsegs, bool writing, __be32 xid,
struct rpcrdma_mr *mr);
-int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req);
+int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs);
void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
@@ -569,7 +548,7 @@ int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
enum rpcrdma_chunktype rtype);
void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc);
int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst);
-void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *);
+void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep);
void rpcrdma_reset_cwnd(struct rpcrdma_xprt *r_xprt);
void rpcrdma_complete_rqst(struct rpcrdma_rep *rep);
void rpcrdma_reply_handler(struct rpcrdma_rep *rep);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 17cb902e5153..0bda8a73e8a8 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1861,7 +1861,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport)
struct rpc_xprt *xprt = &transport->xprt;
struct file *filp;
struct socket *sock;
- int status = -EIO;
+ int status;
status = __sock_create(xprt->xprt_net, AF_LOCAL,
SOCK_STREAM, 0, &sock, 1);
diff --git a/samples/hw_breakpoint/data_breakpoint.c b/samples/hw_breakpoint/data_breakpoint.c
index c58504774118..418c46fe5ffc 100644
--- a/samples/hw_breakpoint/data_breakpoint.c
+++ b/samples/hw_breakpoint/data_breakpoint.c
@@ -23,7 +23,7 @@
struct perf_event * __percpu *sample_hbp;
-static char ksym_name[KSYM_NAME_LEN] = "pid_max";
+static char ksym_name[KSYM_NAME_LEN] = "jiffies";
module_param_string(ksym, ksym_name, KSYM_NAME_LEN, S_IRUGO);
MODULE_PARM_DESC(ksym, "Kernel symbol to monitor; this module will report any"
" write operations on the kernel symbol");
@@ -41,11 +41,15 @@ static int __init hw_break_module_init(void)
{
int ret;
struct perf_event_attr attr;
+ void *addr = __symbol_get(ksym_name);
+
+ if (!addr)
+ return -ENXIO;
hw_breakpoint_init(&attr);
- attr.bp_addr = kallsyms_lookup_name(ksym_name);
+ attr.bp_addr = (unsigned long)addr;
attr.bp_len = HW_BREAKPOINT_LEN_4;
- attr.bp_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
+ attr.bp_type = HW_BREAKPOINT_W;
sample_hbp = register_wide_hw_breakpoint(&attr, sample_hbp_handler, NULL);
if (IS_ERR((void __force *)sample_hbp)) {
@@ -66,6 +70,7 @@ fail:
static void __exit hw_break_module_exit(void)
{
unregister_wide_hw_breakpoint(sample_hbp);
+ symbol_put(ksym_name);
printk(KERN_INFO "HW Breakpoint for %s write uninstalled\n", ksym_name);
}
diff --git a/scripts/Kconfig.include b/scripts/Kconfig.include
index 8074f14d9d0d..c264da2b9b30 100644
--- a/scripts/Kconfig.include
+++ b/scripts/Kconfig.include
@@ -48,9 +48,6 @@ $(error-if,$(failure,command -v $(LD)),linker '$(LD)' not found)
# Fail if the linker is gold as it's not capable of linking the kernel proper
$(error-if,$(success, $(LD) -v | grep -q gold), gold linker '$(LD)' not supported)
-# gcc version including patch level
-gcc-version := $(shell,$(srctree)/scripts/gcc-version.sh $(CC))
-
# machine bit flags
# $(m32-flag): -m32 if the compiler supports it, or an empty string otherwise.
# $(m64-flag): -m64 if the compiler supports it, or an empty string otherwise.
diff --git a/scripts/Makefile b/scripts/Makefile
index 5e75802b1a44..95ecf970c74c 100644
--- a/scripts/Makefile
+++ b/scripts/Makefile
@@ -2,10 +2,6 @@
###
# scripts contains sources for various helper programs used throughout
# the kernel for the build process.
-# ---------------------------------------------------------------------------
-# kallsyms: Find all symbols in vmlinux
-
-HOST_EXTRACFLAGS += -I$(srctree)/tools/include
always-$(CONFIG_BUILD_BIN2C) += bin2c
always-$(CONFIG_KALLSYMS) += kallsyms
diff --git a/scripts/Makefile.build b/scripts/Makefile.build
index a1730d42e5f3..9fcbfac15d1d 100644
--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -46,7 +46,7 @@ include $(kbuild-file)
include scripts/Makefile.lib
# Do not include host rules unless needed
-ifneq ($(hostprogs)$(hostlibs-y)$(hostlibs-m)$(hostcxxlibs-y)$(hostcxxlibs-m),)
+ifneq ($(hostprogs)$(hostcxxlibs-y)$(hostcxxlibs-m),)
include scripts/Makefile.host
endif
@@ -65,7 +65,6 @@ endif
ifneq ($(strip $(lib-y) $(lib-m) $(lib-)),)
lib-target := $(obj)/lib.a
-real-obj-y += $(obj)/lib-ksyms.o
endif
ifdef need-builtin
@@ -410,22 +409,6 @@ $(lib-target): $(lib-y) FORCE
targets += $(lib-target)
-dummy-object = $(obj)/.lib_exports.o
-ksyms-lds = $(dot-target).lds
-
-quiet_cmd_export_list = EXPORTS $@
-cmd_export_list = $(OBJDUMP) -h $< | \
- sed -ne '/___ksymtab/s/.*+\([^ ]*\).*/EXTERN(\1)/p' >$(ksyms-lds);\
- rm -f $(dummy-object);\
- echo | $(CC) $(a_flags) -c -o $(dummy-object) -x assembler -;\
- $(LD) $(ld_flags) -r -o $@ -T $(ksyms-lds) $(dummy-object);\
- rm $(dummy-object) $(ksyms-lds)
-
-$(obj)/lib-ksyms.o: $(lib-target) FORCE
- $(call if_changed,export_list)
-
-targets += $(obj)/lib-ksyms.o
-
endif
# NOTE:
diff --git a/scripts/Makefile.clean b/scripts/Makefile.clean
index 1e4206566a82..075f0cc2d8d7 100644
--- a/scripts/Makefile.clean
+++ b/scripts/Makefile.clean
@@ -30,7 +30,6 @@ subdir-ymn := $(addprefix $(obj)/,$(subdir-ymn))
__clean-files := $(extra-y) $(extra-m) $(extra-) \
$(always) $(always-y) $(always-m) $(always-) $(targets) $(clean-files) \
$(hostprogs) $(hostprogs-y) $(hostprogs-m) $(hostprogs-) \
- $(hostlibs-y) $(hostlibs-m) $(hostlibs-) \
$(hostcxxlibs-y) $(hostcxxlibs-m)
__clean-files := $(filter-out $(no-clean-files), $(__clean-files))
diff --git a/scripts/Makefile.extrawarn b/scripts/Makefile.extrawarn
index ca08f2fe7c34..4aea7cf71d11 100644
--- a/scripts/Makefile.extrawarn
+++ b/scripts/Makefile.extrawarn
@@ -49,6 +49,7 @@ KBUILD_CFLAGS += -Wno-format
KBUILD_CFLAGS += -Wno-sign-compare
KBUILD_CFLAGS += -Wno-format-zero-length
KBUILD_CFLAGS += $(call cc-disable-warning, pointer-to-enum-cast)
+KBUILD_CFLAGS += -Wno-tautological-constant-out-of-range-compare
endif
endif
diff --git a/scripts/Makefile.host b/scripts/Makefile.host
index 3b7121d43324..2045855d0b75 100644
--- a/scripts/Makefile.host
+++ b/scripts/Makefile.host
@@ -39,7 +39,6 @@ $(obj)/%.tab.c $(obj)/%.tab.h: $(src)/%.y FORCE
# They are linked as C++ code to the executable qconf
__hostprogs := $(sort $(hostprogs))
-host-cshlib := $(sort $(hostlibs-y) $(hostlibs-m))
host-cxxshlib := $(sort $(hostcxxlibs-y) $(hostcxxlibs-m))
# C code
@@ -63,7 +62,6 @@ host-cxxmulti := $(foreach m,$(__hostprogs),$(if $($(m)-cxxobjs),$(m)))
host-cxxobjs := $(sort $(foreach m,$(host-cxxmulti),$($(m)-cxxobjs)))
# Object (.o) files used by the shared libaries
-host-cshobjs := $(sort $(foreach m,$(host-cshlib),$($(m:.so=-objs))))
host-cxxshobjs := $(sort $(foreach m,$(host-cxxshlib),$($(m:.so=-objs))))
host-csingle := $(addprefix $(obj)/,$(host-csingle))
@@ -71,9 +69,7 @@ host-cmulti := $(addprefix $(obj)/,$(host-cmulti))
host-cobjs := $(addprefix $(obj)/,$(host-cobjs))
host-cxxmulti := $(addprefix $(obj)/,$(host-cxxmulti))
host-cxxobjs := $(addprefix $(obj)/,$(host-cxxobjs))
-host-cshlib := $(addprefix $(obj)/,$(host-cshlib))
host-cxxshlib := $(addprefix $(obj)/,$(host-cxxshlib))
-host-cshobjs := $(addprefix $(obj)/,$(host-cshobjs))
host-cxxshobjs := $(addprefix $(obj)/,$(host-cxxshobjs))
#####
@@ -141,13 +137,6 @@ $(host-cxxobjs): $(obj)/%.o: $(src)/%.cc FORCE
$(call if_changed_dep,host-cxxobjs)
# Compile .c file, create position independent .o file
-# host-cshobjs -> .o
-quiet_cmd_host-cshobjs = HOSTCC -fPIC $@
- cmd_host-cshobjs = $(HOSTCC) $(hostc_flags) -fPIC -c -o $@ $<
-$(host-cshobjs): $(obj)/%.o: $(src)/%.c FORCE
- $(call if_changed_dep,host-cshobjs)
-
-# Compile .c file, create position independent .o file
# Note that plugin capable gcc versions can be either C or C++ based
# therefore plugin source files have to be compilable in both C and C++ mode.
# This is why a C++ compiler is invoked on a .c file.
@@ -158,16 +147,6 @@ $(host-cxxshobjs): $(obj)/%.o: $(src)/%.c FORCE
$(call if_changed_dep,host-cxxshobjs)
# Link a shared library, based on position independent .o files
-# *.o -> .so shared library (host-cshlib)
-quiet_cmd_host-cshlib = HOSTLLD -shared $@
- cmd_host-cshlib = $(HOSTCC) $(KBUILD_HOSTLDFLAGS) -shared -o $@ \
- $(addprefix $(obj)/, $($(target-stem)-objs)) \
- $(KBUILD_HOSTLDLIBS) $(HOSTLDLIBS_$(target-stem).so)
-$(host-cshlib): FORCE
- $(call if_changed,host-cshlib)
-$(call multi_depend, $(host-cshlib), .so, -objs)
-
-# Link a shared library, based on position independent .o files
# *.o -> .so shared library (host-cxxshlib)
quiet_cmd_host-cxxshlib = HOSTLLD -shared $@
cmd_host-cxxshlib = $(HOSTCXX) $(KBUILD_HOSTLDFLAGS) -shared -o $@ \
@@ -178,4 +157,4 @@ $(host-cxxshlib): FORCE
$(call multi_depend, $(host-cxxshlib), .so, -objs)
targets += $(host-csingle) $(host-cmulti) $(host-cobjs)\
- $(host-cxxmulti) $(host-cxxobjs) $(host-cshlib) $(host-cshobjs) $(host-cxxshlib) $(host-cxxshobjs)
+ $(host-cxxmulti) $(host-cxxobjs) $(host-cxxshlib) $(host-cxxshobjs)
diff --git a/scripts/Makefile.ubsan b/scripts/Makefile.ubsan
index 019771b845c5..5b15bc425ec9 100644
--- a/scripts/Makefile.ubsan
+++ b/scripts/Makefile.ubsan
@@ -1,16 +1,26 @@
# SPDX-License-Identifier: GPL-2.0
ifdef CONFIG_UBSAN
+
+ifdef CONFIG_UBSAN_ALIGNMENT
+ CFLAGS_UBSAN += $(call cc-option, -fsanitize=alignment)
+endif
+
+ifdef CONFIG_UBSAN_BOUNDS
+ CFLAGS_UBSAN += $(call cc-option, -fsanitize=bounds)
+endif
+
+ifdef CONFIG_UBSAN_MISC
CFLAGS_UBSAN += $(call cc-option, -fsanitize=shift)
CFLAGS_UBSAN += $(call cc-option, -fsanitize=integer-divide-by-zero)
CFLAGS_UBSAN += $(call cc-option, -fsanitize=unreachable)
CFLAGS_UBSAN += $(call cc-option, -fsanitize=signed-integer-overflow)
- CFLAGS_UBSAN += $(call cc-option, -fsanitize=bounds)
CFLAGS_UBSAN += $(call cc-option, -fsanitize=object-size)
CFLAGS_UBSAN += $(call cc-option, -fsanitize=bool)
CFLAGS_UBSAN += $(call cc-option, -fsanitize=enum)
+endif
-ifdef CONFIG_UBSAN_ALIGNMENT
- CFLAGS_UBSAN += $(call cc-option, -fsanitize=alignment)
+ifdef CONFIG_UBSAN_TRAP
+ CFLAGS_UBSAN += $(call cc-option, -fsanitize-undefined-trap-on-error)
endif
# -fsanitize=* options makes GCC less smart than usual and
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index a63380c6b0d2..d64c67b67e3c 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -64,6 +64,7 @@ my $color = "auto";
my $allow_c99_comments = 1; # Can be overridden by --ignore C99_COMMENT_TOLERANCE
# git output parsing needs US English output, so first set backtick child process LANGUAGE
my $git_command ='export LANGUAGE=en_US.UTF-8; git';
+my $tabsize = 8;
sub help {
my ($exitcode) = @_;
@@ -98,6 +99,7 @@ Options:
--show-types show the specific message type in the output
--max-line-length=n set the maximum line length, if exceeded, warn
--min-conf-desc-length=n set the min description length, if shorter, warn
+ --tab-size=n set the number of spaces for tab (default 8)
--root=PATH PATH to the kernel tree root
--no-summary suppress the per-file summary
--mailback only produce a report in case of warnings/errors
@@ -215,6 +217,7 @@ GetOptions(
'list-types!' => \$list_types,
'max-line-length=i' => \$max_line_length,
'min-conf-desc-length=i' => \$min_conf_desc_length,
+ 'tab-size=i' => \$tabsize,
'root=s' => \$root,
'summary!' => \$summary,
'mailback!' => \$mailback,
@@ -267,6 +270,9 @@ if ($color =~ /^[01]$/) {
die "Invalid color mode: $color\n";
}
+# skip TAB size 1 to avoid additional checks on $tabsize - 1
+die "Invalid TAB size: $tabsize\n" if ($tabsize < 2);
+
sub hash_save_array_words {
my ($hashRef, $arrayRef) = @_;
@@ -804,12 +810,12 @@ sub build_types {
}x;
$Type = qr{
$NonptrType
- (?:(?:\s|\*|\[\])+\s*const|(?:\s|\*\s*(?:const\s*)?|\[\])+|(?:\s*\[\s*\])+)?
+ (?:(?:\s|\*|\[\])+\s*const|(?:\s|\*\s*(?:const\s*)?|\[\])+|(?:\s*\[\s*\])+){0,4}
(?:\s+$Inline|\s+$Modifier)*
}x;
$TypeMisordered = qr{
$NonptrTypeMisordered
- (?:(?:\s|\*|\[\])+\s*const|(?:\s|\*\s*(?:const\s*)?|\[\])+|(?:\s*\[\s*\])+)?
+ (?:(?:\s|\*|\[\])+\s*const|(?:\s|\*\s*(?:const\s*)?|\[\])+|(?:\s*\[\s*\])+){0,4}
(?:\s+$Inline|\s+$Modifier)*
}x;
$Declare = qr{(?:$Storage\s+(?:$Inline\s+)?)?$Type};
@@ -1118,6 +1124,7 @@ sub parse_email {
my ($formatted_email) = @_;
my $name = "";
+ my $name_comment = "";
my $address = "";
my $comment = "";
@@ -1150,6 +1157,10 @@ sub parse_email {
$name = trim($name);
$name =~ s/^\"|\"$//g;
+ $name =~ s/(\s*\([^\)]+\))\s*//;
+ if (defined($1)) {
+ $name_comment = trim($1);
+ }
$address = trim($address);
$address =~ s/^\<|\>$//g;
@@ -1158,7 +1169,7 @@ sub parse_email {
$name = "\"$name\"";
}
- return ($name, $address, $comment);
+ return ($name, $name_comment, $address, $comment);
}
sub format_email {
@@ -1184,6 +1195,23 @@ sub format_email {
return $formatted_email;
}
+sub reformat_email {
+ my ($email) = @_;
+
+ my ($email_name, $name_comment, $email_address, $comment) = parse_email($email);
+ return format_email($email_name, $email_address);
+}
+
+sub same_email_addresses {
+ my ($email1, $email2) = @_;
+
+ my ($email1_name, $name1_comment, $email1_address, $comment1) = parse_email($email1);
+ my ($email2_name, $name2_comment, $email2_address, $comment2) = parse_email($email2);
+
+ return $email1_name eq $email2_name &&
+ $email1_address eq $email2_address;
+}
+
sub which {
my ($bin) = @_;
@@ -1217,7 +1245,7 @@ sub expand_tabs {
if ($c eq "\t") {
$res .= ' ';
$n++;
- for (; ($n % 8) != 0; $n++) {
+ for (; ($n % $tabsize) != 0; $n++) {
$res .= ' ';
}
next;
@@ -2230,7 +2258,7 @@ sub string_find_replace {
sub tabify {
my ($leading) = @_;
- my $source_indent = 8;
+ my $source_indent = $tabsize;
my $max_spaces_before_tab = $source_indent - 1;
my $spaces_to_tab = " " x $source_indent;
@@ -2272,6 +2300,19 @@ sub pos_last_openparen {
return length(expand_tabs(substr($line, 0, $last_openparen))) + 1;
}
+sub get_raw_comment {
+ my ($line, $rawline) = @_;
+ my $comment = '';
+
+ for my $i (0 .. (length($line) - 1)) {
+ if (substr($line, $i, 1) eq "$;") {
+ $comment .= substr($rawline, $i, 1);
+ }
+ }
+
+ return $comment;
+}
+
sub process {
my $filename = shift;
@@ -2294,6 +2335,7 @@ sub process {
my $is_binding_patch = -1;
my $in_header_lines = $file ? 0 : 1;
my $in_commit_log = 0; #Scanning lines before patch
+ my $has_patch_separator = 0; #Found a --- line
my $has_commit_log = 0; #Encountered lines before patch
my $commit_log_lines = 0; #Number of commit log lines
my $commit_log_possible_stack_dump = 0;
@@ -2433,6 +2475,7 @@ sub process {
$sline =~ s/$;/ /g; #with comments as spaces
my $rawline = $rawlines[$linenr - 1];
+ my $raw_comment = get_raw_comment($line, $rawline);
# check if it's a mode change, rename or start of a patch
if (!$in_commit_log &&
@@ -2604,21 +2647,26 @@ sub process {
$author = $1;
$author = encode("utf8", $author) if ($line =~ /=\?utf-8\?/i);
$author =~ s/"//g;
+ $author = reformat_email($author);
}
# Check the patch for a signoff:
- if ($line =~ /^\s*signed-off-by:/i) {
+ if ($line =~ /^\s*signed-off-by:\s*(.*)/i) {
$signoff++;
$in_commit_log = 0;
if ($author ne '') {
- my $l = $line;
- $l =~ s/"//g;
- if ($l =~ /^\s*signed-off-by:\s*\Q$author\E/i) {
- $authorsignoff = 1;
+ if (same_email_addresses($1, $author)) {
+ $authorsignoff = 1;
}
}
}
+# Check for patch separator
+ if ($line =~ /^---$/) {
+ $has_patch_separator = 1;
+ $in_commit_log = 0;
+ }
+
# Check if MAINTAINERS is being updated. If so, there's probably no need to
# emit the "does MAINTAINERS need updating?" message on file add/move/delete
if ($line =~ /^\s*MAINTAINERS\s*\|/) {
@@ -2664,7 +2712,7 @@ sub process {
}
}
- my ($email_name, $email_address, $comment) = parse_email($email);
+ my ($email_name, $name_comment, $email_address, $comment) = parse_email($email);
my $suggested_email = format_email(($email_name, $email_address));
if ($suggested_email eq "") {
ERROR("BAD_SIGN_OFF",
@@ -2675,9 +2723,7 @@ sub process {
$dequoted =~ s/" </ </;
# Don't force email to have quotes
# Allow just an angle bracketed address
- if ("$dequoted$comment" ne $email &&
- "<$email_address>$comment" ne $email &&
- "$suggested_email$comment" ne $email) {
+ if (!same_email_addresses($email, $suggested_email)) {
WARN("BAD_SIGN_OFF",
"email address '$email' might be better as '$suggested_email$comment'\n" . $herecurr);
}
@@ -2720,10 +2766,10 @@ sub process {
"A patch subject line should describe the change not the tool that found it\n" . $herecurr);
}
-# Check for unwanted Gerrit info
- if ($in_commit_log && $line =~ /^\s*change-id:/i) {
+# Check for Gerrit Change-Ids not in any patch context
+ if ($realfile eq '' && !$has_patch_separator && $line =~ /^\s*change-id:/i) {
ERROR("GERRIT_CHANGE_ID",
- "Remove Gerrit Change-Id's before submitting upstream.\n" . $herecurr);
+ "Remove Gerrit Change-Id's before submitting upstream\n" . $herecurr);
}
# Check if the commit log is in a possible stack dump
@@ -2761,7 +2807,7 @@ sub process {
# Check for git id commit length and improperly formed commit descriptions
if ($in_commit_log && !$commit_log_possible_stack_dump &&
- $line !~ /^\s*(?:Link|Patchwork|http|https|BugLink):/i &&
+ $line !~ /^\s*(?:Link|Patchwork|http|https|BugLink|base-commit):/i &&
$line !~ /^This reverts commit [0-9a-f]{7,40}/ &&
($line =~ /\bcommit\s+[0-9a-f]{5,}\b/i ||
($line =~ /(?:\s|^)[0-9a-f]{12,40}(?:[\s"'\(\[]|$)/i &&
@@ -3087,7 +3133,7 @@ sub process {
$comment = '/*';
} elsif ($realfile =~ /\.(c|dts|dtsi)$/) {
$comment = '//';
- } elsif (($checklicenseline == 2) || $realfile =~ /\.(sh|pl|py|awk|tc)$/) {
+ } elsif (($checklicenseline == 2) || $realfile =~ /\.(sh|pl|py|awk|tc|yaml)$/) {
$comment = '#';
} elsif ($realfile =~ /\.rst$/) {
$comment = '..';
@@ -3111,6 +3157,17 @@ sub process {
WARN("SPDX_LICENSE_TAG",
"'$spdx_license' is not supported in LICENSES/...\n" . $herecurr);
}
+ if ($realfile =~ m@^Documentation/devicetree/bindings/@ &&
+ not $spdx_license =~ /GPL-2\.0.*BSD-2-Clause/) {
+ my $msg_level = \&WARN;
+ $msg_level = \&CHK if ($file);
+ if (&{$msg_level}("SPDX_LICENSE_TAG",
+
+ "DT binding documents should be licensed (GPL-2.0-only OR BSD-2-Clause)\n" . $herecurr) &&
+ $fix) {
+ $fixed[$fixlinenr] =~ s/SPDX-License-Identifier: .*/SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)/;
+ }
+ }
}
}
}
@@ -3198,7 +3255,7 @@ sub process {
next if ($realfile !~ /\.(h|c|pl|dtsi|dts)$/);
# at the beginning of a line any tabs must come first and anything
-# more than 8 must use tabs.
+# more than $tabsize must use tabs.
if ($rawline =~ /^\+\s* \t\s*\S/ ||
$rawline =~ /^\+\s* \s*/) {
my $herevet = "$here\n" . cat_vet($rawline) . "\n";
@@ -3217,7 +3274,7 @@ sub process {
"please, no space before tabs\n" . $herevet) &&
$fix) {
while ($fixed[$fixlinenr] =~
- s/(^\+.*) {8,8}\t/$1\t\t/) {}
+ s/(^\+.*) {$tabsize,$tabsize}\t/$1\t\t/) {}
while ($fixed[$fixlinenr] =~
s/(^\+.*) +\t/$1\t/) {}
}
@@ -3239,11 +3296,11 @@ sub process {
if ($perl_version_ok &&
$sline =~ /^\+\t+( +)(?:$c90_Keywords\b|\{\s*$|\}\s*(?:else\b|while\b|\s*$)|$Declare\s*$Ident\s*[;=])/) {
my $indent = length($1);
- if ($indent % 8) {
+ if ($indent % $tabsize) {
if (WARN("TABSTOP",
"Statements should start on a tabstop\n" . $herecurr) &&
$fix) {
- $fixed[$fixlinenr] =~ s@(^\+\t+) +@$1 . "\t" x ($indent/8)@e;
+ $fixed[$fixlinenr] =~ s@(^\+\t+) +@$1 . "\t" x ($indent/$tabsize)@e;
}
}
}
@@ -3261,8 +3318,8 @@ sub process {
my $newindent = $2;
my $goodtabindent = $oldindent .
- "\t" x ($pos / 8) .
- " " x ($pos % 8);
+ "\t" x ($pos / $tabsize) .
+ " " x ($pos % $tabsize);
my $goodspaceindent = $oldindent . " " x $pos;
if ($newindent ne $goodtabindent &&
@@ -3733,11 +3790,11 @@ sub process {
#print "line<$line> prevline<$prevline> indent<$indent> sindent<$sindent> check<$check> continuation<$continuation> s<$s> cond_lines<$cond_lines> stat_real<$stat_real> stat<$stat>\n";
if ($check && $s ne '' &&
- (($sindent % 8) != 0 ||
+ (($sindent % $tabsize) != 0 ||
($sindent < $indent) ||
($sindent == $indent &&
($s !~ /^\s*(?:\}|\{|else\b)/)) ||
- ($sindent > $indent + 8))) {
+ ($sindent > $indent + $tabsize))) {
WARN("SUSPECT_CODE_INDENT",
"suspect code indent for conditional statements ($indent, $sindent)\n" . $herecurr . "$stat_real\n");
}
@@ -4014,7 +4071,7 @@ sub process {
}
# check for function declarations without arguments like "int foo()"
- if ($line =~ /(\b$Type\s+$Ident)\s*\(\s*\)/) {
+ if ($line =~ /(\b$Type\s*$Ident)\s*\(\s*\)/) {
if (ERROR("FUNCTION_WITHOUT_ARGS",
"Bad function definition - $1() should probably be $1(void)\n" . $herecurr) &&
$fix) {
@@ -4582,7 +4639,7 @@ sub process {
($op eq '>' &&
$ca =~ /<\S+\@\S+$/))
{
- $ok = 1;
+ $ok = 1;
}
# for asm volatile statements
@@ -4917,7 +4974,7 @@ sub process {
# conditional.
substr($s, 0, length($c), '');
$s =~ s/\n.*//g;
- $s =~ s/$;//g; # Remove any comments
+ $s =~ s/$;//g; # Remove any comments
if (length($c) && $s !~ /^\s*{?\s*\\*\s*$/ &&
$c !~ /}\s*while\s*/)
{
@@ -4956,7 +5013,7 @@ sub process {
# if and else should not have general statements after it
if ($line =~ /^.\s*(?:}\s*)?else\b(.*)/) {
my $s = $1;
- $s =~ s/$;//g; # Remove any comments
+ $s =~ s/$;//g; # Remove any comments
if ($s !~ /^\s*(?:\sif|(?:{|)\s*\\?\s*$)/) {
ERROR("TRAILING_STATEMENTS",
"trailing statements should be on next line\n" . $herecurr);
@@ -5132,7 +5189,7 @@ sub process {
{
}
- # Flatten any obvious string concatentation.
+ # Flatten any obvious string concatenation.
while ($dstat =~ s/($String)\s*$Ident/$1/ ||
$dstat =~ s/$Ident\s*($String)/$1/)
{
@@ -6230,13 +6287,17 @@ sub process {
}
# check for function declarations that have arguments without identifier names
+# while avoiding uninitialized_var(x)
if (defined $stat &&
- $stat =~ /^.\s*(?:extern\s+)?$Type\s*(?:$Ident|\(\s*\*\s*$Ident\s*\))\s*\(\s*([^{]+)\s*\)\s*;/s &&
- $1 ne "void") {
- my $args = trim($1);
+ $stat =~ /^.\s*(?:extern\s+)?$Type\s*(?:($Ident)|\(\s*\*\s*$Ident\s*\))\s*\(\s*([^{]+)\s*\)\s*;/s &&
+ (!defined($1) ||
+ (defined($1) && $1 ne "uninitialized_var")) &&
+ $2 ne "void") {
+ my $args = trim($2);
while ($args =~ m/\s*($Type\s*(?:$Ident|\(\s*\*\s*$Ident?\s*\)\s*$balanced_parens)?)/g) {
my $arg = trim($1);
- if ($arg =~ /^$Type$/ && $arg !~ /enum\s+$Ident$/) {
+ if ($arg =~ /^$Type$/ &&
+ $arg !~ /enum\s+$Ident$/) {
WARN("FUNCTION_ARGUMENTS",
"function definition argument '$arg' should also have an identifier name\n" . $herecurr);
}
@@ -6389,6 +6450,28 @@ sub process {
}
}
+# check for /* fallthrough */ like comment, prefer fallthrough;
+ my @fallthroughs = (
+ 'fallthrough',
+ '@fallthrough@',
+ 'lint -fallthrough[ \t]*',
+ 'intentional(?:ly)?[ \t]*fall(?:(?:s | |-)[Tt]|t)hr(?:ough|u|ew)',
+ '(?:else,?\s*)?FALL(?:S | |-)?THR(?:OUGH|U|EW)[ \t.!]*(?:-[^\n\r]*)?',
+ 'Fall(?:(?:s | |-)[Tt]|t)hr(?:ough|u|ew)[ \t.!]*(?:-[^\n\r]*)?',
+ 'fall(?:s | |-)?thr(?:ough|u|ew)[ \t.!]*(?:-[^\n\r]*)?',
+ );
+ if ($raw_comment ne '') {
+ foreach my $ft (@fallthroughs) {
+ if ($raw_comment =~ /$ft/) {
+ my $msg_level = \&WARN;
+ $msg_level = \&CHK if ($file);
+ &{$msg_level}("PREFER_FALLTHROUGH",
+ "Prefer 'fallthrough;' over fallthrough comment\n" . $herecurr);
+ last;
+ }
+ }
+ }
+
# check for switch/default statements without a break;
if ($perl_version_ok &&
defined $stat &&
diff --git a/scripts/dummy-tools/gcc b/scripts/dummy-tools/gcc
new file mode 100755
index 000000000000..33487e99d83e
--- /dev/null
+++ b/scripts/dummy-tools/gcc
@@ -0,0 +1,91 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Staring v4.18, Kconfig evaluates compiler capabilities, and hides CONFIG
+# options your compiler does not support. This works well if you configure and
+# build the kernel on the same host machine.
+#
+# It is inconvenient if you prepare the .config that is carried to a different
+# build environment (typically this happens when you package the kernel for
+# distros) because using a different compiler potentially produces different
+# CONFIG options than the real build environment. So, you probably want to make
+# as many options visible as possible. In other words, you need to create a
+# super-set of CONFIG options that cover any build environment. If some of the
+# CONFIG options turned out to be unsupported on the build machine, they are
+# automatically disabled by the nature of Kconfig.
+#
+# However, it is not feasible to get a full-featured compiler for every arch.
+# Hence these dummy toolchains to make all compiler tests pass.
+#
+# Usage:
+#
+# From the top directory of the source tree, run
+#
+# $ make CROSS_COMPILE=scripts/dummy-tools/ oldconfig
+#
+# Most of compiler features are tested by cc-option, which simply checks the
+# exit code of $(CC). This script does nothing and just exits with 0 in most
+# cases. So, $(cc-option, ...) is evaluated as 'y'.
+#
+# This scripts caters to more checks; handle --version and pre-process __GNUC__
+# etc. to pretend to be GCC, and also do right things to satisfy some scripts.
+
+# Check if the first parameter appears in the rest. Succeeds if found.
+# This helper is useful if a particular option was passed to this script.
+# Typically used like this:
+# arg_contain <word-you-are-searching-for> "$@"
+arg_contain ()
+{
+ search="$1"
+ shift
+
+ while [ $# -gt 0 ]
+ do
+ if [ "$search" = "$1" ]; then
+ return 0
+ fi
+ shift
+ done
+
+ return 1
+}
+
+# To set CONFIG_CC_IS_GCC=y
+if arg_contain --version "$@"; then
+ echo "gcc (scripts/dummy-tools/gcc)"
+ exit 0
+fi
+
+if arg_contain -E "$@"; then
+ # For scripts/gcc-version.sh; This emulates GCC 20.0.0
+ if arg_contain - "$@"; then
+ sed 's/^__GNUC__$/20/; s/^__GNUC_MINOR__$/0/; s/^__GNUC_PATCHLEVEL__$/0/'
+ exit 0
+ else
+ echo "no input files" >&2
+ exit 1
+ fi
+fi
+
+if arg_contain -S "$@"; then
+ # For scripts/gcc-x86-*-has-stack-protector.sh
+ if arg_contain -fstack-protector "$@"; then
+ echo "%gs"
+ exit 0
+ fi
+fi
+
+# For scripts/gcc-plugin.sh
+if arg_contain -print-file-name=plugin "$@"; then
+ plugin_dir=$(mktemp -d)
+
+ sed -n 's/.*#include "\(.*\)"/\1/p' $(dirname $0)/../gcc-plugins/gcc-common.h |
+ while read header
+ do
+ mkdir -p $plugin_dir/include/$(dirname $header)
+ touch $plugin_dir/include/$header
+ done
+
+ echo $plugin_dir
+ exit 0
+fi
diff --git a/scripts/dummy-tools/ld b/scripts/dummy-tools/ld
new file mode 100755
index 000000000000..f68233050405
--- /dev/null
+++ b/scripts/dummy-tools/ld
@@ -0,0 +1,30 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-only
+
+# Dummy script that always succeeds.
+
+# Check if the first parameter appears in the rest. Succeeds if found.
+# This helper is useful if a particular option was passed to this script.
+# Typically used like this:
+# arg_contain <word-you-are-searching-for> "$@"
+arg_contain ()
+{
+ search="$1"
+ shift
+
+ while [ $# -gt 0 ]
+ do
+ if [ "$search" = "$1" ]; then
+ return 0
+ fi
+ shift
+ done
+
+ return 1
+}
+
+if arg_contain --version "$@" || arg_contain -v "$@"; then
+ progname=$(basename $0)
+ echo "GNU $progname (scripts/dummy-tools/$progname) 2.50"
+ exit 0
+fi
diff --git a/scripts/dummy-tools/nm b/scripts/dummy-tools/nm
new file mode 120000
index 000000000000..c0648b38dd42
--- /dev/null
+++ b/scripts/dummy-tools/nm
@@ -0,0 +1 @@
+ld \ No newline at end of file
diff --git a/scripts/dummy-tools/objcopy b/scripts/dummy-tools/objcopy
new file mode 120000
index 000000000000..c0648b38dd42
--- /dev/null
+++ b/scripts/dummy-tools/objcopy
@@ -0,0 +1 @@
+ld \ No newline at end of file
diff --git a/scripts/gcc-plugin.sh b/scripts/gcc-plugin.sh
index d3caefe53eab..b79fd0bea838 100755
--- a/scripts/gcc-plugin.sh
+++ b/scripts/gcc-plugin.sh
@@ -1,49 +1,14 @@
#!/bin/sh
# SPDX-License-Identifier: GPL-2.0
-srctree=$(dirname "$0")
-
-SHOW_ERROR=
-if [ "$1" = "--show-error" ] ; then
- SHOW_ERROR=1
- shift || true
-fi
-
-gccplugins_dir=$($3 -print-file-name=plugin)
-plugincc=$($1 -E -x c++ - -o /dev/null -I"${srctree}"/gcc-plugins -I"${gccplugins_dir}"/include 2>&1 <<EOF
-#include "gcc-common.h"
-#if BUILDING_GCC_VERSION >= 4008 || defined(ENABLE_BUILD_WITH_CXX)
-#warning $2 CXX
-#else
-#warning $1 CC
-#endif
-EOF
-)
-if [ $? -ne 0 ]
-then
- if [ -n "$SHOW_ERROR" ] ; then
- echo "${plugincc}" >&2
- fi
- exit 1
-fi
+set -e
-case "$plugincc" in
- *"$1 CC"*)
- echo "$1"
- exit 0
- ;;
-
- *"$2 CXX"*)
- # the c++ compiler needs another test, see below
- ;;
+srctree=$(dirname "$0")
- *)
- exit 1
- ;;
-esac
+gccplugins_dir=$($* -print-file-name=plugin)
# we need a c++ compiler that supports the designated initializer GNU extension
-plugincc=$($2 -c -x c++ -std=gnu++98 - -fsyntax-only -I"${srctree}"/gcc-plugins -I"${gccplugins_dir}"/include 2>&1 <<EOF
+$HOSTCC -c -x c++ -std=gnu++98 - -fsyntax-only -I $srctree/gcc-plugins -I $gccplugins_dir/include 2>/dev/null <<EOF
#include "gcc-common.h"
class test {
public:
@@ -52,15 +17,3 @@ public:
.test = 1
};
EOF
-)
-
-if [ $? -eq 0 ]
-then
- echo "$2"
- exit 0
-fi
-
-if [ -n "$SHOW_ERROR" ] ; then
- echo "${plugincc}" >&2
-fi
-exit 1
diff --git a/scripts/gcc-plugins/Kconfig b/scripts/gcc-plugins/Kconfig
index f8ca236d6165..013ba3a57669 100644
--- a/scripts/gcc-plugins/Kconfig
+++ b/scripts/gcc-plugins/Kconfig
@@ -1,13 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
-preferred-plugin-hostcc := $(if-success,[ $(gcc-version) -ge 40800 ],$(HOSTCXX),$(HOSTCC))
-
-config PLUGIN_HOSTCC
- string
- default "$(shell,$(srctree)/scripts/gcc-plugin.sh "$(preferred-plugin-hostcc)" "$(HOSTCXX)" "$(CC)")" if CC_IS_GCC
- help
- Host compiler used to build GCC plugins. This can be $(HOSTCXX),
- $(HOSTCC), or a null string if GCC plugin is unsupported.
-
config HAVE_GCC_PLUGINS
bool
help
@@ -17,7 +8,8 @@ config HAVE_GCC_PLUGINS
menuconfig GCC_PLUGINS
bool "GCC plugins"
depends on HAVE_GCC_PLUGINS
- depends on PLUGIN_HOSTCC != ""
+ depends on CC_IS_GCC && GCC_VERSION >= 40800
+ depends on $(success,$(srctree)/scripts/gcc-plugin.sh $(CC))
default y
help
GCC plugins are loadable modules that provide extra features to the
diff --git a/scripts/gcc-plugins/Makefile b/scripts/gcc-plugins/Makefile
index efff00959a9c..f22858b2c3d6 100644
--- a/scripts/gcc-plugins/Makefile
+++ b/scripts/gcc-plugins/Makefile
@@ -1,18 +1,9 @@
# SPDX-License-Identifier: GPL-2.0
-PLUGINCC := $(CONFIG_PLUGIN_HOSTCC:"%"=%)
GCC_PLUGINS_DIR := $(shell $(CC) -print-file-name=plugin)
-ifeq ($(PLUGINCC),$(HOSTCC))
- HOSTLIBS := hostlibs
- HOST_EXTRACFLAGS += -I$(GCC_PLUGINS_DIR)/include -I$(src) -std=gnu99 -ggdb
- export HOST_EXTRACFLAGS
-else
- HOSTLIBS := hostcxxlibs
- HOST_EXTRACXXFLAGS += -I$(GCC_PLUGINS_DIR)/include -I$(src) -std=gnu++98 -fno-rtti
- HOST_EXTRACXXFLAGS += -fno-exceptions -fasynchronous-unwind-tables -ggdb
- HOST_EXTRACXXFLAGS += -Wno-narrowing -Wno-unused-variable -Wno-c++11-compat
- export HOST_EXTRACXXFLAGS
-endif
+HOST_EXTRACXXFLAGS += -I$(GCC_PLUGINS_DIR)/include -I$(src) -std=gnu++98 -fno-rtti
+HOST_EXTRACXXFLAGS += -fno-exceptions -fasynchronous-unwind-tables -ggdb
+HOST_EXTRACXXFLAGS += -Wno-narrowing -Wno-unused-variable -Wno-c++11-compat
$(obj)/randomize_layout_plugin.o: $(objtree)/$(obj)/randomize_layout_seed.h
quiet_cmd_create_randomize_layout_seed = GENSEED $@
@@ -22,9 +13,9 @@ $(objtree)/$(obj)/randomize_layout_seed.h: FORCE
$(call if_changed,create_randomize_layout_seed)
targets = randomize_layout_seed.h randomize_layout_hash.h
-$(HOSTLIBS)-y := $(foreach p,$(GCC_PLUGIN),$(if $(findstring /,$(p)),,$(p)))
-always-y := $($(HOSTLIBS)-y)
+hostcxxlibs-y := $(foreach p,$(GCC_PLUGIN),$(if $(findstring /,$(p)),,$(p)))
+always-y := $(hostcxxlibs-y)
-$(foreach p,$($(HOSTLIBS)-y:%.so=%),$(eval $(p)-objs := $(p).o))
+$(foreach p,$(hostcxxlibs-y:%.so=%),$(eval $(p)-objs := $(p).o))
clean-files += *.so
diff --git a/scripts/kconfig/qconf.cc b/scripts/kconfig/qconf.cc
index 50a5245d87bb..c0ac8f7b5f1a 100644
--- a/scripts/kconfig/qconf.cc
+++ b/scripts/kconfig/qconf.cc
@@ -154,9 +154,9 @@ void ConfigItem::updateMenu(void)
if (!sym_is_changeable(sym) && list->optMode == normalOpt) {
setPixmap(promptColIdx, QIcon());
- setText(noColIdx, QString::null);
- setText(modColIdx, QString::null);
- setText(yesColIdx, QString::null);
+ setText(noColIdx, QString());
+ setText(modColIdx, QString());
+ setText(yesColIdx, QString());
break;
}
expr = sym_get_tristate_value(sym);
@@ -276,7 +276,7 @@ void ConfigLineEdit::show(ConfigItem* i)
if (sym_get_string_value(item->menu->sym))
setText(QString::fromLocal8Bit(sym_get_string_value(item->menu->sym)));
else
- setText(QString::null);
+ setText(QString());
Parent::show();
setFocus();
}
@@ -316,7 +316,10 @@ ConfigList::ConfigList(ConfigView* p, const char *name)
setVerticalScrollMode(ScrollPerPixel);
setHorizontalScrollMode(ScrollPerPixel);
- setHeaderLabels(QStringList() << "Option" << "Name" << "N" << "M" << "Y" << "Value");
+ if (mode == symbolMode)
+ setHeaderLabels(QStringList() << "Item" << "Name" << "N" << "M" << "Y" << "Value");
+ else
+ setHeaderLabels(QStringList() << "Option" << "Name" << "N" << "M" << "Y" << "Value");
connect(this, SIGNAL(itemSelectionChanged(void)),
SLOT(updateSelection(void)));
@@ -397,6 +400,11 @@ void ConfigList::updateSelection(void)
struct menu *menu;
enum prop_type type;
+ if (mode == symbolMode)
+ setHeaderLabels(QStringList() << "Item" << "Name" << "N" << "M" << "Y" << "Value");
+ else
+ setHeaderLabels(QStringList() << "Option" << "Name" << "N" << "M" << "Y" << "Value");
+
if (selectedItems().count() == 0)
return;
@@ -625,7 +633,7 @@ void ConfigList::updateMenuList(ConfigItem *parent, struct menu* menu)
last = item;
continue;
}
- hide:
+hide:
if (item && item->menu == child) {
last = parent->firstChild();
if (last == item)
@@ -690,7 +698,7 @@ void ConfigList::updateMenuList(ConfigList *parent, struct menu* menu)
last = item;
continue;
}
- hide:
+hide:
if (item && item->menu == child) {
last = (ConfigItem*)parent->topLevelItem(0);
if (last == item)
@@ -734,7 +742,10 @@ void ConfigList::keyPressEvent(QKeyEvent* ev)
type = menu->prompt ? menu->prompt->type : P_UNKNOWN;
if (type == P_MENU && rootEntry != menu &&
mode != fullMode && mode != menuMode) {
- emit menuSelected(menu);
+ if (mode == menuMode)
+ emit menuSelected(menu);
+ else
+ emit itemSelected(menu);
break;
}
case Qt::Key_Space:
@@ -826,7 +837,7 @@ void ConfigList::mouseMoveEvent(QMouseEvent* e)
void ConfigList::mouseDoubleClickEvent(QMouseEvent* e)
{
- QPoint p = e->pos(); // TODO: Check if this works(was contentsToViewport).
+ QPoint p = e->pos();
ConfigItem* item = (ConfigItem*)itemAt(p);
struct menu *menu;
enum prop_type ptype;
@@ -841,9 +852,12 @@ void ConfigList::mouseDoubleClickEvent(QMouseEvent* e)
if (!menu)
goto skip;
ptype = menu->prompt ? menu->prompt->type : P_UNKNOWN;
- if (ptype == P_MENU && (mode == singleMode || mode == symbolMode))
- emit menuSelected(menu);
- else if (menu->sym)
+ if (ptype == P_MENU) {
+ if (mode == singleMode)
+ emit itemSelected(menu);
+ else if (mode == symbolMode)
+ emit menuSelected(menu);
+ } else if (menu->sym)
changeValue(item);
skip:
@@ -1223,10 +1237,11 @@ QMenu* ConfigInfoView::createStandardContextMenu(const QPoint & pos)
{
QMenu* popup = Parent::createStandardContextMenu(pos);
QAction* action = new QAction("Show Debug Info", popup);
- action->setCheckable(true);
- connect(action, SIGNAL(toggled(bool)), SLOT(setShowDebug(bool)));
- connect(this, SIGNAL(showDebugChanged(bool)), action, SLOT(setOn(bool)));
- action->setChecked(showDebug());
+
+ action->setCheckable(true);
+ connect(action, SIGNAL(toggled(bool)), SLOT(setShowDebug(bool)));
+ connect(this, SIGNAL(showDebugChanged(bool)), action, SLOT(setOn(bool)));
+ action->setChecked(showDebug());
popup->addSeparator();
popup->addAction(action);
return popup;
@@ -1352,21 +1367,32 @@ ConfigMainWindow::ConfigMainWindow(void)
if ((x.isValid())&&(y.isValid()))
move(x.toInt(), y.toInt());
- split1 = new QSplitter(this);
+ QWidget *widget = new QWidget(this);
+ QVBoxLayout *layout = new QVBoxLayout(widget);
+ setCentralWidget(widget);
+
+ split1 = new QSplitter(widget);
split1->setOrientation(Qt::Horizontal);
- setCentralWidget(split1);
+ split1->setChildrenCollapsible(false);
- menuView = new ConfigView(split1, "menu");
+ menuView = new ConfigView(widget, "menu");
menuList = menuView->list;
- split2 = new QSplitter(split1);
+ split2 = new QSplitter(widget);
+ split2->setChildrenCollapsible(false);
split2->setOrientation(Qt::Vertical);
// create config tree
- configView = new ConfigView(split2, "config");
+ configView = new ConfigView(widget, "config");
configList = configView->list;
- helpText = new ConfigInfoView(split2, "help");
+ helpText = new ConfigInfoView(widget, "help");
+
+ layout->addWidget(split2);
+ split2->addWidget(split1);
+ split1->addWidget(configView);
+ split1->addWidget(menuView);
+ split2->addWidget(helpText);
setTabOrder(configList, helpText);
configList->setFocus();
@@ -1484,6 +1510,8 @@ ConfigMainWindow::ConfigMainWindow(void)
helpText, SLOT(setInfo(struct menu *)));
connect(configList, SIGNAL(menuSelected(struct menu *)),
SLOT(changeMenu(struct menu *)));
+ connect(configList, SIGNAL(itemSelected(struct menu *)),
+ SLOT(changeItens(struct menu *)));
connect(configList, SIGNAL(parentSelected()),
SLOT(goBack()));
connect(menuList, SIGNAL(menuChanged(struct menu *)),
@@ -1580,15 +1608,26 @@ void ConfigMainWindow::searchConfig(void)
searchWindow->show();
}
-void ConfigMainWindow::changeMenu(struct menu *menu)
+void ConfigMainWindow::changeItens(struct menu *menu)
{
configList->setRootMenu(menu);
+
if (configList->rootEntry->parent == &rootmenu)
backAction->setEnabled(false);
else
backAction->setEnabled(true);
}
+void ConfigMainWindow::changeMenu(struct menu *menu)
+{
+ menuList->setRootMenu(menu);
+
+ if (menuList->rootEntry->parent == &rootmenu)
+ backAction->setEnabled(false);
+ else
+ backAction->setEnabled(true);
+}
+
void ConfigMainWindow::setMenuLink(struct menu *menu)
{
struct menu *parent;
@@ -1698,14 +1737,14 @@ void ConfigMainWindow::showSplitView(void)
fullViewAction->setEnabled(true);
fullViewAction->setChecked(false);
- configList->mode = symbolMode;
+ configList->mode = menuMode;
if (configList->rootEntry == &rootmenu)
configList->updateListAll();
else
configList->setRootMenu(&rootmenu);
configList->setAllOpen(true);
configApp->processEvents();
- menuList->mode = menuMode;
+ menuList->mode = symbolMode;
menuList->setRootMenu(&rootmenu);
menuList->setAllOpen(true);
menuView->show();
@@ -1733,7 +1772,6 @@ void ConfigMainWindow::showFullView(void)
/*
* ask for saving configuration before quitting
- * TODO ask only when something changed
*/
void ConfigMainWindow::closeEvent(QCloseEvent* e)
{
diff --git a/scripts/kconfig/qconf.h b/scripts/kconfig/qconf.h
index 45bfe9b2b966..c879d79ce817 100644
--- a/scripts/kconfig/qconf.h
+++ b/scripts/kconfig/qconf.h
@@ -71,6 +71,7 @@ public slots:
signals:
void menuChanged(struct menu *menu);
void menuSelected(struct menu *menu);
+ void itemSelected(struct menu *menu);
void parentSelected(void);
void gotFocus(struct menu *);
@@ -298,6 +299,7 @@ public:
ConfigMainWindow(void);
public slots:
void changeMenu(struct menu *);
+ void changeItens(struct menu *);
void setMenuLink(struct menu *);
void listFocusChanged(void);
void goBack(void);
diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h
index 3ff26e5b2eac..5b80a4699740 100755
--- a/scripts/mkcompile_h
+++ b/scripts/mkcompile_h
@@ -7,6 +7,7 @@ SMP=$3
PREEMPT=$4
PREEMPT_RT=$5
CC=$6
+LD=$7
vecho() { [ "${quiet}" = "silent_" ] || echo "$@" ; }
@@ -61,7 +62,10 @@ UTS_VERSION="$(echo $UTS_VERSION $CONFIG_FLAGS $TIMESTAMP | cut -b -$UTS_LEN)"
printf '#define LINUX_COMPILE_BY "%s"\n' "$LINUX_COMPILE_BY"
echo \#define LINUX_COMPILE_HOST \"$LINUX_COMPILE_HOST\"
- echo \#define LINUX_COMPILER \"`$CC -v 2>&1 | grep ' version ' | sed 's/[[:space:]]*$//'`\"
+ CC_VERSION=$($CC -v 2>&1 | grep ' version ' | sed 's/[[:space:]]*$//')
+ LD_VERSION=$($LD -v | head -n1 | sed 's/(compatible with [^)]*)//' \
+ | sed 's/[[:space:]]*$//')
+ printf '#define LINUX_COMPILER "%s"\n' "$CC_VERSION, $LD_VERSION"
} > .tmpcompile
# Only replace the real compile.h if the new one is different,
diff --git a/sound/core/oss/pcm_plugin.c b/sound/core/oss/pcm_plugin.c
index fbda4ebf38b3..59d62f05658f 100644
--- a/sound/core/oss/pcm_plugin.c
+++ b/sound/core/oss/pcm_plugin.c
@@ -197,7 +197,8 @@ int snd_pcm_plugin_free(struct snd_pcm_plugin *plugin)
}
static snd_pcm_sframes_t calc_dst_frames(struct snd_pcm_substream *plug,
- snd_pcm_sframes_t frames)
+ snd_pcm_sframes_t frames,
+ bool check_size)
{
struct snd_pcm_plugin *plugin, *plugin_next;
@@ -209,7 +210,7 @@ static snd_pcm_sframes_t calc_dst_frames(struct snd_pcm_substream *plug,
if (frames < 0)
return frames;
}
- if (frames > plugin->buf_frames)
+ if (check_size && frames > plugin->buf_frames)
frames = plugin->buf_frames;
plugin = plugin_next;
}
@@ -217,13 +218,14 @@ static snd_pcm_sframes_t calc_dst_frames(struct snd_pcm_substream *plug,
}
static snd_pcm_sframes_t calc_src_frames(struct snd_pcm_substream *plug,
- snd_pcm_sframes_t frames)
+ snd_pcm_sframes_t frames,
+ bool check_size)
{
struct snd_pcm_plugin *plugin, *plugin_prev;
plugin = snd_pcm_plug_last(plug);
while (plugin && frames > 0) {
- if (frames > plugin->buf_frames)
+ if (check_size && frames > plugin->buf_frames)
frames = plugin->buf_frames;
plugin_prev = plugin->prev;
if (plugin->src_frames) {
@@ -242,9 +244,9 @@ snd_pcm_sframes_t snd_pcm_plug_client_size(struct snd_pcm_substream *plug, snd_p
return -ENXIO;
switch (snd_pcm_plug_stream(plug)) {
case SNDRV_PCM_STREAM_PLAYBACK:
- return calc_src_frames(plug, drv_frames);
+ return calc_src_frames(plug, drv_frames, false);
case SNDRV_PCM_STREAM_CAPTURE:
- return calc_dst_frames(plug, drv_frames);
+ return calc_dst_frames(plug, drv_frames, false);
default:
snd_BUG();
return -EINVAL;
@@ -257,9 +259,9 @@ snd_pcm_sframes_t snd_pcm_plug_slave_size(struct snd_pcm_substream *plug, snd_pc
return -ENXIO;
switch (snd_pcm_plug_stream(plug)) {
case SNDRV_PCM_STREAM_PLAYBACK:
- return calc_dst_frames(plug, clt_frames);
+ return calc_dst_frames(plug, clt_frames, false);
case SNDRV_PCM_STREAM_CAPTURE:
- return calc_src_frames(plug, clt_frames);
+ return calc_src_frames(plug, clt_frames, false);
default:
snd_BUG();
return -EINVAL;
@@ -622,7 +624,7 @@ snd_pcm_sframes_t snd_pcm_plug_write_transfer(struct snd_pcm_substream *plug, st
src_channels = dst_channels;
plugin = next;
}
- return snd_pcm_plug_client_size(plug, frames);
+ return calc_src_frames(plug, frames, true);
}
snd_pcm_sframes_t snd_pcm_plug_read_transfer(struct snd_pcm_substream *plug, struct snd_pcm_plugin_channel *dst_channels_final, snd_pcm_uframes_t size)
@@ -632,7 +634,7 @@ snd_pcm_sframes_t snd_pcm_plug_read_transfer(struct snd_pcm_substream *plug, str
snd_pcm_sframes_t frames = size;
int err;
- frames = snd_pcm_plug_slave_size(plug, frames);
+ frames = calc_src_frames(plug, frames, true);
if (frames < 0)
return frames;
diff --git a/sound/pci/hda/hda_beep.c b/sound/pci/hda/hda_beep.c
index f5fd62ed4df5..841523f6b88d 100644
--- a/sound/pci/hda/hda_beep.c
+++ b/sound/pci/hda/hda_beep.c
@@ -290,8 +290,12 @@ int snd_hda_mixer_amp_switch_get_beep(struct snd_kcontrol *kcontrol,
{
struct hda_codec *codec = snd_kcontrol_chip(kcontrol);
struct hda_beep *beep = codec->beep;
+ int chs = get_amp_channels(kcontrol);
+
if (beep && (!beep->enabled || !ctl_has_mute(kcontrol))) {
- ucontrol->value.integer.value[0] =
+ if (chs & 1)
+ ucontrol->value.integer.value[0] = beep->enabled;
+ if (chs & 2)
ucontrol->value.integer.value[1] = beep->enabled;
return 0;
}
diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c
index 92a042e34d3e..bd093593f8fb 100644
--- a/sound/pci/hda/hda_intel.c
+++ b/sound/pci/hda/hda_intel.c
@@ -2076,6 +2076,17 @@ static void pcm_mmap_prepare(struct snd_pcm_substream *substream,
#endif
}
+/* Blacklist for skipping the whole probe:
+ * some HD-audio PCI entries are exposed without any codecs, and such devices
+ * should be ignored from the beginning.
+ */
+static const struct snd_pci_quirk driver_blacklist[] = {
+ SND_PCI_QUIRK(0x1043, 0x874f, "ASUS ROG Zenith II / Strix", 0),
+ SND_PCI_QUIRK(0x1462, 0xcb59, "MSI TRX40 Creator", 0),
+ SND_PCI_QUIRK(0x1462, 0xcb60, "MSI TRX40", 0),
+ {}
+};
+
static const struct hda_controller_ops pci_hda_ops = {
.disable_msi_reset_irq = disable_msi_reset_irq,
.pcm_mmap_prepare = pcm_mmap_prepare,
@@ -2092,6 +2103,11 @@ static int azx_probe(struct pci_dev *pci,
bool schedule_probe;
int err;
+ if (snd_pci_quirk_lookup(pci, driver_blacklist)) {
+ dev_info(&pci->dev, "Skipping the blacklisted device\n");
+ return -ENODEV;
+ }
+
if (dev >= SNDRV_CARDS)
return -ENODEV;
if (!enable[dev]) {
diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index f66a48154a57..de2826f90d34 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -86,6 +86,14 @@ struct alc_spec {
unsigned int gpio_mute_led_mask;
unsigned int gpio_mic_led_mask;
+ unsigned int mute_led_coef_idx;
+ unsigned int mute_led_coefbit_mask;
+ unsigned int mute_led_coefbit_on;
+ unsigned int mute_led_coefbit_off;
+ unsigned int mic_led_coef_idx;
+ unsigned int mic_led_coefbit_mask;
+ unsigned int mic_led_coefbit_on;
+ unsigned int mic_led_coefbit_off;
hda_nid_t headset_mic_pin;
hda_nid_t headphone_mic_pin;
@@ -2447,6 +2455,7 @@ static const struct snd_pci_quirk alc882_fixup_tbl[] = {
SND_PCI_QUIRK(0x1458, 0xa0b8, "Gigabyte AZ370-Gaming", ALC1220_FIXUP_GB_DUAL_CODECS),
SND_PCI_QUIRK(0x1458, 0xa0cd, "Gigabyte X570 Aorus Master", ALC1220_FIXUP_CLEVO_P950),
SND_PCI_QUIRK(0x1462, 0x1228, "MSI-GP63", ALC1220_FIXUP_CLEVO_P950),
+ SND_PCI_QUIRK(0x1462, 0x1275, "MSI-GL63", ALC1220_FIXUP_CLEVO_P950),
SND_PCI_QUIRK(0x1462, 0x1276, "MSI-GL73", ALC1220_FIXUP_CLEVO_P950),
SND_PCI_QUIRK(0x1462, 0x1293, "MSI-GP65", ALC1220_FIXUP_CLEVO_P950),
SND_PCI_QUIRK(0x1462, 0x7350, "MSI-7350", ALC889_FIXUP_CD),
@@ -4178,6 +4187,111 @@ static void alc280_fixup_hp_gpio4(struct hda_codec *codec,
}
}
+/* update mute-LED according to the speaker mute state via COEF bit */
+static void alc_fixup_mute_led_coefbit_hook(void *private_data, int enabled)
+{
+ struct hda_codec *codec = private_data;
+ struct alc_spec *spec = codec->spec;
+
+ if (spec->mute_led_polarity)
+ enabled = !enabled;
+
+ /* temporarily power up/down for setting COEF bit */
+ enabled ? alc_update_coef_idx(codec, spec->mute_led_coef_idx,
+ spec->mute_led_coefbit_mask, spec->mute_led_coefbit_off) :
+ alc_update_coef_idx(codec, spec->mute_led_coef_idx,
+ spec->mute_led_coefbit_mask, spec->mute_led_coefbit_on);
+}
+
+static void alc285_fixup_hp_mute_led_coefbit(struct hda_codec *codec,
+ const struct hda_fixup *fix,
+ int action)
+{
+ struct alc_spec *spec = codec->spec;
+
+ if (action == HDA_FIXUP_ACT_PRE_PROBE) {
+ spec->mute_led_polarity = 0;
+ spec->mute_led_coef_idx = 0x0b;
+ spec->mute_led_coefbit_mask = 1<<3;
+ spec->mute_led_coefbit_on = 1<<3;
+ spec->mute_led_coefbit_off = 0;
+ spec->gen.vmaster_mute.hook = alc_fixup_mute_led_coefbit_hook;
+ spec->gen.vmaster_mute_enum = 1;
+ }
+}
+
+static void alc236_fixup_hp_mute_led_coefbit(struct hda_codec *codec,
+ const struct hda_fixup *fix,
+ int action)
+{
+ struct alc_spec *spec = codec->spec;
+
+ if (action == HDA_FIXUP_ACT_PRE_PROBE) {
+ spec->mute_led_polarity = 0;
+ spec->mute_led_coef_idx = 0x34;
+ spec->mute_led_coefbit_mask = 1<<5;
+ spec->mute_led_coefbit_on = 0;
+ spec->mute_led_coefbit_off = 1<<5;
+ spec->gen.vmaster_mute.hook = alc_fixup_mute_led_coefbit_hook;
+ spec->gen.vmaster_mute_enum = 1;
+ }
+}
+
+/* turn on/off mic-mute LED per capture hook by coef bit */
+static void alc_hp_cap_micmute_update(struct hda_codec *codec)
+{
+ struct alc_spec *spec = codec->spec;
+
+ if (spec->gen.micmute_led.led_value)
+ alc_update_coef_idx(codec, spec->mic_led_coef_idx,
+ spec->mic_led_coefbit_mask, spec->mic_led_coefbit_on);
+ else
+ alc_update_coef_idx(codec, spec->mic_led_coef_idx,
+ spec->mic_led_coefbit_mask, spec->mic_led_coefbit_off);
+}
+
+static void alc285_fixup_hp_coef_micmute_led(struct hda_codec *codec,
+ const struct hda_fixup *fix, int action)
+{
+ struct alc_spec *spec = codec->spec;
+
+ if (action == HDA_FIXUP_ACT_PRE_PROBE) {
+ spec->mic_led_coef_idx = 0x19;
+ spec->mic_led_coefbit_mask = 1<<13;
+ spec->mic_led_coefbit_on = 1<<13;
+ spec->mic_led_coefbit_off = 0;
+ snd_hda_gen_add_micmute_led(codec, alc_hp_cap_micmute_update);
+ }
+}
+
+static void alc236_fixup_hp_coef_micmute_led(struct hda_codec *codec,
+ const struct hda_fixup *fix, int action)
+{
+ struct alc_spec *spec = codec->spec;
+
+ if (action == HDA_FIXUP_ACT_PRE_PROBE) {
+ spec->mic_led_coef_idx = 0x35;
+ spec->mic_led_coefbit_mask = 3<<2;
+ spec->mic_led_coefbit_on = 2<<2;
+ spec->mic_led_coefbit_off = 1<<2;
+ snd_hda_gen_add_micmute_led(codec, alc_hp_cap_micmute_update);
+ }
+}
+
+static void alc285_fixup_hp_mute_led(struct hda_codec *codec,
+ const struct hda_fixup *fix, int action)
+{
+ alc285_fixup_hp_mute_led_coefbit(codec, fix, action);
+ alc285_fixup_hp_coef_micmute_led(codec, fix, action);
+}
+
+static void alc236_fixup_hp_mute_led(struct hda_codec *codec,
+ const struct hda_fixup *fix, int action)
+{
+ alc236_fixup_hp_mute_led_coefbit(codec, fix, action);
+ alc236_fixup_hp_coef_micmute_led(codec, fix, action);
+}
+
#if IS_REACHABLE(CONFIG_INPUT)
static void gpio2_mic_hotkey_event(struct hda_codec *codec,
struct hda_jack_callback *event)
@@ -5964,6 +6078,8 @@ enum {
ALC285_FIXUP_THINKPAD_HEADSET_JACK,
ALC294_FIXUP_ASUS_HPE,
ALC285_FIXUP_HP_GPIO_LED,
+ ALC285_FIXUP_HP_MUTE_LED,
+ ALC236_FIXUP_HP_MUTE_LED,
};
static const struct hda_fixup alc269_fixups[] = {
@@ -7089,6 +7205,14 @@ static const struct hda_fixup alc269_fixups[] = {
.type = HDA_FIXUP_FUNC,
.v.func = alc285_fixup_hp_gpio_led,
},
+ [ALC285_FIXUP_HP_MUTE_LED] = {
+ .type = HDA_FIXUP_FUNC,
+ .v.func = alc285_fixup_hp_mute_led,
+ },
+ [ALC236_FIXUP_HP_MUTE_LED] = {
+ .type = HDA_FIXUP_FUNC,
+ .v.func = alc236_fixup_hp_mute_led,
+ },
};
static const struct snd_pci_quirk alc269_fixup_tbl[] = {
@@ -7234,6 +7358,8 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
SND_PCI_QUIRK(0x103c, 0x8497, "HP Envy x360", ALC269_FIXUP_HP_MUTE_LED_MIC3),
SND_PCI_QUIRK(0x103c, 0x84e7, "HP Pavilion 15", ALC269_FIXUP_HP_MUTE_LED_MIC3),
SND_PCI_QUIRK(0x103c, 0x8736, "HP", ALC285_FIXUP_HP_GPIO_LED),
+ SND_PCI_QUIRK(0x103c, 0x877a, "HP", ALC285_FIXUP_HP_MUTE_LED),
+ SND_PCI_QUIRK(0x103c, 0x877d, "HP", ALC236_FIXUP_HP_MUTE_LED),
SND_PCI_QUIRK(0x1043, 0x103e, "ASUS X540SA", ALC256_FIXUP_ASUS_MIC),
SND_PCI_QUIRK(0x1043, 0x103f, "ASUS TX300", ALC282_FIXUP_ASUS_TX300),
SND_PCI_QUIRK(0x1043, 0x106d, "Asus K53BE", ALC269_FIXUP_LIMIT_INT_MIC_BOOST),
@@ -7325,6 +7451,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
SND_PCI_QUIRK(0x17aa, 0x225d, "Thinkpad T480", ALC269_FIXUP_LIMIT_INT_MIC_BOOST),
SND_PCI_QUIRK(0x17aa, 0x2292, "Thinkpad X1 Yoga 7th", ALC285_FIXUP_THINKPAD_HEADSET_JACK),
SND_PCI_QUIRK(0x17aa, 0x2293, "Thinkpad X1 Carbon 7th", ALC285_FIXUP_THINKPAD_HEADSET_JACK),
+ SND_PCI_QUIRK(0x17aa, 0x22be, "Thinkpad X1 Carbon 8th", ALC285_FIXUP_THINKPAD_HEADSET_JACK),
SND_PCI_QUIRK(0x17aa, 0x30bb, "ThinkCentre AIO", ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY),
SND_PCI_QUIRK(0x17aa, 0x30e2, "ThinkCentre AIO", ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY),
SND_PCI_QUIRK(0x17aa, 0x310c, "ThinkCentre Station", ALC294_FIXUP_LENOVO_MIC_LOCATION),
diff --git a/sound/pci/ice1712/prodigy_hifi.c b/sound/pci/ice1712/prodigy_hifi.c
index 91f83cef0e56..9aa12a67d370 100644
--- a/sound/pci/ice1712/prodigy_hifi.c
+++ b/sound/pci/ice1712/prodigy_hifi.c
@@ -536,7 +536,7 @@ static int wm_adc_mux_enum_get(struct snd_kcontrol *kcontrol,
struct snd_ice1712 *ice = snd_kcontrol_chip(kcontrol);
mutex_lock(&ice->gpio_mutex);
- ucontrol->value.integer.value[0] = wm_get(ice, WM_ADC_MUX) & 0x1f;
+ ucontrol->value.enumerated.item[0] = wm_get(ice, WM_ADC_MUX) & 0x1f;
mutex_unlock(&ice->gpio_mutex);
return 0;
}
@@ -550,7 +550,7 @@ static int wm_adc_mux_enum_put(struct snd_kcontrol *kcontrol,
mutex_lock(&ice->gpio_mutex);
oval = wm_get(ice, WM_ADC_MUX);
- nval = (oval & 0xe0) | ucontrol->value.integer.value[0];
+ nval = (oval & 0xe0) | ucontrol->value.enumerated.item[0];
if (nval != oval) {
wm_put(ice, WM_ADC_MUX, nval);
change = 1;
diff --git a/sound/soc/amd/raven/acp3x-i2s.c b/sound/soc/amd/raven/acp3x-i2s.c
index 3a3c47e820ab..f160d35a6832 100644
--- a/sound/soc/amd/raven/acp3x-i2s.c
+++ b/sound/soc/amd/raven/acp3x-i2s.c
@@ -139,6 +139,7 @@ static int acp3x_i2s_hwparams(struct snd_pcm_substream *substream,
rv_writel(adata->tdm_fmt, rtd->acp3x_base + frmt_reg);
}
val = rv_readl(rtd->acp3x_base + reg_val);
+ val &= ~ACP3x_ITER_IRER_SAMP_LEN_MASK;
val = val | (rtd->xfer_resolution << 3);
rv_writel(val, rtd->acp3x_base + reg_val);
return 0;
diff --git a/sound/soc/amd/raven/acp3x.h b/sound/soc/amd/raven/acp3x.h
index 21e7ac017f2b..03fe93913e12 100644
--- a/sound/soc/amd/raven/acp3x.h
+++ b/sound/soc/amd/raven/acp3x.h
@@ -76,6 +76,8 @@
#define ACP_POWERED_OFF 0x02
#define ACP_POWER_OFF_IN_PROGRESS 0x03
+#define ACP3x_ITER_IRER_SAMP_LEN_MASK 0x38
+
struct acp3x_platform_info {
u16 play_i2s_instance;
u16 cap_i2s_instance;
diff --git a/sound/soc/bcm/bcm63xx-pcm-whistler.c b/sound/soc/bcm/bcm63xx-pcm-whistler.c
index e46c390683e7..b7a1efc7406e 100644
--- a/sound/soc/bcm/bcm63xx-pcm-whistler.c
+++ b/sound/soc/bcm/bcm63xx-pcm-whistler.c
@@ -181,7 +181,7 @@ bcm63xx_pcm_pointer(struct snd_soc_component *component,
snd_pcm_uframes_t x;
struct bcm63xx_runtime_data *prtd = substream->runtime->private_data;
- if ((void *)prtd->dma_addr_next == NULL)
+ if (!prtd->dma_addr_next)
prtd->dma_addr_next = substream->runtime->dma_addr;
x = bytes_to_frames(substream->runtime,
diff --git a/sound/soc/codecs/cs4270.c b/sound/soc/codecs/cs4270.c
index 5f25b9f872bd..8a02791e44ad 100644
--- a/sound/soc/codecs/cs4270.c
+++ b/sound/soc/codecs/cs4270.c
@@ -137,6 +137,9 @@ struct cs4270_private {
/* power domain regulators */
struct regulator_bulk_data supplies[ARRAY_SIZE(supply_names)];
+
+ /* reset gpio */
+ struct gpio_desc *reset_gpio;
};
static const struct snd_soc_dapm_widget cs4270_dapm_widgets[] = {
@@ -649,6 +652,22 @@ static const struct regmap_config cs4270_regmap = {
};
/**
+ * cs4270_i2c_remove - deinitialize the I2C interface of the CS4270
+ * @i2c_client: the I2C client object
+ *
+ * This function puts the chip into low power mode when the i2c device
+ * is removed.
+ */
+static int cs4270_i2c_remove(struct i2c_client *i2c_client)
+{
+ struct cs4270_private *cs4270 = i2c_get_clientdata(i2c_client);
+
+ gpiod_set_value_cansleep(cs4270->reset_gpio, 0);
+
+ return 0;
+}
+
+/**
* cs4270_i2c_probe - initialize the I2C interface of the CS4270
* @i2c_client: the I2C client object
* @id: the I2C device ID (ignored)
@@ -660,7 +679,6 @@ static int cs4270_i2c_probe(struct i2c_client *i2c_client,
const struct i2c_device_id *id)
{
struct cs4270_private *cs4270;
- struct gpio_desc *reset_gpiod;
unsigned int val;
int ret, i;
@@ -679,10 +697,21 @@ static int cs4270_i2c_probe(struct i2c_client *i2c_client,
if (ret < 0)
return ret;
- reset_gpiod = devm_gpiod_get_optional(&i2c_client->dev, "reset",
- GPIOD_OUT_HIGH);
- if (PTR_ERR(reset_gpiod) == -EPROBE_DEFER)
- return -EPROBE_DEFER;
+ /* reset the device */
+ cs4270->reset_gpio = devm_gpiod_get_optional(&i2c_client->dev, "reset",
+ GPIOD_OUT_LOW);
+ if (IS_ERR(cs4270->reset_gpio)) {
+ dev_dbg(&i2c_client->dev, "Error getting CS4270 reset GPIO\n");
+ return PTR_ERR(cs4270->reset_gpio);
+ }
+
+ if (cs4270->reset_gpio) {
+ dev_dbg(&i2c_client->dev, "Found reset GPIO\n");
+ gpiod_set_value_cansleep(cs4270->reset_gpio, 1);
+ }
+
+ /* Sleep 500ns before i2c communications */
+ ndelay(500);
cs4270->regmap = devm_regmap_init_i2c(i2c_client, &cs4270_regmap);
if (IS_ERR(cs4270->regmap))
@@ -735,6 +764,7 @@ static struct i2c_driver cs4270_i2c_driver = {
},
.id_table = cs4270_id,
.probe = cs4270_i2c_probe,
+ .remove = cs4270_i2c_remove,
};
module_i2c_driver(cs4270_i2c_driver);
diff --git a/sound/soc/codecs/rt5645.c b/sound/soc/codecs/rt5645.c
index 92d67010aeed..6ba1849a77b0 100644
--- a/sound/soc/codecs/rt5645.c
+++ b/sound/soc/codecs/rt5645.c
@@ -3758,6 +3758,14 @@ static const struct dmi_system_id dmi_platform_data[] = {
},
.driver_data = (void *)&kahlee_platform_data,
},
+ {
+ .ident = "Medion E1239T",
+ .matches = {
+ DMI_EXACT_MATCH(DMI_SYS_VENDOR, "MEDION"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "E1239T MD60568"),
+ },
+ .driver_data = (void *)&intel_braswell_platform_data,
+ },
{ }
};
diff --git a/sound/soc/codecs/rt5682.c b/sound/soc/codecs/rt5682.c
index c9268a230daa..d36f560ad7a8 100644
--- a/sound/soc/codecs/rt5682.c
+++ b/sound/soc/codecs/rt5682.c
@@ -3703,7 +3703,7 @@ static const struct acpi_device_id rt5682_acpi_match[] = {
MODULE_DEVICE_TABLE(acpi, rt5682_acpi_match);
#endif
-static struct i2c_driver rt5682_i2c_driver = {
+static struct i2c_driver __maybe_unused rt5682_i2c_driver = {
.driver = {
.name = "rt5682",
.of_match_table = of_match_ptr(rt5682_of_match),
@@ -3713,7 +3713,10 @@ static struct i2c_driver rt5682_i2c_driver = {
.shutdown = rt5682_i2c_shutdown,
.id_table = rt5682_i2c_id,
};
+
+#ifdef CONFIG_I2C
module_i2c_driver(rt5682_i2c_driver);
+#endif
MODULE_DESCRIPTION("ASoC RT5682 driver");
MODULE_AUTHOR("Bard Liao <bardliao@realtek.com>");
diff --git a/sound/soc/intel/atom/sst-atom-controls.c b/sound/soc/intel/atom/sst-atom-controls.c
index f883c9340eee..69f3af4524ab 100644
--- a/sound/soc/intel/atom/sst-atom-controls.c
+++ b/sound/soc/intel/atom/sst-atom-controls.c
@@ -50,6 +50,8 @@ static int sst_fill_and_send_cmd_unlocked(struct sst_data *drv,
{
int ret = 0;
+ WARN_ON(!mutex_is_locked(&drv->lock));
+
ret = sst_fill_byte_control(drv, ipc_msg,
block, task_id, pipe_id, len, cmd_data);
if (ret < 0)
@@ -966,7 +968,9 @@ static int sst_set_be_modules(struct snd_soc_dapm_widget *w,
dev_dbg(c->dev, "Enter: widget=%s\n", w->name);
if (SND_SOC_DAPM_EVENT_ON(event)) {
+ mutex_lock(&drv->lock);
ret = sst_send_slot_map(drv);
+ mutex_unlock(&drv->lock);
if (ret)
return ret;
ret = sst_send_pipe_module_params(w, k);
diff --git a/sound/soc/intel/atom/sst/sst_pvt.c b/sound/soc/intel/atom/sst/sst_pvt.c
index 13db2854db3e..053c27707147 100644
--- a/sound/soc/intel/atom/sst/sst_pvt.c
+++ b/sound/soc/intel/atom/sst/sst_pvt.c
@@ -223,9 +223,9 @@ int sst_prepare_and_post_msg(struct intel_sst_drv *sst,
size_t mbox_data_len, const void *mbox_data, void **data,
bool large, bool fill_dsp, bool sync, bool response)
{
+ struct sst_block *block = NULL;
struct ipc_post *msg = NULL;
struct ipc_dsp_hdr dsp_hdr;
- struct sst_block *block;
int ret = 0, pvt_id;
pvt_id = sst_assign_pvt_id(sst);
diff --git a/sound/soc/intel/boards/bdw-rt5650.c b/sound/soc/intel/boards/bdw-rt5650.c
index 6c2fdb5659ed..af2f50293208 100644
--- a/sound/soc/intel/boards/bdw-rt5650.c
+++ b/sound/soc/intel/boards/bdw-rt5650.c
@@ -254,7 +254,6 @@ static struct snd_soc_dai_link bdw_rt5650_dais[] = {
.no_pcm = 1,
.dai_fmt = SND_SOC_DAIFMT_DSP_B | SND_SOC_DAIFMT_NB_NF |
SND_SOC_DAIFMT_CBS_CFS,
- .ignore_suspend = 1,
.ignore_pmdown_time = 1,
.be_hw_params_fixup = broadwell_ssp0_fixup,
.ops = &bdw_rt5650_ops,
diff --git a/sound/soc/intel/boards/bdw-rt5677.c b/sound/soc/intel/boards/bdw-rt5677.c
index 6b4b64098d36..cc41a348295e 100644
--- a/sound/soc/intel/boards/bdw-rt5677.c
+++ b/sound/soc/intel/boards/bdw-rt5677.c
@@ -340,7 +340,6 @@ static struct snd_soc_dai_link bdw_rt5677_dais[] = {
.no_pcm = 1,
.dai_fmt = SND_SOC_DAIFMT_I2S | SND_SOC_DAIFMT_NB_NF |
SND_SOC_DAIFMT_CBS_CFS,
- .ignore_suspend = 1,
.ignore_pmdown_time = 1,
.be_hw_params_fixup = broadwell_ssp0_fixup,
.ops = &bdw_rt5677_ops,
diff --git a/sound/soc/intel/boards/broadwell.c b/sound/soc/intel/boards/broadwell.c
index acb4e36682cb..f9a8336a0541 100644
--- a/sound/soc/intel/boards/broadwell.c
+++ b/sound/soc/intel/boards/broadwell.c
@@ -217,7 +217,6 @@ static struct snd_soc_dai_link broadwell_rt286_dais[] = {
.init = broadwell_rt286_codec_init,
.dai_fmt = SND_SOC_DAIFMT_I2S | SND_SOC_DAIFMT_NB_NF |
SND_SOC_DAIFMT_CBS_CFS,
- .ignore_suspend = 1,
.ignore_pmdown_time = 1,
.be_hw_params_fixup = broadwell_ssp0_fixup,
.ops = &broadwell_rt286_ops,
diff --git a/sound/soc/intel/boards/bytcr_rt5640.c b/sound/soc/intel/boards/bytcr_rt5640.c
index 33fb8ea4e5cb..08f4ae964b02 100644
--- a/sound/soc/intel/boards/bytcr_rt5640.c
+++ b/sound/soc/intel/boards/bytcr_rt5640.c
@@ -591,6 +591,17 @@ static const struct dmi_system_id byt_rt5640_quirk_table[] = {
BYT_RT5640_SSP0_AIF1 |
BYT_RT5640_MCLK_EN),
},
+ {
+ /* MPMAN MPWIN895CL */
+ .matches = {
+ DMI_EXACT_MATCH(DMI_SYS_VENDOR, "MPMAN"),
+ DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "MPWIN8900CL"),
+ },
+ .driver_data = (void *)(BYTCR_INPUT_DEFAULTS |
+ BYT_RT5640_MONO_SPEAKER |
+ BYT_RT5640_SSP0_AIF1 |
+ BYT_RT5640_MCLK_EN),
+ },
{ /* MSI S100 tablet */
.matches = {
DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Micro-Star International Co., Ltd."),
diff --git a/sound/soc/intel/boards/haswell.c b/sound/soc/intel/boards/haswell.c
index 3ed53d7db4e6..74af090f2657 100644
--- a/sound/soc/intel/boards/haswell.c
+++ b/sound/soc/intel/boards/haswell.c
@@ -162,7 +162,6 @@ static struct snd_soc_dai_link haswell_rt5640_dais[] = {
.no_pcm = 1,
.dai_fmt = SND_SOC_DAIFMT_I2S | SND_SOC_DAIFMT_NB_NF |
SND_SOC_DAIFMT_CBS_CFS,
- .ignore_suspend = 1,
.ignore_pmdown_time = 1,
.be_hw_params_fixup = haswell_ssp0_fixup,
.ops = &haswell_rt5640_ops,
diff --git a/sound/soc/qcom/qdsp6/q6asm-dai.c b/sound/soc/qcom/qdsp6/q6asm-dai.c
index f6c7cddf08e8..125af00bba53 100644
--- a/sound/soc/qcom/qdsp6/q6asm-dai.c
+++ b/sound/soc/qcom/qdsp6/q6asm-dai.c
@@ -78,7 +78,7 @@ struct q6asm_dai_data {
};
static const struct snd_pcm_hardware q6asm_dai_hardware_capture = {
- .info = (SNDRV_PCM_INFO_MMAP |
+ .info = (SNDRV_PCM_INFO_MMAP | SNDRV_PCM_INFO_BATCH |
SNDRV_PCM_INFO_BLOCK_TRANSFER |
SNDRV_PCM_INFO_MMAP_VALID |
SNDRV_PCM_INFO_INTERLEAVED |
@@ -100,7 +100,7 @@ static const struct snd_pcm_hardware q6asm_dai_hardware_capture = {
};
static struct snd_pcm_hardware q6asm_dai_hardware_playback = {
- .info = (SNDRV_PCM_INFO_MMAP |
+ .info = (SNDRV_PCM_INFO_MMAP | SNDRV_PCM_INFO_BATCH |
SNDRV_PCM_INFO_BLOCK_TRANSFER |
SNDRV_PCM_INFO_MMAP_VALID |
SNDRV_PCM_INFO_INTERLEAVED |
diff --git a/sound/soc/soc-dai.c b/sound/soc/soc-dai.c
index 19142f6e533c..8f3cad8db89a 100644
--- a/sound/soc/soc-dai.c
+++ b/sound/soc/soc-dai.c
@@ -295,12 +295,12 @@ int snd_soc_dai_startup(struct snd_soc_dai *dai,
{
int ret = 0;
- if (!dai->started &&
+ if (!dai->started[substream->stream] &&
dai->driver->ops->startup)
ret = dai->driver->ops->startup(substream, dai);
if (ret == 0)
- dai->started = 1;
+ dai->started[substream->stream] = 1;
return ret;
}
@@ -308,11 +308,11 @@ int snd_soc_dai_startup(struct snd_soc_dai *dai,
void snd_soc_dai_shutdown(struct snd_soc_dai *dai,
struct snd_pcm_substream *substream)
{
- if (dai->started &&
+ if (dai->started[substream->stream] &&
dai->driver->ops->shutdown)
dai->driver->ops->shutdown(substream, dai);
- dai->started = 0;
+ dai->started[substream->stream] = 0;
}
int snd_soc_dai_prepare(struct snd_soc_dai *dai,
diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c
index 04da7928c873..679ed60d850e 100644
--- a/sound/soc/soc-dapm.c
+++ b/sound/soc/soc-dapm.c
@@ -802,7 +802,13 @@ static void dapm_set_mixer_path_status(struct snd_soc_dapm_path *p, int i,
val = max - val;
p->connect = !!val;
} else {
- p->connect = 0;
+ /* since a virtual mixer has no backing registers to
+ * decide which path to connect, it will try to match
+ * with initial state. This is to ensure
+ * that the default mixer choice will be
+ * correctly powered up during initialization.
+ */
+ p->connect = invert;
}
}
diff --git a/sound/soc/soc-ops.c b/sound/soc/soc-ops.c
index 652657dc6809..55ffb34be95e 100644
--- a/sound/soc/soc-ops.c
+++ b/sound/soc/soc-ops.c
@@ -825,7 +825,7 @@ int snd_soc_get_xr_sx(struct snd_kcontrol *kcontrol,
unsigned int regbase = mc->regbase;
unsigned int regcount = mc->regcount;
unsigned int regwshift = component->val_bytes * BITS_PER_BYTE;
- unsigned int regwmask = (1<<regwshift)-1;
+ unsigned int regwmask = (1UL<<regwshift)-1;
unsigned int invert = mc->invert;
unsigned long mask = (1UL<<mc->nbits)-1;
long min = mc->min;
@@ -874,7 +874,7 @@ int snd_soc_put_xr_sx(struct snd_kcontrol *kcontrol,
unsigned int regbase = mc->regbase;
unsigned int regcount = mc->regcount;
unsigned int regwshift = component->val_bytes * BITS_PER_BYTE;
- unsigned int regwmask = (1<<regwshift)-1;
+ unsigned int regwmask = (1UL<<regwshift)-1;
unsigned int invert = mc->invert;
unsigned long mask = (1UL<<mc->nbits)-1;
long max = mc->max;
diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c
index e256d438ee68..289aebc15529 100644
--- a/sound/soc/soc-pcm.c
+++ b/sound/soc/soc-pcm.c
@@ -2324,7 +2324,8 @@ int dpcm_be_dai_trigger(struct snd_soc_pcm_runtime *fe, int stream,
switch (cmd) {
case SNDRV_PCM_TRIGGER_START:
if ((be->dpcm[stream].state != SND_SOC_DPCM_STATE_PREPARE) &&
- (be->dpcm[stream].state != SND_SOC_DPCM_STATE_STOP))
+ (be->dpcm[stream].state != SND_SOC_DPCM_STATE_STOP) &&
+ (be->dpcm[stream].state != SND_SOC_DPCM_STATE_PAUSED))
continue;
ret = dpcm_do_trigger(dpcm, be_substream, cmd);
@@ -2354,7 +2355,8 @@ int dpcm_be_dai_trigger(struct snd_soc_pcm_runtime *fe, int stream,
be->dpcm[stream].state = SND_SOC_DPCM_STATE_START;
break;
case SNDRV_PCM_TRIGGER_STOP:
- if (be->dpcm[stream].state != SND_SOC_DPCM_STATE_START)
+ if ((be->dpcm[stream].state != SND_SOC_DPCM_STATE_START) &&
+ (be->dpcm[stream].state != SND_SOC_DPCM_STATE_PAUSED))
continue;
if (!snd_soc_dpcm_can_be_free_stop(fe, be, stream))
diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c
index 1f81cd2d29cf..87f75edba3dc 100644
--- a/sound/soc/soc-topology.c
+++ b/sound/soc/soc-topology.c
@@ -362,7 +362,7 @@ static int soc_tplg_add_kcontrol(struct soc_tplg *tplg,
struct snd_soc_component *comp = tplg->comp;
return soc_tplg_add_dcontrol(comp->card->snd_card,
- comp->dev, k, NULL, comp, kcontrol);
+ comp->dev, k, comp->name_prefix, comp, kcontrol);
}
/* remove a mixer kcontrol */
diff --git a/sound/soc/sof/loader.c b/sound/soc/sof/loader.c
index 1f2e0be812bd..64af08293daa 100644
--- a/sound/soc/sof/loader.c
+++ b/sound/soc/sof/loader.c
@@ -597,7 +597,7 @@ int snd_sof_run_firmware(struct snd_sof_dev *sdev)
}
if (sdev->fw_state == SOF_FW_BOOT_COMPLETE)
- dev_info(sdev->dev, "firmware boot complete\n");
+ dev_dbg(sdev->dev, "firmware boot complete\n");
else
return -EIO; /* FW boots but fw_ready op failed */
diff --git a/sound/soc/stm/stm32_sai_sub.c b/sound/soc/stm/stm32_sai_sub.c
index 2bd280c01c33..0d0c9afd8791 100644
--- a/sound/soc/stm/stm32_sai_sub.c
+++ b/sound/soc/stm/stm32_sai_sub.c
@@ -1556,8 +1556,10 @@ static int stm32_sai_sub_probe(struct platform_device *pdev)
ret = snd_soc_register_component(&pdev->dev, &stm32_component,
&sai->cpu_dai_drv, 1);
- if (ret)
+ if (ret) {
+ snd_dmaengine_pcm_unregister(&pdev->dev);
return ret;
+ }
if (STM_SAI_PROTOCOL_IS_SPDIF(sai))
conf = &stm32_sai_pcm_config_spdif;
diff --git a/sound/usb/mixer_maps.c b/sound/usb/mixer_maps.c
index 5ebca8013840..72b575c34860 100644
--- a/sound/usb/mixer_maps.c
+++ b/sound/usb/mixer_maps.c
@@ -359,6 +359,14 @@ static const struct usbmix_name_map corsair_virtuoso_map[] = {
{ 0 }
};
+/* Some mobos shipped with a dummy HD-audio show the invalid GET_MIN/GET_MAX
+ * response for Input Gain Pad (id=19, control=12). Skip it.
+ */
+static const struct usbmix_name_map asus_rog_map[] = {
+ { 19, NULL, 12 }, /* FU, Input Gain Pad */
+ {}
+};
+
/*
* Control map entries
*/
@@ -488,6 +496,26 @@ static const struct usbmix_ctl_map usbmix_ctl_maps[] = {
.id = USB_ID(0x1b1c, 0x0a42),
.map = corsair_virtuoso_map,
},
+ { /* Gigabyte TRX40 Aorus Pro WiFi */
+ .id = USB_ID(0x0414, 0xa002),
+ .map = asus_rog_map,
+ },
+ { /* ASUS ROG Zenith II */
+ .id = USB_ID(0x0b05, 0x1916),
+ .map = asus_rog_map,
+ },
+ { /* ASUS ROG Strix */
+ .id = USB_ID(0x0b05, 0x1917),
+ .map = asus_rog_map,
+ },
+ { /* MSI TRX40 Creator */
+ .id = USB_ID(0x0db0, 0x0d64),
+ .map = asus_rog_map,
+ },
+ { /* MSI TRX40 */
+ .id = USB_ID(0x0db0, 0x543d),
+ .map = asus_rog_map,
+ },
{ 0 } /* terminator */
};
diff --git a/sound/usb/quirks-table.h b/sound/usb/quirks-table.h
index 1c8719292eee..e009d584e7d0 100644
--- a/sound/usb/quirks-table.h
+++ b/sound/usb/quirks-table.h
@@ -3592,5 +3592,47 @@ AU0828_DEVICE(0x2040, 0x7270, "Hauppauge", "HVR-950Q"),
}
}
},
+{
+ /*
+ * Pioneer DJ DJM-250MK2
+ * PCM is 8 channels out @ 48 fixed (endpoints 0x01).
+ * The output from computer to the mixer is usable.
+ *
+ * The input (phono or line to computer) is not working.
+ * It should be at endpoint 0x82 and probably also 8 channels,
+ * but it seems that it works only with Pioneer proprietary software.
+ * Even on officially supported OS, the Audacity was unable to record
+ * and Mixxx to recognize the control vinyls.
+ */
+ USB_DEVICE_VENDOR_SPEC(0x2b73, 0x0017),
+ .driver_info = (unsigned long) &(const struct snd_usb_audio_quirk) {
+ .ifnum = QUIRK_ANY_INTERFACE,
+ .type = QUIRK_COMPOSITE,
+ .data = (const struct snd_usb_audio_quirk[]) {
+ {
+ .ifnum = 0,
+ .type = QUIRK_AUDIO_FIXED_ENDPOINT,
+ .data = &(const struct audioformat) {
+ .formats = SNDRV_PCM_FMTBIT_S24_3LE,
+ .channels = 8, // outputs
+ .iface = 0,
+ .altsetting = 1,
+ .altset_idx = 1,
+ .endpoint = 0x01,
+ .ep_attr = USB_ENDPOINT_XFER_ISOC|
+ USB_ENDPOINT_SYNC_ASYNC,
+ .rates = SNDRV_PCM_RATE_48000,
+ .rate_min = 48000,
+ .rate_max = 48000,
+ .nr_rates = 1,
+ .rate_table = (unsigned int[]) { 48000 }
+ }
+ },
+ {
+ .ifnum = -1
+ }
+ }
+ }
+},
#undef USB_DEVICE_VENDOR_SPEC
diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c
index 86f192a3043d..a8ece1701068 100644
--- a/sound/usb/quirks.c
+++ b/sound/usb/quirks.c
@@ -1827,6 +1827,7 @@ struct registration_quirk {
static const struct registration_quirk registration_quirks[] = {
REG_QUIRK_ENTRY(0x0951, 0x16d8, 2), /* Kingston HyperX AMP */
+ REG_QUIRK_ENTRY(0x0951, 0x16ed, 2), /* Kingston HyperX Cloud Alpha S */
{ 0 } /* terminator */
};
diff --git a/tools/laptop/freefall/freefall.c b/tools/laptop/freefall/freefall.c
index d29a86cda87f..d77d7861787c 100644
--- a/tools/laptop/freefall/freefall.c
+++ b/tools/laptop/freefall/freefall.c
@@ -4,7 +4,7 @@
* Copyright 2008 Eric Piel
* Copyright 2009 Pavel Machek <pavel@ucw.cz>
* Copyright 2012 Sonal Santan
- * Copyright 2014 Pali Rohár <pali.rohar@gmail.com>
+ * Copyright 2014 Pali Rohár <pali@kernel.org>
*/
#include <stdio.h>
diff --git a/tools/lib/rbtree.c b/tools/lib/rbtree.c
index 2548ff8c4d9c..06ac7bd2144b 100644
--- a/tools/lib/rbtree.c
+++ b/tools/lib/rbtree.c
@@ -497,7 +497,7 @@ struct rb_node *rb_next(const struct rb_node *node)
if (node->rb_right) {
node = node->rb_right;
while (node->rb_left)
- node=node->rb_left;
+ node = node->rb_left;
return (struct rb_node *)node;
}
@@ -528,7 +528,7 @@ struct rb_node *rb_prev(const struct rb_node *node)
if (node->rb_left) {
node = node->rb_left;
while (node->rb_right)
- node=node->rb_right;
+ node = node->rb_right;
return (struct rb_node *)node;
}
diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile
index ee08aeff30a1..f591c4d1b6fe 100644
--- a/tools/objtool/Makefile
+++ b/tools/objtool/Makefile
@@ -3,9 +3,15 @@ include ../scripts/Makefile.include
include ../scripts/Makefile.arch
# always use the host compiler
+ifneq ($(LLVM),)
+HOSTAR ?= llvm-ar
+HOSTCC ?= clang
+HOSTLD ?= ld.lld
+else
HOSTAR ?= ar
HOSTCC ?= gcc
HOSTLD ?= ld
+endif
AR = $(HOSTAR)
CC = $(HOSTCC)
LD = $(HOSTLD)
diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild
index dbebf05f5931..47f9cc9dcd94 100644
--- a/tools/testing/nvdimm/Kbuild
+++ b/tools/testing/nvdimm/Kbuild
@@ -21,8 +21,8 @@ DRIVERS := ../../../drivers
NVDIMM_SRC := $(DRIVERS)/nvdimm
ACPI_SRC := $(DRIVERS)/acpi/nfit
DAX_SRC := $(DRIVERS)/dax
-ccflags-y := -I$(src)/$(NVDIMM_SRC)/
-ccflags-y += -I$(src)/$(ACPI_SRC)/
+ccflags-y := -I$(srctree)/drivers/nvdimm/
+ccflags-y += -I$(srctree)/drivers/acpi/nfit/
obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o
obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o
diff --git a/tools/testing/nvdimm/test/Kbuild b/tools/testing/nvdimm/test/Kbuild
index fb3c3d7cdb9b..75baebf8f4ba 100644
--- a/tools/testing/nvdimm/test/Kbuild
+++ b/tools/testing/nvdimm/test/Kbuild
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
-ccflags-y := -I$(src)/../../../../drivers/nvdimm/
-ccflags-y += -I$(src)/../../../../drivers/acpi/nfit/
+ccflags-y := -I$(srctree)/drivers/nvdimm/
+ccflags-y += -I$(srctree)/drivers/acpi/nfit/
obj-m += nfit_test.o
obj-m += nfit_test_iomap.o
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index bf6422a6af7f..a8ee5c4d41eb 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -3164,7 +3164,9 @@ static __init int nfit_test_init(void)
mcsafe_test();
dax_pmem_test();
dax_pmem_core_test();
+#ifdef CONFIG_DEV_DAX_PMEM_COMPAT
dax_pmem_compat_test();
+#endif
nfit_test_setup(nfit_test_lookup, nfit_test_evaluate_dsm);
diff --git a/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c
index 37a04dab56f0..11eee0b60040 100644
--- a/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c
+++ b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c
@@ -7,13 +7,14 @@
#include <pthread.h>
#include <sys/epoll.h>
#include <sys/socket.h>
+#include <sys/eventfd.h>
#include "../../kselftest_harness.h"
struct epoll_mtcontext
{
int efd[3];
int sfd[4];
- int count;
+ volatile int count;
pthread_t main;
pthread_t waiter;
@@ -3071,4 +3072,68 @@ TEST(epoll58)
close(ctx.sfd[3]);
}
+static void *epoll59_thread(void *ctx_)
+{
+ struct epoll_mtcontext *ctx = ctx_;
+ struct epoll_event e;
+ int i;
+
+ for (i = 0; i < 100000; i++) {
+ while (ctx->count == 0)
+ ;
+
+ e.events = EPOLLIN | EPOLLERR | EPOLLET;
+ epoll_ctl(ctx->efd[0], EPOLL_CTL_MOD, ctx->sfd[0], &e);
+ ctx->count = 0;
+ }
+
+ return NULL;
+}
+
+/*
+ * t0
+ * (p) \
+ * e0
+ * (et) /
+ * e0
+ *
+ * Based on https://bugzilla.kernel.org/show_bug.cgi?id=205933
+ */
+TEST(epoll59)
+{
+ pthread_t emitter;
+ struct pollfd pfd;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+ int i, ret;
+
+ signal(SIGUSR1, signal_handler);
+
+ ctx.efd[0] = epoll_create1(0);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.sfd[0] = eventfd(1, 0);
+ ASSERT_GE(ctx.sfd[0], 0);
+
+ e.events = EPOLLIN | EPOLLERR | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ ASSERT_EQ(pthread_create(&emitter, NULL, epoll59_thread, &ctx), 0);
+
+ for (i = 0; i < 100000; i++) {
+ ret = epoll_wait(ctx.efd[0], &e, 1, 1000);
+ ASSERT_GT(ret, 0);
+
+ while (ctx.count != 0)
+ ;
+ ctx.count = 1;
+ }
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+ close(ctx.efd[0]);
+ close(ctx.sfd[0]);
+}
+
TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/kmod/kmod.sh b/tools/testing/selftests/kmod/kmod.sh
index 8b944cf042f6..3702dbcc90a7 100755
--- a/tools/testing/selftests/kmod/kmod.sh
+++ b/tools/testing/selftests/kmod/kmod.sh
@@ -61,6 +61,8 @@ ALL_TESTS="$ALL_TESTS 0006:10:1"
ALL_TESTS="$ALL_TESTS 0007:5:1"
ALL_TESTS="$ALL_TESTS 0008:150:1"
ALL_TESTS="$ALL_TESTS 0009:150:1"
+ALL_TESTS="$ALL_TESTS 0010:1:1"
+ALL_TESTS="$ALL_TESTS 0011:1:1"
# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4
@@ -149,6 +151,7 @@ function load_req_mod()
test_finish()
{
+ echo "$MODPROBE" > /proc/sys/kernel/modprobe
echo "Test completed"
}
@@ -443,6 +446,30 @@ kmod_test_0009()
config_expect_result ${FUNCNAME[0]} SUCCESS
}
+kmod_test_0010()
+{
+ kmod_defaults_driver
+ config_num_threads 1
+ echo "/KMOD_TEST_NONEXISTENT" > /proc/sys/kernel/modprobe
+ config_trigger ${FUNCNAME[0]}
+ config_expect_result ${FUNCNAME[0]} -ENOENT
+ echo "$MODPROBE" > /proc/sys/kernel/modprobe
+}
+
+kmod_test_0011()
+{
+ kmod_defaults_driver
+ config_num_threads 1
+ # This causes the kernel to not even try executing modprobe. The error
+ # code is still -ENOENT like when modprobe doesn't exist, so we can't
+ # easily test for the exact difference. But this still is a useful test
+ # since there was a bug where request_module() returned 0 in this case.
+ echo > /proc/sys/kernel/modprobe
+ config_trigger ${FUNCNAME[0]}
+ config_expect_result ${FUNCNAME[0]} -ENOENT
+ echo "$MODPROBE" > /proc/sys/kernel/modprobe
+}
+
list_tests()
{
echo "Test ID list:"
@@ -460,6 +487,8 @@ list_tests()
echo "0007 x $(get_test_count 0007) - multithreaded tests with default setup test request_module() and get_fs_type()"
echo "0008 x $(get_test_count 0008) - multithreaded - push kmod_concurrent over max_modprobes for request_module()"
echo "0009 x $(get_test_count 0009) - multithreaded - push kmod_concurrent over max_modprobes for get_fs_type()"
+ echo "0010 x $(get_test_count 0010) - test nonexistent modprobe path"
+ echo "0011 x $(get_test_count 0011) - test completely disabling module autoloading"
}
usage()
@@ -505,18 +534,23 @@ function test_num()
fi
}
-function get_test_count()
+function get_test_data()
{
test_num $1
- TEST_DATA=$(echo $ALL_TESTS | awk '{print $'$1'}')
+ local field_num=$(echo $1 | sed 's/^0*//')
+ echo $ALL_TESTS | awk '{print $'$field_num'}'
+}
+
+function get_test_count()
+{
+ TEST_DATA=$(get_test_data $1)
LAST_TWO=${TEST_DATA#*:*}
echo ${LAST_TWO%:*}
}
function get_test_enabled()
{
- test_num $1
- TEST_DATA=$(echo $ALL_TESTS | awk '{print $'$1'}')
+ TEST_DATA=$(get_test_data $1)
echo ${TEST_DATA#*:*:}
}
@@ -611,6 +645,7 @@ test_reqs
allow_user_defaults
load_req_mod
+MODPROBE=$(</proc/sys/kernel/modprobe)
trap "test_finish" EXIT
parse_args $@
diff --git a/tools/testing/selftests/powerpc/eeh/eeh-basic.sh b/tools/testing/selftests/powerpc/eeh/eeh-basic.sh
index f988d2f42e8f..8a8d0f456946 100755
--- a/tools/testing/selftests/powerpc/eeh/eeh-basic.sh
+++ b/tools/testing/selftests/powerpc/eeh/eeh-basic.sh
@@ -41,6 +41,11 @@ for dev in `ls -1 /sys/bus/pci/devices/ | grep '\.0$'` ; do
continue;
fi
+ if [ "ahci" = "$(basename $(realpath /sys/bus/pci/devices/$dev/driver))" ] ; then
+ echo "$dev, Skipped: ahci doesn't support recovery"
+ continue
+ fi
+
# Don't inject errosr into an already-frozen PE. This happens with
# PEs that contain multiple PCI devices (e.g. multi-function cards)
# and injecting new errors during the recovery process will probably
diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile
index 0b0db8d3857c..5881e97c73c1 100644
--- a/tools/testing/selftests/powerpc/tm/Makefile
+++ b/tools/testing/selftests/powerpc/tm/Makefile
@@ -25,6 +25,7 @@ $(OUTPUT)/tm-unavailable: CFLAGS += -O0 -pthread -m64 -Wno-error=uninitialized -
$(OUTPUT)/tm-trap: CFLAGS += -O0 -pthread -m64
$(OUTPUT)/tm-signal-context-force-tm: CFLAGS += -pthread -m64
$(OUTPUT)/tm-signal-pagefault: CFLAGS += -pthread -m64
+$(OUTPUT)/tm-poison: CFLAGS += -m64
SIGNAL_CONTEXT_CHK_TESTS := $(patsubst %,$(OUTPUT)/%,$(SIGNAL_CONTEXT_CHK_TESTS))
$(SIGNAL_CONTEXT_CHK_TESTS): tm-signal.S
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index d3362777a425..61e5cfeb1350 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -54,6 +54,7 @@
#include <linux/userfaultfd.h>
#include <setjmp.h>
#include <stdbool.h>
+#include <assert.h>
#include "../kselftest.h"
@@ -76,6 +77,8 @@ static int test_type;
#define ALARM_INTERVAL_SECS 10
static volatile bool test_uffdio_copy_eexist = true;
static volatile bool test_uffdio_zeropage_eexist = true;
+/* Whether to test uffd write-protection */
+static bool test_uffdio_wp = false;
static bool map_shared;
static int huge_fd;
@@ -86,6 +89,13 @@ static char *area_src, *area_src_alias, *area_dst, *area_dst_alias;
static char *zeropage;
pthread_attr_t attr;
+/* Userfaultfd test statistics */
+struct uffd_stats {
+ int cpu;
+ unsigned long missing_faults;
+ unsigned long wp_faults;
+};
+
/* pthread_mutex_t starts at page offset 0 */
#define area_mutex(___area, ___nr) \
((pthread_mutex_t *) ((___area) + (___nr)*page_size))
@@ -125,6 +135,37 @@ static void usage(void)
exit(1);
}
+static void uffd_stats_reset(struct uffd_stats *uffd_stats,
+ unsigned long n_cpus)
+{
+ int i;
+
+ for (i = 0; i < n_cpus; i++) {
+ uffd_stats[i].cpu = i;
+ uffd_stats[i].missing_faults = 0;
+ uffd_stats[i].wp_faults = 0;
+ }
+}
+
+static void uffd_stats_report(struct uffd_stats *stats, int n_cpus)
+{
+ int i;
+ unsigned long long miss_total = 0, wp_total = 0;
+
+ for (i = 0; i < n_cpus; i++) {
+ miss_total += stats[i].missing_faults;
+ wp_total += stats[i].wp_faults;
+ }
+
+ printf("userfaults: %llu missing (", miss_total);
+ for (i = 0; i < n_cpus; i++)
+ printf("%lu+", stats[i].missing_faults);
+ printf("\b), %llu wp (", wp_total);
+ for (i = 0; i < n_cpus; i++)
+ printf("%lu+", stats[i].wp_faults);
+ printf("\b)\n");
+}
+
static int anon_release_pages(char *rel_area)
{
int ret = 0;
@@ -245,10 +286,15 @@ struct uffd_test_ops {
void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);
};
-#define ANON_EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \
+#define SHMEM_EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \
(1 << _UFFDIO_COPY) | \
(1 << _UFFDIO_ZEROPAGE))
+#define ANON_EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \
+ (1 << _UFFDIO_COPY) | \
+ (1 << _UFFDIO_ZEROPAGE) | \
+ (1 << _UFFDIO_WRITEPROTECT))
+
static struct uffd_test_ops anon_uffd_test_ops = {
.expected_ioctls = ANON_EXPECTED_IOCTLS,
.allocate_area = anon_allocate_area,
@@ -257,7 +303,7 @@ static struct uffd_test_ops anon_uffd_test_ops = {
};
static struct uffd_test_ops shmem_uffd_test_ops = {
- .expected_ioctls = ANON_EXPECTED_IOCTLS,
+ .expected_ioctls = SHMEM_EXPECTED_IOCTLS,
.allocate_area = shmem_allocate_area,
.release_pages = shmem_release_pages,
.alias_mapping = noop_alias_mapping,
@@ -281,6 +327,21 @@ static int my_bcmp(char *str1, char *str2, size_t n)
return 0;
}
+static void wp_range(int ufd, __u64 start, __u64 len, bool wp)
+{
+ struct uffdio_writeprotect prms = { 0 };
+
+ /* Write protection page faults */
+ prms.range.start = start;
+ prms.range.len = len;
+ /* Undo write-protect, do wakeup after that */
+ prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;
+
+ if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
+ fprintf(stderr, "clear WP failed for address 0x%Lx\n",
+ start), exit(1);
+}
+
static void *locking_thread(void *arg)
{
unsigned long cpu = (unsigned long) arg;
@@ -419,7 +480,10 @@ static int __copy_page(int ufd, unsigned long offset, bool retry)
uffdio_copy.dst = (unsigned long) area_dst + offset;
uffdio_copy.src = (unsigned long) area_src + offset;
uffdio_copy.len = page_size;
- uffdio_copy.mode = 0;
+ if (test_uffdio_wp)
+ uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
+ else
+ uffdio_copy.mode = 0;
uffdio_copy.copy = 0;
if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
/* real retval in ufdio_copy.copy */
@@ -467,8 +531,8 @@ static int uffd_read_msg(int ufd, struct uffd_msg *msg)
return 0;
}
-/* Return 1 if page fault handled by us; otherwise 0 */
-static int uffd_handle_page_fault(struct uffd_msg *msg)
+static void uffd_handle_page_fault(struct uffd_msg *msg,
+ struct uffd_stats *stats)
{
unsigned long offset;
@@ -476,25 +540,32 @@ static int uffd_handle_page_fault(struct uffd_msg *msg)
fprintf(stderr, "unexpected msg event %u\n",
msg->event), exit(1);
- if (bounces & BOUNCE_VERIFY &&
- msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
- fprintf(stderr, "unexpected write fault\n"), exit(1);
+ if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
+ wp_range(uffd, msg->arg.pagefault.address, page_size, false);
+ stats->wp_faults++;
+ } else {
+ /* Missing page faults */
+ if (bounces & BOUNCE_VERIFY &&
+ msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
+ fprintf(stderr, "unexpected write fault\n"), exit(1);
- offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
- offset &= ~(page_size-1);
+ offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
+ offset &= ~(page_size-1);
- return copy_page(uffd, offset);
+ if (copy_page(uffd, offset))
+ stats->missing_faults++;
+ }
}
static void *uffd_poll_thread(void *arg)
{
- unsigned long cpu = (unsigned long) arg;
+ struct uffd_stats *stats = (struct uffd_stats *)arg;
+ unsigned long cpu = stats->cpu;
struct pollfd pollfd[2];
struct uffd_msg msg;
struct uffdio_register uffd_reg;
int ret;
char tmp_chr;
- unsigned long userfaults = 0;
pollfd[0].fd = uffd;
pollfd[0].events = POLLIN;
@@ -524,7 +595,7 @@ static void *uffd_poll_thread(void *arg)
msg.event), exit(1);
break;
case UFFD_EVENT_PAGEFAULT:
- userfaults += uffd_handle_page_fault(&msg);
+ uffd_handle_page_fault(&msg, stats);
break;
case UFFD_EVENT_FORK:
close(uffd);
@@ -543,50 +614,67 @@ static void *uffd_poll_thread(void *arg)
break;
}
}
- return (void *)userfaults;
+
+ return NULL;
}
pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
static void *uffd_read_thread(void *arg)
{
- unsigned long *this_cpu_userfaults;
+ struct uffd_stats *stats = (struct uffd_stats *)arg;
struct uffd_msg msg;
- this_cpu_userfaults = (unsigned long *) arg;
- *this_cpu_userfaults = 0;
-
pthread_mutex_unlock(&uffd_read_mutex);
/* from here cancellation is ok */
for (;;) {
if (uffd_read_msg(uffd, &msg))
continue;
- (*this_cpu_userfaults) += uffd_handle_page_fault(&msg);
+ uffd_handle_page_fault(&msg, stats);
}
- return (void *)NULL;
+
+ return NULL;
}
static void *background_thread(void *arg)
{
unsigned long cpu = (unsigned long) arg;
- unsigned long page_nr;
+ unsigned long page_nr, start_nr, mid_nr, end_nr;
+
+ start_nr = cpu * nr_pages_per_cpu;
+ end_nr = (cpu+1) * nr_pages_per_cpu;
+ mid_nr = (start_nr + end_nr) / 2;
- for (page_nr = cpu * nr_pages_per_cpu;
- page_nr < (cpu+1) * nr_pages_per_cpu;
- page_nr++)
+ /* Copy the first half of the pages */
+ for (page_nr = start_nr; page_nr < mid_nr; page_nr++)
+ copy_page_retry(uffd, page_nr * page_size);
+
+ /*
+ * If we need to test uffd-wp, set it up now. Then we'll have
+ * at least the first half of the pages mapped already which
+ * can be write-protected for testing
+ */
+ if (test_uffdio_wp)
+ wp_range(uffd, (unsigned long)area_dst + start_nr * page_size,
+ nr_pages_per_cpu * page_size, true);
+
+ /*
+ * Continue the 2nd half of the page copying, handling write
+ * protection faults if any
+ */
+ for (page_nr = mid_nr; page_nr < end_nr; page_nr++)
copy_page_retry(uffd, page_nr * page_size);
return NULL;
}
-static int stress(unsigned long *userfaults)
+static int stress(struct uffd_stats *uffd_stats)
{
unsigned long cpu;
pthread_t locking_threads[nr_cpus];
pthread_t uffd_threads[nr_cpus];
pthread_t background_threads[nr_cpus];
- void **_userfaults = (void **) userfaults;
finished = 0;
for (cpu = 0; cpu < nr_cpus; cpu++) {
@@ -595,12 +683,13 @@ static int stress(unsigned long *userfaults)
return 1;
if (bounces & BOUNCE_POLL) {
if (pthread_create(&uffd_threads[cpu], &attr,
- uffd_poll_thread, (void *)cpu))
+ uffd_poll_thread,
+ (void *)&uffd_stats[cpu]))
return 1;
} else {
if (pthread_create(&uffd_threads[cpu], &attr,
uffd_read_thread,
- &_userfaults[cpu]))
+ (void *)&uffd_stats[cpu]))
return 1;
pthread_mutex_lock(&uffd_read_mutex);
}
@@ -637,7 +726,8 @@ static int stress(unsigned long *userfaults)
fprintf(stderr, "pipefd write error\n");
return 1;
}
- if (pthread_join(uffd_threads[cpu], &_userfaults[cpu]))
+ if (pthread_join(uffd_threads[cpu],
+ (void *)&uffd_stats[cpu]))
return 1;
} else {
if (pthread_cancel(uffd_threads[cpu]))
@@ -735,17 +825,31 @@ static int faulting_process(int signal_test)
}
for (nr = 0; nr < split_nr_pages; nr++) {
+ int steps = 1;
+ unsigned long offset = nr * page_size;
+
if (signal_test) {
if (sigsetjmp(*sigbuf, 1) != 0) {
- if (nr == lastnr) {
+ if (steps == 1 && nr == lastnr) {
fprintf(stderr, "Signal repeated\n");
return 1;
}
lastnr = nr;
if (signal_test == 1) {
- if (copy_page(uffd, nr * page_size))
- signalled++;
+ if (steps == 1) {
+ /* This is a MISSING request */
+ steps++;
+ if (copy_page(uffd, offset))
+ signalled++;
+ } else {
+ /* This is a WP request */
+ assert(steps == 2);
+ wp_range(uffd,
+ (__u64)area_dst +
+ offset,
+ page_size, false);
+ }
} else {
signalled++;
continue;
@@ -758,8 +862,13 @@ static int faulting_process(int signal_test)
fprintf(stderr,
"nr %lu memory corruption %Lu %Lu\n",
nr, count,
- count_verify[nr]), exit(1);
- }
+ count_verify[nr]);
+ }
+ /*
+ * Trigger write protection if there is by writting
+ * the same value back.
+ */
+ *area_count(area_dst, nr) = count;
}
if (signal_test)
@@ -781,6 +890,11 @@ static int faulting_process(int signal_test)
nr, count,
count_verify[nr]), exit(1);
}
+ /*
+ * Trigger write protection if there is by writting
+ * the same value back.
+ */
+ *area_count(area_dst, nr) = count;
}
if (uffd_test_ops->release_pages(area_dst))
@@ -884,6 +998,8 @@ static int userfaultfd_zeropage_test(void)
uffdio_register.range.start = (unsigned long) area_dst;
uffdio_register.range.len = nr_pages * page_size;
uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+ if (test_uffdio_wp)
+ uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
fprintf(stderr, "register failure\n"), exit(1);
@@ -908,11 +1024,11 @@ static int userfaultfd_events_test(void)
{
struct uffdio_register uffdio_register;
unsigned long expected_ioctls;
- unsigned long userfaults;
pthread_t uffd_mon;
int err, features;
pid_t pid;
char c;
+ struct uffd_stats stats = { 0 };
printf("testing events (fork, remap, remove): ");
fflush(stdout);
@@ -929,6 +1045,8 @@ static int userfaultfd_events_test(void)
uffdio_register.range.start = (unsigned long) area_dst;
uffdio_register.range.len = nr_pages * page_size;
uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+ if (test_uffdio_wp)
+ uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
fprintf(stderr, "register failure\n"), exit(1);
@@ -939,7 +1057,7 @@ static int userfaultfd_events_test(void)
"unexpected missing ioctl for anon memory\n"),
exit(1);
- if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, NULL))
+ if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
perror("uffd_poll_thread create"), exit(1);
pid = fork();
@@ -955,13 +1073,14 @@ static int userfaultfd_events_test(void)
if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
perror("pipe write"), exit(1);
- if (pthread_join(uffd_mon, (void **)&userfaults))
+ if (pthread_join(uffd_mon, NULL))
return 1;
close(uffd);
- printf("userfaults: %ld\n", userfaults);
- return userfaults != nr_pages;
+ uffd_stats_report(&stats, 1);
+
+ return stats.missing_faults != nr_pages;
}
static int userfaultfd_sig_test(void)
@@ -973,6 +1092,7 @@ static int userfaultfd_sig_test(void)
int err, features;
pid_t pid;
char c;
+ struct uffd_stats stats = { 0 };
printf("testing signal delivery: ");
fflush(stdout);
@@ -988,6 +1108,8 @@ static int userfaultfd_sig_test(void)
uffdio_register.range.start = (unsigned long) area_dst;
uffdio_register.range.len = nr_pages * page_size;
uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+ if (test_uffdio_wp)
+ uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
fprintf(stderr, "register failure\n"), exit(1);
@@ -1004,7 +1126,7 @@ static int userfaultfd_sig_test(void)
if (uffd_test_ops->release_pages(area_dst))
return 1;
- if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, NULL))
+ if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
perror("uffd_poll_thread create"), exit(1);
pid = fork();
@@ -1030,6 +1152,7 @@ static int userfaultfd_sig_test(void)
close(uffd);
return userfaults != 0;
}
+
static int userfaultfd_stress(void)
{
void *area;
@@ -1038,7 +1161,7 @@ static int userfaultfd_stress(void)
struct uffdio_register uffdio_register;
unsigned long cpu;
int err;
- unsigned long userfaults[nr_cpus];
+ struct uffd_stats uffd_stats[nr_cpus];
uffd_test_ops->allocate_area((void **)&area_src);
if (!area_src)
@@ -1119,6 +1242,8 @@ static int userfaultfd_stress(void)
uffdio_register.range.start = (unsigned long) area_dst;
uffdio_register.range.len = nr_pages * page_size;
uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+ if (test_uffdio_wp)
+ uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
fprintf(stderr, "register failure\n");
return 1;
@@ -1167,10 +1292,17 @@ static int userfaultfd_stress(void)
if (uffd_test_ops->release_pages(area_dst))
return 1;
+ uffd_stats_reset(uffd_stats, nr_cpus);
+
/* bounce pass */
- if (stress(userfaults))
+ if (stress(uffd_stats))
return 1;
+ /* Clear all the write protections if there is any */
+ if (test_uffdio_wp)
+ wp_range(uffd, (unsigned long)area_dst,
+ nr_pages * page_size, false);
+
/* unregister */
if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) {
fprintf(stderr, "unregister failure\n");
@@ -1209,10 +1341,7 @@ static int userfaultfd_stress(void)
area_src_alias = area_dst_alias;
area_dst_alias = tmp_area;
- printf("userfaults:");
- for (cpu = 0; cpu < nr_cpus; cpu++)
- printf(" %lu", userfaults[cpu]);
- printf("\n");
+ uffd_stats_report(uffd_stats, nr_cpus);
}
if (err)
@@ -1252,6 +1381,8 @@ static void set_test_type(const char *type)
if (!strcmp(type, "anon")) {
test_type = TEST_ANON;
uffd_test_ops = &anon_uffd_test_ops;
+ /* Only enable write-protect test for anonymous test */
+ test_uffdio_wp = true;
} else if (!strcmp(type, "hugetlb")) {
test_type = TEST_HUGETLB;
uffd_test_ops = &hugetlb_uffd_test_ops;
diff --git a/tools/thermal/tmon/tmon.c b/tools/thermal/tmon/tmon.c
index 83ec6e482f12..7eb3216a27f4 100644
--- a/tools/thermal/tmon/tmon.c
+++ b/tools/thermal/tmon/tmon.c
@@ -46,7 +46,7 @@ static void start_daemon_mode(void);
pthread_t event_tid;
pthread_mutex_t input_lock;
-void usage()
+void usage(void)
{
printf("Usage: tmon [OPTION...]\n");
printf(" -c, --control cooling device in control\n");
@@ -62,7 +62,7 @@ void usage()
exit(0);
}
-void version()
+void version(void)
{
printf("TMON version %s\n", VERSION);
exit(EXIT_SUCCESS);
@@ -70,7 +70,6 @@ void version()
static void tmon_cleanup(void)
{
-
syslog(LOG_INFO, "TMON exit cleanup\n");
fflush(stdout);
refresh();
@@ -96,7 +95,6 @@ static void tmon_cleanup(void)
exit(1);
}
-
static void tmon_sig_handler(int sig)
{
syslog(LOG_INFO, "TMON caught signal %d\n", sig);
@@ -120,7 +118,6 @@ static void tmon_sig_handler(int sig)
tmon_exit = true;
}
-
static void start_syslog(void)
{
if (debug_on)
@@ -167,7 +164,6 @@ static void prepare_logging(void)
return;
}
-
fprintf(tmon_log, "#----------- THERMAL SYSTEM CONFIG -------------\n");
for (i = 0; i < ptdata.nr_tz_sensor; i++) {
char binding_str[33]; /* size of long + 1 */
@@ -175,7 +171,7 @@ static void prepare_logging(void)
memset(binding_str, 0, sizeof(binding_str));
for (j = 0; j < 32; j++)
- binding_str[j] = (ptdata.tzi[i].cdev_binding & 1<<j) ?
+ binding_str[j] = (ptdata.tzi[i].cdev_binding & (1 << j)) ?
'1' : '0';
fprintf(tmon_log, "#thermal zone %s%02d cdevs binding: %32s\n",
@@ -187,7 +183,6 @@ static void prepare_logging(void)
trip_type_name[ptdata.tzi[i].tp[j].type],
ptdata.tzi[i].tp[j].temp);
}
-
}
for (i = 0; i < ptdata.nr_cooling_dev; i++)
@@ -219,7 +214,6 @@ static struct option opts[] = {
{ 0, 0, NULL, 0 }
};
-
int main(int argc, char **argv)
{
int err = 0;
@@ -283,7 +277,7 @@ int main(int argc, char **argv)
if (signal(SIGINT, tmon_sig_handler) == SIG_ERR)
syslog(LOG_DEBUG, "Cannot handle SIGINT\n");
if (signal(SIGTERM, tmon_sig_handler) == SIG_ERR)
- syslog(LOG_DEBUG, "Cannot handle SIGINT\n");
+ syslog(LOG_DEBUG, "Cannot handle SIGTERM\n");
if (probe_thermal_sysfs()) {
pthread_mutex_destroy(&input_lock);
@@ -328,8 +322,7 @@ int main(int argc, char **argv)
show_cooling_device();
}
time_elapsed += ticktime;
- controller_handler(trec[0].temp[target_tz_index] / 1000,
- &yk);
+ controller_handler(trec[0].temp[target_tz_index] / 1000, &yk);
trec[0].pid_out_pct = yk;
if (!dialogue_on)
show_control_w();
@@ -340,14 +333,15 @@ int main(int argc, char **argv)
return 0;
}
-static void start_daemon_mode()
+static void start_daemon_mode(void)
{
daemon_mode = 1;
/* fork */
pid_t sid, pid = fork();
- if (pid < 0) {
+
+ if (pid < 0)
exit(EXIT_FAILURE);
- } else if (pid > 0)
+ else if (pid > 0)
/* kill parent */
exit(EXIT_SUCCESS);
@@ -366,11 +360,9 @@ static void start_daemon_mode()
if ((chdir("/")) < 0)
exit(EXIT_FAILURE);
-
sleep(10);
close(STDIN_FILENO);
close(STDOUT_FILENO);
close(STDERR_FILENO);
-
}
diff --git a/tools/virtio/Makefile b/tools/virtio/Makefile
index 8e2a908115c2..f33f32f1d208 100644
--- a/tools/virtio/Makefile
+++ b/tools/virtio/Makefile
@@ -8,7 +8,32 @@ CFLAGS += -g -O2 -Werror -Wall -I. -I../include/ -I ../../usr/include/ -Wno-poin
vpath %.c ../../drivers/virtio ../../drivers/vhost
mod:
${MAKE} -C `pwd`/../.. M=`pwd`/vhost_test V=${V}
-.PHONY: all test mod clean
+
+#oot: build vhost as an out of tree module for a distro kernel
+#no effort is taken to make it actually build or work, but tends to mostly work
+#if the distro kernel is very close to upstream
+#unsupported! this is a development tool only, don't use the
+#resulting modules in production!
+OOT_KSRC=/lib/modules/$$(uname -r)/build
+OOT_VHOST=`pwd`/../../drivers/vhost
+#Everyone depends on vhost
+#Tweak the below to enable more modules
+OOT_CONFIGS=\
+ CONFIG_VHOST=m \
+ CONFIG_VHOST_NET=n \
+ CONFIG_VHOST_SCSI=n \
+ CONFIG_VHOST_VSOCK=n
+OOT_BUILD=KCFLAGS="-I "${OOT_VHOST} ${MAKE} -C ${OOT_KSRC} V=${V}
+oot-build:
+ echo "UNSUPPORTED! Don't use the resulting modules in production!"
+ ${OOT_BUILD} M=`pwd`/vhost_test
+ ${OOT_BUILD} M=${OOT_VHOST} ${OOT_CONFIGS}
+
+oot-clean: oot-build
+oot: oot-build
+oot-clean: OOT_BUILD+=clean
+
+.PHONY: all test mod clean vhost oot oot-clean oot-build
clean:
${RM} *.o vringh_test virtio_test vhost_test/*.o vhost_test/.*.cmd \
vhost_test/Module.symvers vhost_test/modules.order *.d
diff --git a/usr/include/Makefile b/usr/include/Makefile
index a339ef325aa5..b568a95d1f62 100644
--- a/usr/include/Makefile
+++ b/usr/include/Makefile
@@ -93,7 +93,7 @@ no-header-test += asm-generic/%
extra-y := $(patsubst $(obj)/%.h,%.hdrtest, $(shell find $(obj) -name '*.h' 2>/dev/null))
-# Include the header to detect missing include guard.
+# Include the header twice to detect missing include guard.
quiet_cmd_hdrtest = HDRTEST $<
cmd_hdrtest = \
$(CC) $(c_flags) -S -o /dev/null -x c /dev/null \