Diffstat (limited to 'arch')
227 files changed, 6603 insertions, 838 deletions
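Of note in this batch is the Cortex-A520 erratum 2966298 workaround (see the Kconfig, cpucaps, cpu_errata.c and entry.S hunks below): on affected r0p0-r0p1 cores, a speculatively executed unprivileged load can leak privileged data through a cache side channel, so a TLBI is executed before returning to EL0. As a rough sketch of how other arm64 code could query the resulting capability once the series lands — the helper below is hypothetical and not part of these patches:

/* Hypothetical illustration only; not part of the patches below. */
#include <asm/cpufeature.h>

static bool a520_tlbi_workaround_active(void)
{
	/*
	 * Valid once CPU features have been finalized; the capability is
	 * set when any CPU matches Cortex-A520 r0p0-r0p1 (see the
	 * cpu_errata.c hunk below).
	 */
	return cpus_have_final_cap(ARM64_WORKAROUND_2966298);
}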
diff --git a/arch/arm/boot/dts/rockchip/rk3128.dtsi b/arch/arm/boot/dts/rockchip/rk3128.dtsi index b63bd4ad3143..88a4b0d6d928 100644 --- a/arch/arm/boot/dts/rockchip/rk3128.dtsi +++ b/arch/arm/boot/dts/rockchip/rk3128.dtsi @@ -64,7 +64,8 @@ compatible = "arm,armv7-timer"; interrupts = <GIC_PPI 13 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_HIGH)>, <GIC_PPI 14 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_HIGH)>, - <GIC_PPI 11 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_HIGH)>; + <GIC_PPI 11 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_HIGH)>, + <GIC_PPI 10 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_HIGH)>; arm,cpu-registers-not-fw-configured; clock-frequency = <24000000>; }; @@ -233,7 +234,7 @@ compatible = "rockchip,rk3128-timer", "rockchip,rk3288-timer"; reg = <0x20044000 0x20>; interrupts = <GIC_SPI 28 IRQ_TYPE_LEVEL_HIGH>; - clocks = <&cru PCLK_TIMER>, <&xin24m>; + clocks = <&cru PCLK_TIMER>, <&cru SCLK_TIMER0>; clock-names = "pclk", "timer"; }; @@ -241,7 +242,7 @@ compatible = "rockchip,rk3128-timer", "rockchip,rk3288-timer"; reg = <0x20044020 0x20>; interrupts = <GIC_SPI 29 IRQ_TYPE_LEVEL_HIGH>; - clocks = <&cru PCLK_TIMER>, <&xin24m>; + clocks = <&cru PCLK_TIMER>, <&cru SCLK_TIMER1>; clock-names = "pclk", "timer"; }; @@ -249,7 +250,7 @@ compatible = "rockchip,rk3128-timer", "rockchip,rk3288-timer"; reg = <0x20044040 0x20>; interrupts = <GIC_SPI 59 IRQ_TYPE_LEVEL_HIGH>; - clocks = <&cru PCLK_TIMER>, <&xin24m>; + clocks = <&cru PCLK_TIMER>, <&cru SCLK_TIMER2>; clock-names = "pclk", "timer"; }; @@ -257,7 +258,7 @@ compatible = "rockchip,rk3128-timer", "rockchip,rk3288-timer"; reg = <0x20044060 0x20>; interrupts = <GIC_SPI 60 IRQ_TYPE_LEVEL_HIGH>; - clocks = <&cru PCLK_TIMER>, <&xin24m>; + clocks = <&cru PCLK_TIMER>, <&cru SCLK_TIMER3>; clock-names = "pclk", "timer"; }; @@ -265,7 +266,7 @@ compatible = "rockchip,rk3128-timer", "rockchip,rk3288-timer"; reg = <0x20044080 0x20>; interrupts = <GIC_SPI 61 IRQ_TYPE_LEVEL_HIGH>; - clocks = <&cru PCLK_TIMER>, <&xin24m>; + clocks = <&cru PCLK_TIMER>, <&cru SCLK_TIMER4>; clock-names = "pclk", "timer"; }; @@ -273,7 +274,7 @@ compatible = "rockchip,rk3128-timer", "rockchip,rk3288-timer"; reg = <0x200440a0 0x20>; interrupts = <GIC_SPI 62 IRQ_TYPE_LEVEL_HIGH>; - clocks = <&cru PCLK_TIMER>, <&xin24m>; + clocks = <&cru PCLK_TIMER>, <&cru SCLK_TIMER5>; clock-names = "pclk", "timer"; }; @@ -426,7 +427,7 @@ i2c0: i2c@20072000 { compatible = "rockchip,rk3128-i2c", "rockchip,rk3288-i2c"; - reg = <20072000 0x1000>; + reg = <0x20072000 0x1000>; interrupts = <GIC_SPI 24 IRQ_TYPE_LEVEL_HIGH>; clock-names = "i2c"; clocks = <&cru PCLK_I2C0>; @@ -458,6 +459,7 @@ interrupts = <GIC_SPI 0 IRQ_TYPE_LEVEL_HIGH>, <GIC_SPI 1 IRQ_TYPE_LEVEL_HIGH>; arm,pl330-broken-no-flushp; + arm,pl330-periph-burst; clocks = <&cru ACLK_DMAC>; clock-names = "apb_pclk"; #dma-cells = <1>; diff --git a/arch/arm/boot/dts/ti/omap/motorola-mapphone-common.dtsi b/arch/arm/boot/dts/ti/omap/motorola-mapphone-common.dtsi index 091ba310053e..d2d516d113ba 100644 --- a/arch/arm/boot/dts/ti/omap/motorola-mapphone-common.dtsi +++ b/arch/arm/boot/dts/ti/omap/motorola-mapphone-common.dtsi @@ -614,12 +614,12 @@ /* Configure pwm clock source for timers 8 & 9 */ &timer8 { assigned-clocks = <&abe_clkctrl OMAP4_TIMER8_CLKCTRL 24>; - assigned-clock-parents = <&sys_clkin_ck>; + assigned-clock-parents = <&sys_32k_ck>; }; &timer9 { assigned-clocks = <&l4_per_clkctrl OMAP4_TIMER9_CLKCTRL 24>; - assigned-clock-parents = <&sys_clkin_ck>; + assigned-clock-parents = <&sys_32k_ck>; }; /* @@ -640,6 +640,7 @@ &uart3 { interrupts-extended 
= <&wakeupgen GIC_SPI 74 IRQ_TYPE_LEVEL_HIGH &omap4_pmx_core 0x17c>; + overrun-throttle-ms = <500>; }; &uart4 { diff --git a/arch/arm/boot/dts/ti/omap/omap3-cpu-thermal.dtsi b/arch/arm/boot/dts/ti/omap/omap3-cpu-thermal.dtsi index 0da759f8e2c2..7dd2340bc5e4 100644 --- a/arch/arm/boot/dts/ti/omap/omap3-cpu-thermal.dtsi +++ b/arch/arm/boot/dts/ti/omap/omap3-cpu-thermal.dtsi @@ -12,8 +12,7 @@ cpu_thermal: cpu-thermal { polling-delay = <1000>; /* milliseconds */ coefficients = <0 20000>; - /* sensor ID */ - thermal-sensors = <&bandgap 0>; + thermal-sensors = <&bandgap>; cpu_trips: trips { cpu_alert0: cpu_alert { diff --git a/arch/arm/boot/dts/ti/omap/omap4-cpu-thermal.dtsi b/arch/arm/boot/dts/ti/omap/omap4-cpu-thermal.dtsi index 801b4f10350c..d484ec1e4fd8 100644 --- a/arch/arm/boot/dts/ti/omap/omap4-cpu-thermal.dtsi +++ b/arch/arm/boot/dts/ti/omap/omap4-cpu-thermal.dtsi @@ -12,7 +12,10 @@ cpu_thermal: cpu_thermal { polling-delay-passive = <250>; /* milliseconds */ polling-delay = <1000>; /* milliseconds */ - /* sensor ID */ + /* + * See 44xx files for single sensor addressing, omap5 and dra7 need + * also sensor ID for addressing. + */ thermal-sensors = <&bandgap 0>; cpu_trips: trips { diff --git a/arch/arm/boot/dts/ti/omap/omap4-l4-abe.dtsi b/arch/arm/boot/dts/ti/omap/omap4-l4-abe.dtsi index 7ae8b620515c..59f546a278f8 100644 --- a/arch/arm/boot/dts/ti/omap/omap4-l4-abe.dtsi +++ b/arch/arm/boot/dts/ti/omap/omap4-l4-abe.dtsi @@ -109,6 +109,8 @@ reg = <0x0 0xff>, /* MPU private access */ <0x49022000 0xff>; /* L3 Interconnect */ reg-names = "mpu", "dma"; + clocks = <&abe_clkctrl OMAP4_MCBSP1_CLKCTRL 24>; + clock-names = "fck"; interrupts = <GIC_SPI 17 IRQ_TYPE_LEVEL_HIGH>; interrupt-names = "common"; ti,buffer-size = <128>; @@ -142,6 +144,8 @@ reg = <0x0 0xff>, /* MPU private access */ <0x49024000 0xff>; /* L3 Interconnect */ reg-names = "mpu", "dma"; + clocks = <&abe_clkctrl OMAP4_MCBSP2_CLKCTRL 24>; + clock-names = "fck"; interrupts = <GIC_SPI 22 IRQ_TYPE_LEVEL_HIGH>; interrupt-names = "common"; ti,buffer-size = <128>; @@ -175,6 +179,8 @@ reg = <0x0 0xff>, /* MPU private access */ <0x49026000 0xff>; /* L3 Interconnect */ reg-names = "mpu", "dma"; + clocks = <&abe_clkctrl OMAP4_MCBSP3_CLKCTRL 24>; + clock-names = "fck"; interrupts = <GIC_SPI 23 IRQ_TYPE_LEVEL_HIGH>; interrupt-names = "common"; ti,buffer-size = <128>; diff --git a/arch/arm/boot/dts/ti/omap/omap4-l4.dtsi b/arch/arm/boot/dts/ti/omap/omap4-l4.dtsi index 46b8f9efd413..3fcef3080eae 100644 --- a/arch/arm/boot/dts/ti/omap/omap4-l4.dtsi +++ b/arch/arm/boot/dts/ti/omap/omap4-l4.dtsi @@ -2043,6 +2043,8 @@ compatible = "ti,omap4-mcbsp"; reg = <0x0 0xff>; /* L4 Interconnect */ reg-names = "mpu"; + clocks = <&l4_per_clkctrl OMAP4_MCBSP4_CLKCTRL 24>; + clock-names = "fck"; interrupts = <GIC_SPI 16 IRQ_TYPE_LEVEL_HIGH>; interrupt-names = "common"; ti,buffer-size = <128>; diff --git a/arch/arm/boot/dts/ti/omap/omap443x.dtsi b/arch/arm/boot/dts/ti/omap/omap443x.dtsi index 238aceb799f8..2104170fe2cd 100644 --- a/arch/arm/boot/dts/ti/omap/omap443x.dtsi +++ b/arch/arm/boot/dts/ti/omap/omap443x.dtsi @@ -69,6 +69,7 @@ }; &cpu_thermal { + thermal-sensors = <&bandgap>; coefficients = <0 20000>; }; diff --git a/arch/arm/boot/dts/ti/omap/omap4460.dtsi b/arch/arm/boot/dts/ti/omap/omap4460.dtsi index 1b27a862ae81..a6764750d447 100644 --- a/arch/arm/boot/dts/ti/omap/omap4460.dtsi +++ b/arch/arm/boot/dts/ti/omap/omap4460.dtsi @@ -79,6 +79,7 @@ }; &cpu_thermal { + thermal-sensors = <&bandgap>; coefficients = <348 (-9301)>; }; diff --git 
a/arch/arm/boot/dts/ti/omap/omap5-l4-abe.dtsi b/arch/arm/boot/dts/ti/omap/omap5-l4-abe.dtsi index a03bca5a3584..97b0c3b5f573 100644 --- a/arch/arm/boot/dts/ti/omap/omap5-l4-abe.dtsi +++ b/arch/arm/boot/dts/ti/omap/omap5-l4-abe.dtsi @@ -109,6 +109,8 @@ reg = <0x0 0xff>, /* MPU private access */ <0x49022000 0xff>; /* L3 Interconnect */ reg-names = "mpu", "dma"; + clocks = <&abe_clkctrl OMAP5_MCBSP1_CLKCTRL 24>; + clock-names = "fck"; interrupts = <GIC_SPI 17 IRQ_TYPE_LEVEL_HIGH>; interrupt-names = "common"; ti,buffer-size = <128>; @@ -142,6 +144,8 @@ reg = <0x0 0xff>, /* MPU private access */ <0x49024000 0xff>; /* L3 Interconnect */ reg-names = "mpu", "dma"; + clocks = <&abe_clkctrl OMAP5_MCBSP2_CLKCTRL 24>; + clock-names = "fck"; interrupts = <GIC_SPI 22 IRQ_TYPE_LEVEL_HIGH>; interrupt-names = "common"; ti,buffer-size = <128>; @@ -175,6 +179,8 @@ reg = <0x0 0xff>, /* MPU private access */ <0x49026000 0xff>; /* L3 Interconnect */ reg-names = "mpu", "dma"; + clocks = <&abe_clkctrl OMAP5_MCBSP3_CLKCTRL 24>; + clock-names = "fck"; interrupts = <GIC_SPI 23 IRQ_TYPE_LEVEL_HIGH>; interrupt-names = "common"; ti,buffer-size = <128>; diff --git a/arch/arm/include/asm/hardware/locomo.h b/arch/arm/include/asm/hardware/locomo.h index 246a3de25931..aaaedafef7cc 100644 --- a/arch/arm/include/asm/hardware/locomo.h +++ b/arch/arm/include/asm/hardware/locomo.h @@ -195,7 +195,7 @@ struct locomo_driver { #define LOCOMO_DRIVER_NAME(_ldev) ((_ldev)->dev.driver->name) -void locomo_lcd_power(struct locomo_dev *, int, unsigned int); +extern void locomolcd_power(int on); int locomo_driver_register(struct locomo_driver *); void locomo_driver_unregister(struct locomo_driver *); diff --git a/arch/arm/mach-omap1/board-ams-delta.c b/arch/arm/mach-omap1/board-ams-delta.c index 9808cd27e2cf..67de96c7717d 100644 --- a/arch/arm/mach-omap1/board-ams-delta.c +++ b/arch/arm/mach-omap1/board-ams-delta.c @@ -550,6 +550,7 @@ static struct platform_device *ams_delta_devices[] __initdata = { &ams_delta_nand_device, &ams_delta_lcd_device, &cx20442_codec_device, + &modem_nreset_device, }; static struct gpiod_lookup_table *ams_delta_gpio_tables[] __initdata = { @@ -782,26 +783,28 @@ static struct plat_serial8250_port ams_delta_modem_ports[] = { { }, }; +static int ams_delta_modem_pm_activate(struct device *dev) +{ + modem_priv.regulator = regulator_get(dev, "RESET#"); + if (IS_ERR(modem_priv.regulator)) + return -EPROBE_DEFER; + + return 0; +} + +static struct dev_pm_domain ams_delta_modem_pm_domain = { + .activate = ams_delta_modem_pm_activate, +}; + static struct platform_device ams_delta_modem_device = { .name = "serial8250", .id = PLAT8250_DEV_PLATFORM1, .dev = { .platform_data = ams_delta_modem_ports, + .pm_domain = &ams_delta_modem_pm_domain, }, }; -static int __init modem_nreset_init(void) -{ - int err; - - err = platform_device_register(&modem_nreset_device); - if (err) - pr_err("Couldn't register the modem regulator device\n"); - - return err; -} - - /* * This function expects MODEM IRQ number already assigned to the port. * The MODEM device requires its RESET# pin kept high during probe. @@ -833,37 +836,6 @@ static int __init ams_delta_modem_init(void) } arch_initcall_sync(ams_delta_modem_init); -static int __init late_init(void) -{ - int err; - - err = modem_nreset_init(); - if (err) - return err; - - /* - * Once the modem device is registered, the modem_nreset - * regulator can be requested on behalf of that device. 
- */ - modem_priv.regulator = regulator_get(&ams_delta_modem_device.dev, - "RESET#"); - if (IS_ERR(modem_priv.regulator)) { - err = PTR_ERR(modem_priv.regulator); - goto unregister; - } - return 0; - -unregister: - platform_device_unregister(&ams_delta_modem_device); - return err; -} - -static void __init ams_delta_init_late(void) -{ - omap1_init_late(); - late_init(); -} - static void __init ams_delta_map_io(void) { omap1_map_io(); @@ -877,7 +849,7 @@ MACHINE_START(AMS_DELTA, "Amstrad E3 (Delta)") .init_early = omap1_init_early, .init_irq = omap1_init_irq, .init_machine = ams_delta_init, - .init_late = ams_delta_init_late, + .init_late = omap1_init_late, .init_time = omap1_timer_init, .restart = omap1_restart, MACHINE_END diff --git a/arch/arm/mach-omap1/timer32k.c b/arch/arm/mach-omap1/timer32k.c index 410d17d1d443..f618a6df2938 100644 --- a/arch/arm/mach-omap1/timer32k.c +++ b/arch/arm/mach-omap1/timer32k.c @@ -176,17 +176,18 @@ static u64 notrace omap_32k_read_sched_clock(void) return sync32k_cnt_reg ? readl_relaxed(sync32k_cnt_reg) : 0; } +static struct timespec64 persistent_ts; +static cycles_t cycles; +static unsigned int persistent_mult, persistent_shift; + /** * omap_read_persistent_clock64 - Return time from a persistent clock. + * @ts: &struct timespec64 for the returned time * * Reads the time from a source which isn't disabled during PM, the * 32k sync timer. Convert the cycles elapsed since last read into * nsecs and adds to a monotonically increasing timespec64. */ -static struct timespec64 persistent_ts; -static cycles_t cycles; -static unsigned int persistent_mult, persistent_shift; - static void omap_read_persistent_clock64(struct timespec64 *ts) { unsigned long long nsecs; @@ -206,10 +207,9 @@ static void omap_read_persistent_clock64(struct timespec64 *ts) /** * omap_init_clocksource_32k - setup and register counter 32k as a * kernel clocksource - * @pbase: base addr of counter_32k module - * @size: size of counter_32k to map + * @vbase: base addr of counter_32k module * - * Returns 0 upon success or negative error code upon failure. + * Returns: %0 upon success or negative error code upon failure. * */ static int __init omap_init_clocksource_32k(void __iomem *vbase) diff --git a/arch/arm/mach-omap2/omap_hwmod.c b/arch/arm/mach-omap2/omap_hwmod.c index 1e17b5f77588..ba71928c0fcb 100644 --- a/arch/arm/mach-omap2/omap_hwmod.c +++ b/arch/arm/mach-omap2/omap_hwmod.c @@ -2209,7 +2209,7 @@ int omap_hwmod_parse_module_range(struct omap_hwmod *oh, return err; pr_debug("omap_hwmod: %s %pOFn at %pR\n", - oh->name, np, &res); + oh->name, np, res); if (oh && oh->mpu_rt_idx) { omap_hwmod_fix_mpu_rt_idx(oh, np, res); diff --git a/arch/arm/mach-omap2/pm44xx.c b/arch/arm/mach-omap2/pm44xx.c index f57802f3ee3a..37b168119fe4 100644 --- a/arch/arm/mach-omap2/pm44xx.c +++ b/arch/arm/mach-omap2/pm44xx.c @@ -99,7 +99,7 @@ static int omap4_pm_suspend(void) * possible causes. 
* http://www.spinics.net/lists/arm-kernel/msg218641.html */ - pr_warn("A possible cause could be an old bootloader - try u-boot >= v2012.07\n"); + pr_debug("A possible cause could be an old bootloader - try u-boot >= v2012.07\n"); } else { pr_info("Successfully put all powerdomains to target state\n"); } @@ -257,7 +257,7 @@ int __init omap4_pm_init(void) * http://www.spinics.net/lists/arm-kernel/msg218641.html */ if (cpu_is_omap44xx()) - pr_warn("OMAP4 PM: u-boot >= v2012.07 is required for full PM support\n"); + pr_debug("OMAP4 PM: u-boot >= v2012.07 is required for full PM support\n"); ret = pwrdm_for_each(pwrdms_setup, NULL); if (ret) { diff --git a/arch/arm/mach-sa1100/include/mach/collie.h b/arch/arm/mach-sa1100/include/mach/collie.h index b7bc23ffd3c6..c95273c9567b 100644 --- a/arch/arm/mach-sa1100/include/mach/collie.h +++ b/arch/arm/mach-sa1100/include/mach/collie.h @@ -16,8 +16,6 @@ #include "hardware.h" /* Gives GPIO_MAX */ -extern void locomolcd_power(int on); - #define COLLIE_SCOOP_GPIO_BASE (GPIO_MAX + 1) #define COLLIE_GPIO_CHARGE_ON (COLLIE_SCOOP_GPIO_BASE + 0) #define COLLIE_SCP_DIAG_BOOT1 SCOOP_GPCR_PA12 diff --git a/arch/arm/mm/cache-uniphier.c b/arch/arm/mm/cache-uniphier.c index ff2881458504..84a2f17ff32d 100644 --- a/arch/arm/mm/cache-uniphier.c +++ b/arch/arm/mm/cache-uniphier.c @@ -58,11 +58,13 @@ ((op & UNIPHIER_SSCOQM_S_MASK) == UNIPHIER_SSCOQM_S_RANGE) /** - * uniphier_cache_data - UniPhier outer cache specific data + * struct uniphier_cache_data - UniPhier outer cache specific data * * @ctrl_base: virtual base address of control registers * @rev_base: virtual base address of revision registers * @op_base: virtual base address of operation registers + * @way_ctrl_base: virtual address of the way control registers for this + * SoC revision * @way_mask: each bit specifies if the way is present * @nsets: number of associativity sets * @line_size: line size in bytes diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index b10515c0200b..78f20e632712 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1037,6 +1037,19 @@ config ARM64_ERRATUM_2645198 If unsure, say Y. +config ARM64_ERRATUM_2966298 + bool "Cortex-A520: 2966298: workaround for speculatively executed unprivileged load" + default y + help + This option adds the workaround for ARM Cortex-A520 erratum 2966298. + + On an affected Cortex-A520 core, a speculatively executed unprivileged + load might leak data from a privileged level via a cache side channel. + + Work around this problem by executing a TLBI before returning to EL0. + + If unsure, say Y. 
+ config CAVIUM_ERRATUM_22375 bool "Cavium erratum 22375, 24313" default y diff --git a/arch/arm64/boot/dts/freescale/Makefile b/arch/arm64/boot/dts/freescale/Makefile index c6872b7e9471..89aee6c92576 100644 --- a/arch/arm64/boot/dts/freescale/Makefile +++ b/arch/arm64/boot/dts/freescale/Makefile @@ -66,6 +66,7 @@ dtb-$(CONFIG_ARCH_MXC) += imx8mm-mx8menlo.dtb dtb-$(CONFIG_ARCH_MXC) += imx8mm-nitrogen-r2.dtb dtb-$(CONFIG_ARCH_MXC) += imx8mm-phg.dtb dtb-$(CONFIG_ARCH_MXC) += imx8mm-phyboard-polis-rdk.dtb +dtb-$(CONFIG_ARCH_MXC) += imx8mm-prt8mm.dtb dtb-$(CONFIG_ARCH_MXC) += imx8mm-tqma8mqml-mba8mx.dtb dtb-$(CONFIG_ARCH_MXC) += imx8mm-var-som-symphony.dtb dtb-$(CONFIG_ARCH_MXC) += imx8mm-venice-gw71xx-0x.dtb diff --git a/arch/arm64/boot/dts/freescale/imx8mm-evk.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-evk.dtsi index e31ab8b4f54f..a882c86ec313 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-evk.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mm-evk.dtsi @@ -26,7 +26,7 @@ port { hdmi_connector_in: endpoint { - remote-endpoint = <&adv7533_out>; + remote-endpoint = <&adv7535_out>; }; }; }; @@ -72,6 +72,13 @@ enable-active-high; }; + reg_vddext_3v3: regulator-vddext-3v3 { + compatible = "regulator-fixed"; + regulator-name = "VDDEXT_3V3"; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + }; + backlight: backlight { compatible = "pwm-backlight"; pwms = <&pwm1 0 5000000 0>; @@ -317,15 +324,16 @@ hdmi@3d { compatible = "adi,adv7535"; - reg = <0x3d>, <0x3c>, <0x3e>, <0x3f>; - reg-names = "main", "cec", "edid", "packet"; + reg = <0x3d>; + interrupt-parent = <&gpio1>; + interrupts = <9 IRQ_TYPE_EDGE_FALLING>; adi,dsi-lanes = <4>; - - adi,input-depth = <8>; - adi,input-colorspace = "rgb"; - adi,input-clock = "1x"; - adi,input-style = <1>; - adi,input-justification = "evenly"; + avdd-supply = <&buck5_reg>; + dvdd-supply = <&buck5_reg>; + pvdd-supply = <&buck5_reg>; + a2vdd-supply = <&buck5_reg>; + v3p3-supply = <®_vddext_3v3>; + v1p2-supply = <&buck5_reg>; ports { #address-cells = <1>; @@ -334,7 +342,7 @@ port@0 { reg = <0>; - adv7533_in: endpoint { + adv7535_in: endpoint { remote-endpoint = <&dsi_out>; }; }; @@ -342,7 +350,7 @@ port@1 { reg = <1>; - adv7533_out: endpoint { + adv7535_out: endpoint { remote-endpoint = <&hdmi_connector_in>; }; }; @@ -448,7 +456,7 @@ reg = <1>; dsi_out: endpoint { - remote-endpoint = <&adv7533_in>; + remote-endpoint = <&adv7535_in>; data-lanes = <1 2 3 4>; }; }; diff --git a/arch/arm64/boot/dts/freescale/imx8mp-beacon-kit.dts b/arch/arm64/boot/dts/freescale/imx8mp-beacon-kit.dts index 06e91297fb16..acd265d8b58e 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-beacon-kit.dts +++ b/arch/arm64/boot/dts/freescale/imx8mp-beacon-kit.dts @@ -381,9 +381,10 @@ &sai3 { pinctrl-names = "default"; pinctrl-0 = <&pinctrl_sai3>; - assigned-clocks = <&clk IMX8MP_CLK_SAI3>; + assigned-clocks = <&clk IMX8MP_CLK_SAI3>, + <&clk IMX8MP_AUDIO_PLL2> ; assigned-clock-parents = <&clk IMX8MP_AUDIO_PLL2_OUT>; - assigned-clock-rates = <12288000>; + assigned-clock-rates = <12288000>, <361267200>; fsl,sai-mclk-direction-output; status = "okay"; }; diff --git a/arch/arm64/boot/dts/freescale/imx8mp.dtsi b/arch/arm64/boot/dts/freescale/imx8mp.dtsi index 6f2f50e1639c..83d907294fbc 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp.dtsi @@ -790,6 +790,12 @@ reg = <IMX8MP_POWER_DOMAIN_AUDIOMIX>; clocks = <&clk IMX8MP_CLK_AUDIO_ROOT>, <&clk IMX8MP_CLK_AUDIO_AXI>; + assigned-clocks = <&clk IMX8MP_CLK_AUDIO_AHB>, + <&clk 
IMX8MP_CLK_AUDIO_AXI_SRC>; + assigned-clock-parents = <&clk IMX8MP_SYS_PLL1_800M>, + <&clk IMX8MP_SYS_PLL1_800M>; + assigned-clock-rates = <400000000>, + <600000000>; }; pgc_gpu2d: power-domain@6 { diff --git a/arch/arm64/boot/dts/freescale/imx93-tqma9352.dtsi b/arch/arm64/boot/dts/freescale/imx93-tqma9352.dtsi index 1c71c08becde..f6e422dc2663 100644 --- a/arch/arm64/boot/dts/freescale/imx93-tqma9352.dtsi +++ b/arch/arm64/boot/dts/freescale/imx93-tqma9352.dtsi @@ -81,7 +81,7 @@ &gpio1 { pmic-irq-hog { gpio-hog; - gpios = <2 GPIO_ACTIVE_LOW>; + gpios = <3 GPIO_ACTIVE_LOW>; input; line-name = "PMIC_IRQ#"; }; diff --git a/arch/arm64/boot/dts/freescale/imx93.dtsi b/arch/arm64/boot/dts/freescale/imx93.dtsi index 6f85a05ee7e1..dcf6e4846ac9 100644 --- a/arch/arm64/boot/dts/freescale/imx93.dtsi +++ b/arch/arm64/boot/dts/freescale/imx93.dtsi @@ -185,7 +185,7 @@ #size-cells = <1>; ranges; - anomix_ns_gpr: syscon@44210000 { + aonmix_ns_gpr: syscon@44210000 { compatible = "fsl,imx93-aonmix-ns-syscfg", "syscon"; reg = <0x44210000 0x1000>; }; @@ -319,6 +319,7 @@ assigned-clock-parents = <&clk IMX93_CLK_SYS_PLL_PFD1_DIV2>; assigned-clock-rates = <40000000>; fsl,clk-source = /bits/ 8 <0>; + fsl,stop-mode = <&aonmix_ns_gpr 0x14 0>; status = "disabled"; }; @@ -591,6 +592,7 @@ assigned-clock-parents = <&clk IMX93_CLK_SYS_PLL_PFD1_DIV2>; assigned-clock-rates = <40000000>; fsl,clk-source = /bits/ 8 <0>; + fsl,stop-mode = <&wakeupmix_gpr 0x0c 2>; status = "disabled"; }; diff --git a/arch/arm64/boot/dts/mediatek/mt7622.dtsi b/arch/arm64/boot/dts/mediatek/mt7622.dtsi index 36ef2dbe8add..3ee9266fa8e9 100644 --- a/arch/arm64/boot/dts/mediatek/mt7622.dtsi +++ b/arch/arm64/boot/dts/mediatek/mt7622.dtsi @@ -905,7 +905,7 @@ status = "disabled"; }; - sata_phy: t-phy@1a243000 { + sata_phy: t-phy { compatible = "mediatek,mt7622-tphy", "mediatek,generic-tphy-v1"; #address-cells = <2>; diff --git a/arch/arm64/boot/dts/mediatek/mt7986a.dtsi b/arch/arm64/boot/dts/mediatek/mt7986a.dtsi index 68539ea788df..24eda00e320d 100644 --- a/arch/arm64/boot/dts/mediatek/mt7986a.dtsi +++ b/arch/arm64/boot/dts/mediatek/mt7986a.dtsi @@ -434,7 +434,7 @@ }; }; - pcie_phy: t-phy@11c00000 { + pcie_phy: t-phy { compatible = "mediatek,mt7986-tphy", "mediatek,generic-tphy-v2"; #address-cells = <2>; diff --git a/arch/arm64/boot/dts/mediatek/mt8195-demo.dts b/arch/arm64/boot/dts/mediatek/mt8195-demo.dts index b2485ddfd33b..5d635085fe3f 100644 --- a/arch/arm64/boot/dts/mediatek/mt8195-demo.dts +++ b/arch/arm64/boot/dts/mediatek/mt8195-demo.dts @@ -48,7 +48,7 @@ memory@40000000 { device_type = "memory"; - reg = <0 0x40000000 0 0x80000000>; + reg = <0 0x40000000 0x2 0x00000000>; }; reserved-memory { @@ -56,13 +56,8 @@ #size-cells = <2>; ranges; - /* 2 MiB reserved for ARM Trusted Firmware (BL31) */ - bl31_secmon_reserved: secmon@54600000 { - no-map; - reg = <0 0x54600000 0x0 0x200000>; - }; - - /* 12 MiB reserved for OP-TEE (BL32) + /* + * 12 MiB reserved for OP-TEE (BL32) * +-----------------------+ 0x43e0_0000 * | SHMEM 2MiB | * +-----------------------+ 0x43c0_0000 @@ -75,6 +70,34 @@ no-map; reg = <0 0x43200000 0 0x00c00000>; }; + + scp_mem: memory@50000000 { + compatible = "shared-dma-pool"; + reg = <0 0x50000000 0 0x2900000>; + no-map; + }; + + vpu_mem: memory@53000000 { + compatible = "shared-dma-pool"; + reg = <0 0x53000000 0 0x1400000>; /* 20 MB */ + }; + + /* 2 MiB reserved for ARM Trusted Firmware (BL31) */ + bl31_secmon_mem: memory@54600000 { + no-map; + reg = <0 0x54600000 0x0 0x200000>; + }; + + snd_dma_mem: memory@60000000 { + compatible 
= "shared-dma-pool"; + reg = <0 0x60000000 0 0x1100000>; + no-map; + }; + + apu_mem: memory@62000000 { + compatible = "shared-dma-pool"; + reg = <0 0x62000000 0 0x1400000>; /* 20 MB */ + }; }; }; diff --git a/arch/arm64/boot/dts/mediatek/mt8195.dtsi b/arch/arm64/boot/dts/mediatek/mt8195.dtsi index 4dbbf8fdab75..54c674c45b49 100644 --- a/arch/arm64/boot/dts/mediatek/mt8195.dtsi +++ b/arch/arm64/boot/dts/mediatek/mt8195.dtsi @@ -313,6 +313,7 @@ interrupts = <GIC_SPI 18 IRQ_TYPE_LEVEL_HIGH 0>; cpus = <&cpu0>, <&cpu1>, <&cpu2>, <&cpu3>, <&cpu4>, <&cpu5>, <&cpu6>, <&cpu7>; + status = "fail"; }; dmic_codec: dmic-codec { @@ -2957,7 +2958,7 @@ clock-names = "merge","merge_async"; power-domains = <&spm MT8195_POWER_DOMAIN_VDOSYS1>; mediatek,gce-client-reg = <&gce0 SUBSYS_1c10XXXX 0xc000 0x1000>; - mediatek,merge-mute = <1>; + mediatek,merge-mute; resets = <&vdosys1 MT8195_VDOSYS1_SW0_RST_B_MERGE0_DL_ASYNC>; }; @@ -2970,7 +2971,7 @@ clock-names = "merge","merge_async"; power-domains = <&spm MT8195_POWER_DOMAIN_VDOSYS1>; mediatek,gce-client-reg = <&gce0 SUBSYS_1c10XXXX 0xd000 0x1000>; - mediatek,merge-mute = <1>; + mediatek,merge-mute; resets = <&vdosys1 MT8195_VDOSYS1_SW0_RST_B_MERGE1_DL_ASYNC>; }; @@ -2983,7 +2984,7 @@ clock-names = "merge","merge_async"; power-domains = <&spm MT8195_POWER_DOMAIN_VDOSYS1>; mediatek,gce-client-reg = <&gce0 SUBSYS_1c10XXXX 0xe000 0x1000>; - mediatek,merge-mute = <1>; + mediatek,merge-mute; resets = <&vdosys1 MT8195_VDOSYS1_SW0_RST_B_MERGE2_DL_ASYNC>; }; @@ -2996,7 +2997,7 @@ clock-names = "merge","merge_async"; power-domains = <&spm MT8195_POWER_DOMAIN_VDOSYS1>; mediatek,gce-client-reg = <&gce0 SUBSYS_1c10XXXX 0xf000 0x1000>; - mediatek,merge-mute = <1>; + mediatek,merge-mute; resets = <&vdosys1 MT8195_VDOSYS1_SW0_RST_B_MERGE3_DL_ASYNC>; }; @@ -3009,7 +3010,7 @@ clock-names = "merge","merge_async"; power-domains = <&spm MT8195_POWER_DOMAIN_VDOSYS1>; mediatek,gce-client-reg = <&gce0 SUBSYS_1c11XXXX 0x0000 0x1000>; - mediatek,merge-fifo-en = <1>; + mediatek,merge-fifo-en; resets = <&vdosys1 MT8195_VDOSYS1_SW0_RST_B_MERGE4_DL_ASYNC>; }; diff --git a/arch/arm64/boot/dts/qcom/apq8096-db820c.dts b/arch/arm64/boot/dts/qcom/apq8096-db820c.dts index 385b178314db..3067a4091a7a 100644 --- a/arch/arm64/boot/dts/qcom/apq8096-db820c.dts +++ b/arch/arm64/boot/dts/qcom/apq8096-db820c.dts @@ -62,25 +62,23 @@ stdout-path = "serial0:115200n8"; }; - clocks { - divclk4: divclk4 { - compatible = "fixed-clock"; - #clock-cells = <0>; - clock-frequency = <32768>; - clock-output-names = "divclk4"; + div1_mclk: divclk1 { + compatible = "gpio-gate-clock"; + pinctrl-0 = <&audio_mclk>; + pinctrl-names = "default"; + clocks = <&rpmcc RPM_SMD_DIV_CLK1>; + #clock-cells = <0>; + enable-gpios = <&pm8994_gpios 15 0>; + }; - pinctrl-names = "default"; - pinctrl-0 = <&divclk4_pin_a>; - }; + divclk4: divclk4 { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <32768>; + clock-output-names = "divclk4"; - div1_mclk: divclk1 { - compatible = "gpio-gate-clock"; - pinctrl-0 = <&audio_mclk>; - pinctrl-names = "default"; - clocks = <&rpmcc RPM_SMD_DIV_CLK1>; - #clock-cells = <0>; - enable-gpios = <&pm8994_gpios 15 0>; - }; + pinctrl-names = "default"; + pinctrl-0 = <&divclk4_pin_a>; }; gpio-keys { diff --git a/arch/arm64/boot/dts/qcom/msm8996-xiaomi-common.dtsi b/arch/arm64/boot/dts/qcom/msm8996-xiaomi-common.dtsi index bcd2397eb373..06f8ff624181 100644 --- a/arch/arm64/boot/dts/qcom/msm8996-xiaomi-common.dtsi +++ b/arch/arm64/boot/dts/qcom/msm8996-xiaomi-common.dtsi @@ -11,26 +11,24 @@ 
#include <dt-bindings/pinctrl/qcom,pmic-gpio.h> / { - clocks { - divclk1_cdc: divclk1 { - compatible = "gpio-gate-clock"; - clocks = <&rpmcc RPM_SMD_DIV_CLK1>; - #clock-cells = <0>; - enable-gpios = <&pm8994_gpios 15 GPIO_ACTIVE_HIGH>; + divclk1_cdc: divclk1 { + compatible = "gpio-gate-clock"; + clocks = <&rpmcc RPM_SMD_DIV_CLK1>; + #clock-cells = <0>; + enable-gpios = <&pm8994_gpios 15 GPIO_ACTIVE_HIGH>; - pinctrl-names = "default"; - pinctrl-0 = <&divclk1_default>; - }; + pinctrl-names = "default"; + pinctrl-0 = <&divclk1_default>; + }; - divclk4: divclk4 { - compatible = "fixed-clock"; - #clock-cells = <0>; - clock-frequency = <32768>; - clock-output-names = "divclk4"; + divclk4: divclk4 { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <32768>; + clock-output-names = "divclk4"; - pinctrl-names = "default"; - pinctrl-0 = <&divclk4_pin_a>; - }; + pinctrl-names = "default"; + pinctrl-0 = <&divclk4_pin_a>; }; gpio-keys { diff --git a/arch/arm64/boot/dts/qcom/msm8996-xiaomi-gemini.dts b/arch/arm64/boot/dts/qcom/msm8996-xiaomi-gemini.dts index d1066edaea47..f8e9d90afab0 100644 --- a/arch/arm64/boot/dts/qcom/msm8996-xiaomi-gemini.dts +++ b/arch/arm64/boot/dts/qcom/msm8996-xiaomi-gemini.dts @@ -20,16 +20,14 @@ qcom,pmic-id = <0x20009 0x2000a 0x00 0x00>; qcom,board-id = <31 0>; - clocks { - divclk2_haptics: divclk2 { - compatible = "fixed-clock"; - #clock-cells = <0>; - clock-frequency = <32768>; - clock-output-names = "divclk2"; - - pinctrl-names = "default"; - pinctrl-0 = <&divclk2_pin_a>; - }; + divclk2_haptics: divclk2 { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <32768>; + clock-output-names = "divclk2"; + + pinctrl-names = "default"; + pinctrl-0 = <&divclk2_pin_a>; }; }; diff --git a/arch/arm64/boot/dts/qcom/sa8775p-pmics.dtsi b/arch/arm64/boot/dts/qcom/sa8775p-pmics.dtsi index 3c3b6287cd27..eaa43f022a65 100644 --- a/arch/arm64/boot/dts/qcom/sa8775p-pmics.dtsi +++ b/arch/arm64/boot/dts/qcom/sa8775p-pmics.dtsi @@ -173,7 +173,7 @@ compatible = "qcom,pmm8654au-gpio", "qcom,spmi-gpio"; reg = <0x8800>; gpio-controller; - gpio-ranges = <&pmm8654au_2_gpios 0 0 12>; + gpio-ranges = <&pmm8654au_1_gpios 0 0 12>; #gpio-cells = <2>; interrupt-controller; #interrupt-cells = <2>; diff --git a/arch/arm64/boot/dts/qcom/sm8150.dtsi b/arch/arm64/boot/dts/qcom/sm8150.dtsi index a7c3020a5de4..06c53000bb74 100644 --- a/arch/arm64/boot/dts/qcom/sm8150.dtsi +++ b/arch/arm64/boot/dts/qcom/sm8150.dtsi @@ -3958,7 +3958,7 @@ pdc: interrupt-controller@b220000 { compatible = "qcom,sm8150-pdc", "qcom,pdc"; - reg = <0 0x0b220000 0 0x400>; + reg = <0 0x0b220000 0 0x30000>; qcom,pdc-ranges = <0 480 94>, <94 609 31>, <125 63 1>; #interrupt-cells = <2>; diff --git a/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts b/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts index 08a3ad3e7ae9..de0a1f2af983 100644 --- a/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts +++ b/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts @@ -68,15 +68,17 @@ simple-audio-card,format = "i2s"; simple-audio-card,name = "Haikou,I2S-codec"; simple-audio-card,mclk-fs = <512>; + simple-audio-card,frame-master = <&sgtl5000_codec>; + simple-audio-card,bitclock-master = <&sgtl5000_codec>; - simple-audio-card,codec { - clocks = <&sgtl5000_clk>; + sgtl5000_codec: simple-audio-card,codec { sound-dai = <&sgtl5000>; + // Prevent the dai subsystem from overwriting the clock + // frequency. We are using a fixed-frequency oscillator. 
+ system-clock-fixed; }; simple-audio-card,cpu { - bitclock-master; - frame-master; sound-dai = <&i2s0_8ch>; }; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4.dtsi index 7dccbe8a9393..f2279aa6ca9e 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4.dtsi @@ -492,6 +492,7 @@ &i2s0 { pinctrl-0 = <&i2s0_2ch_bus>; + pinctrl-1 = <&i2s0_2ch_bus_bclk_off>; rockchip,capture-channels = <2>; rockchip,playback-channels = <2>; status = "okay"; diff --git a/arch/arm64/boot/dts/rockchip/rk3399.dtsi b/arch/arm64/boot/dts/rockchip/rk3399.dtsi index 9da0b6d77c8d..5bc2d4faeea6 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399.dtsi @@ -2457,6 +2457,16 @@ <4 RK_PA0 1 &pcfg_pull_none>; }; + i2s0_2ch_bus_bclk_off: i2s0-2ch-bus-bclk-off { + rockchip,pins = + <3 RK_PD0 RK_FUNC_GPIO &pcfg_pull_none>, + <3 RK_PD1 1 &pcfg_pull_none>, + <3 RK_PD2 1 &pcfg_pull_none>, + <3 RK_PD3 1 &pcfg_pull_none>, + <3 RK_PD7 1 &pcfg_pull_none>, + <4 RK_PA0 1 &pcfg_pull_none>; + }; + i2s0_8ch_bus: i2s0-8ch-bus { rockchip,pins = <3 RK_PD0 1 &pcfg_pull_none>, diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 5315789f4868..a789119e6483 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -636,6 +636,7 @@ CONFIG_POWER_RESET_MSM=y CONFIG_POWER_RESET_QCOM_PON=m CONFIG_POWER_RESET_XGENE=y CONFIG_POWER_RESET_SYSCON=y +CONFIG_POWER_RESET_SYSCON_POWEROFF=y CONFIG_SYSCON_REBOOT_MODE=y CONFIG_NVMEM_REBOOT_MODE=m CONFIG_BATTERY_SBS=m @@ -1175,7 +1176,6 @@ CONFIG_COMMON_CLK_S2MPS11=y CONFIG_COMMON_CLK_PWM=y CONFIG_COMMON_CLK_RS9_PCIE=y CONFIG_COMMON_CLK_VC5=y -CONFIG_COMMON_CLK_NPCM8XX=y CONFIG_COMMON_CLK_BD718XX=m CONFIG_CLK_RASPBERRYPI=m CONFIG_CLK_IMX8MM=y diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h index 4d537d56eb84..6792a1f83f2a 100644 --- a/arch/arm64/include/asm/acpi.h +++ b/arch/arm64/include/asm/acpi.h @@ -9,6 +9,7 @@ #ifndef _ASM_ACPI_H #define _ASM_ACPI_H +#include <linux/cpuidle.h> #include <linux/efi.h> #include <linux/memblock.h> #include <linux/psci.h> @@ -44,6 +45,24 @@ #define ACPI_MADT_GICC_TRBE (offsetof(struct acpi_madt_generic_interrupt, \ trbe_interrupt) + sizeof(u16)) +/* + * Arm® Functional Fixed Hardware Specification Version 1.2. 
+ * Table 2: Arm Architecture context loss flags + */ +#define CPUIDLE_CORE_CTXT BIT(0) /* Core context Lost */ + +static inline unsigned int arch_get_idle_state_flags(u32 arch_flags) +{ + if (arch_flags & CPUIDLE_CORE_CTXT) + return CPUIDLE_FLAG_TIMER_STOP; + + return 0; +} +#define arch_get_idle_state_flags arch_get_idle_state_flags + +#define CPUIDLE_TRACE_CTXT BIT(1) /* Trace context loss */ +#define CPUIDLE_GICR_CTXT BIT(2) /* GICR */ +#define CPUIDLE_GICD_CTXT BIT(3) /* GICD */ /* Basic configuration for ACPI */ #ifdef CONFIG_ACPI diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 5f6f84837a49..74d00feb62f0 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -79,6 +79,7 @@ #define ARM_CPU_PART_CORTEX_A78AE 0xD42 #define ARM_CPU_PART_CORTEX_X1 0xD44 #define ARM_CPU_PART_CORTEX_A510 0xD46 +#define ARM_CPU_PART_CORTEX_A520 0xD80 #define ARM_CPU_PART_CORTEX_A710 0xD47 #define ARM_CPU_PART_CORTEX_A715 0xD4D #define ARM_CPU_PART_CORTEX_X2 0xD48 @@ -148,6 +149,7 @@ #define MIDR_CORTEX_A78AE MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78AE) #define MIDR_CORTEX_X1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1) #define MIDR_CORTEX_A510 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A510) +#define MIDR_CORTEX_A520 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A520) #define MIDR_CORTEX_A710 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A710) #define MIDR_CORTEX_A715 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A715) #define MIDR_CORTEX_X2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X2) diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index f43a38ac1779..2ddc33d93b13 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -28,7 +28,7 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags); #define arch_make_huge_pte arch_make_huge_pte #define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT extern void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte); + pte_t *ptep, pte_t pte, unsigned long sz); #define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS extern int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 2186927b1d21..b85f46a73e21 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -346,14 +346,14 @@ */ #define __HFGRTR_EL2_RES0 (GENMASK(63, 56) | GENMASK(53, 51)) #define __HFGRTR_EL2_MASK GENMASK(49, 0) -#define __HFGRTR_EL2_nMASK (GENMASK(55, 54) | BIT(50)) +#define __HFGRTR_EL2_nMASK (GENMASK(58, 57) | GENMASK(55, 54) | BIT(50)) #define __HFGWTR_EL2_RES0 (GENMASK(63, 56) | GENMASK(53, 51) | \ BIT(46) | BIT(42) | BIT(40) | BIT(28) | \ GENMASK(26, 25) | BIT(21) | BIT(18) | \ GENMASK(15, 14) | GENMASK(10, 9) | BIT(2)) #define __HFGWTR_EL2_MASK GENMASK(49, 0) -#define __HFGWTR_EL2_nMASK (GENMASK(55, 54) | BIT(50)) +#define __HFGWTR_EL2_nMASK (GENMASK(58, 57) | GENMASK(55, 54) | BIT(50)) #define __HFGITR_EL2_RES0 GENMASK(63, 57) #define __HFGITR_EL2_MASK GENMASK(54, 0) diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index be66e94a21bd..5706e74c5578 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -730,6 +730,14 @@ const struct arm64_cpu_capabilities arm64_errata[] = { .cpu_enable = cpu_clear_bf16_from_user_emulation, }, #endif +#ifdef 
CONFIG_ARM64_ERRATUM_2966298 + { + .desc = "ARM erratum 2966298", + .capability = ARM64_WORKAROUND_2966298, + /* Cortex-A520 r0p0 - r0p1 */ + ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A520, 0, 0, 1), + }, +#endif #ifdef CONFIG_AMPERE_ERRATUM_AC03_CPU_38 { .desc = "AmpereOne erratum AC03_CPU_38", diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 6ad61de03d0a..a6030913cd58 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -428,6 +428,10 @@ alternative_else_nop_endif ldp x28, x29, [sp, #16 * 14] .if \el == 0 +alternative_if ARM64_WORKAROUND_2966298 + tlbi vale1, xzr + dsb nsh +alternative_else_nop_endif alternative_if_not ARM64_UNMAP_KERNEL_AT_EL0 ldr lr, [sp, #S_LR] add sp, sp, #PT_REGS_SIZE // restore sp diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 6615a08382f1..13ba691b848f 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -55,11 +55,6 @@ static struct irq_ops arch_timer_irq_ops = { .get_input_level = kvm_arch_timer_get_input_level, }; -static bool has_cntpoff(void) -{ - return (has_vhe() && cpus_have_final_cap(ARM64_HAS_ECV_CNTPOFF)); -} - static int nr_timers(struct kvm_vcpu *vcpu) { if (!vcpu_has_nv(vcpu)) @@ -180,7 +175,7 @@ u64 kvm_phys_timer_read(void) return timecounter->cc->read(timecounter->cc); } -static void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map) +void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map) { if (vcpu_has_nv(vcpu)) { if (is_hyp_ctxt(vcpu)) { @@ -548,8 +543,7 @@ static void timer_save_state(struct arch_timer_context *ctx) timer_set_ctl(ctx, read_sysreg_el0(SYS_CNTP_CTL)); cval = read_sysreg_el0(SYS_CNTP_CVAL); - if (!has_cntpoff()) - cval -= timer_get_offset(ctx); + cval -= timer_get_offset(ctx); timer_set_cval(ctx, cval); @@ -636,8 +630,7 @@ static void timer_restore_state(struct arch_timer_context *ctx) cval = timer_get_cval(ctx); offset = timer_get_offset(ctx); set_cntpoff(offset); - if (!has_cntpoff()) - cval += offset; + cval += offset; write_sysreg_el0(cval, SYS_CNTP_CVAL); isb(); write_sysreg_el0(timer_get_ctl(ctx), SYS_CNTP_CTL); diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index 91cc851a6c75..06185216a297 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -1042,6 +1042,8 @@ enum fg_filter_id { static const struct encoding_to_trap_config encoding_to_fgt[] __initconst = { /* HFGRTR_EL2, HFGWTR_EL2 */ + SR_FGT(SYS_PIR_EL1, HFGxTR, nPIR_EL1, 0), + SR_FGT(SYS_PIRE0_EL1, HFGxTR, nPIRE0_EL1, 0), SR_FGT(SYS_TPIDR2_EL0, HFGxTR, nTPIDR2_EL0, 0), SR_FGT(SYS_SMPRI_EL1, HFGxTR, nSMPRI_EL1, 0), SR_FGT(SYS_ACCDATA_EL1, HFGxTR, nACCDATA_EL1, 0), diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index c8e70e8ceeeb..1581df6aec87 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -39,6 +39,26 @@ static void __activate_traps(struct kvm_vcpu *vcpu) ___activate_traps(vcpu); + if (has_cntpoff()) { + struct timer_map map; + + get_timer_map(vcpu, &map); + + /* + * We're entering the guest. Reload the correct + * values from memory now that TGE is clear.
+ */ + if (map.direct_ptimer == vcpu_ptimer(vcpu)) + val = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); + if (map.direct_ptimer == vcpu_hptimer(vcpu)) + val = __vcpu_sys_reg(vcpu, CNTHP_CVAL_EL2); + + if (map.direct_ptimer) { + write_sysreg_el0(val, SYS_CNTP_CVAL); + isb(); + } + } + val = read_sysreg(cpacr_el1); val |= CPACR_ELx_TTA; val &= ~(CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN | @@ -77,6 +97,30 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) write_sysreg(HCR_HOST_VHE_FLAGS, hcr_el2); + if (has_cntpoff()) { + struct timer_map map; + u64 val, offset; + + get_timer_map(vcpu, &map); + + /* + * We're exiting the guest. Save the latest CVAL value + * to memory and apply the offset now that TGE is set. + */ + val = read_sysreg_el0(SYS_CNTP_CVAL); + if (map.direct_ptimer == vcpu_ptimer(vcpu)) + __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0) = val; + if (map.direct_ptimer == vcpu_hptimer(vcpu)) + __vcpu_sys_reg(vcpu, CNTHP_CVAL_EL2) = val; + + offset = read_sysreg_s(SYS_CNTPOFF_EL2); + + if (map.direct_ptimer && offset) { + write_sysreg_el0(val + offset, SYS_CNTP_CVAL); + isb(); + } + } + /* * ARM errata 1165522 and 1530923 require the actual execution of the * above before we can switch to the EL2/EL0 translation regime used by diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c index 0eea225fd09a..a243934c5568 100644 --- a/arch/arm64/kvm/pmu.c +++ b/arch/arm64/kvm/pmu.c @@ -39,7 +39,7 @@ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) { struct kvm_pmu_events *pmu = kvm_get_pmu_events(); - if (!kvm_arm_support_pmu_v3() || !pmu || !kvm_pmu_switch_needed(attr)) + if (!kvm_arm_support_pmu_v3() || !kvm_pmu_switch_needed(attr)) return; if (!attr->exclude_host) @@ -55,7 +55,7 @@ void kvm_clr_pmu_events(u32 clr) { struct kvm_pmu_events *pmu = kvm_get_pmu_events(); - if (!kvm_arm_support_pmu_v3() || !pmu) + if (!kvm_arm_support_pmu_v3()) return; pmu->events_host &= ~clr; diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 1a21c526e5e2..0f90026f12a0 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2267,8 +2267,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_PMMIR_EL1), trap_raz_wi }, { SYS_DESC(SYS_MAIR_EL1), access_vm_reg, reset_unknown, MAIR_EL1 }, - { SYS_DESC(SYS_PIRE0_EL1), access_vm_reg, reset_unknown, PIRE0_EL1 }, - { SYS_DESC(SYS_PIR_EL1), access_vm_reg, reset_unknown, PIR_EL1 }, + { SYS_DESC(SYS_PIRE0_EL1), NULL, reset_unknown, PIRE0_EL1 }, + { SYS_DESC(SYS_PIR_EL1), NULL, reset_unknown, PIR_EL1 }, { SYS_DESC(SYS_AMAIR_EL1), access_vm_reg, reset_amair_el1, AMAIR_EL1 }, { SYS_DESC(SYS_LORSA_EL1), trap_loregion }, diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 9c52718ea750..13fd592228b1 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -241,15 +241,8 @@ static void clear_flush(struct mm_struct *mm, flush_tlb_range(&vma, saddr, addr); } -static inline struct folio *hugetlb_swap_entry_to_folio(swp_entry_t entry) -{ - VM_BUG_ON(!is_migration_entry(entry) && !is_hwpoison_entry(entry)); - - return page_folio(pfn_to_page(swp_offset_pfn(entry))); -} - void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) + pte_t *ptep, pte_t pte, unsigned long sz) { size_t pgsize; int i; @@ -257,13 +250,10 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, unsigned long pfn, dpfn; pgprot_t hugeprot; - if (!pte_present(pte)) { - struct folio *folio; - - folio = hugetlb_swap_entry_to_folio(pte_to_swp_entry(pte)); - ncontig = 
num_contig_ptes(folio_size(folio), &pgsize); + ncontig = num_contig_ptes(sz, &pgsize); - for (i = 0; i < ncontig; i++, ptep++) + if (!pte_present(pte)) { + for (i = 0; i < ncontig; i++, ptep++, addr += pgsize) set_pte_at(mm, addr, ptep, pte); return; } @@ -273,7 +263,6 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, return; } - ncontig = find_num_contig(mm, addr, ptep, &pgsize); pfn = pte_pfn(pte); dpfn = pgsize >> PAGE_SHIFT; hugeprot = pte_pgprot(pte); @@ -571,5 +560,7 @@ pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { - set_huge_pte_at(vma->vm_mm, addr, ptep, pte); + unsigned long psize = huge_page_size(hstate_vma(vma)); + + set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize); } diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index c3f06fdef609..dea3dc89234b 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -84,6 +84,7 @@ WORKAROUND_2077057 WORKAROUND_2457168 WORKAROUND_2645198 WORKAROUND_2658417 +WORKAROUND_2966298 WORKAROUND_AMPERE_AC03_CPU_38 WORKAROUND_TRBE_OVERWRITE_FILL_MODE WORKAROUND_TSB_FLUSH_FAILURE diff --git a/arch/ia64/include/asm/cpu.h b/arch/ia64/include/asm/cpu.h index db125df9e088..642d71675ddb 100644 --- a/arch/ia64/include/asm/cpu.h +++ b/arch/ia64/include/asm/cpu.h @@ -15,9 +15,4 @@ DECLARE_PER_CPU(struct ia64_cpu, cpu_devices); DECLARE_PER_CPU(int, cpu_state); -#ifdef CONFIG_HOTPLUG_CPU -extern int arch_register_cpu(int num); -extern void arch_unregister_cpu(int); -#endif - #endif /* _ASM_IA64_CPU_H_ */ diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c index 94a848b06f15..741863a187a6 100644 --- a/arch/ia64/kernel/topology.c +++ b/arch/ia64/kernel/topology.c @@ -59,7 +59,7 @@ void __ref arch_unregister_cpu(int num) } EXPORT_SYMBOL(arch_unregister_cpu); #else -static int __init arch_register_cpu(int num) +int __init arch_register_cpu(int num) { return register_cpu(&sysfs_cpus[num].cpu, num); } diff --git a/arch/loongarch/Kbuild b/arch/loongarch/Kbuild index b01f5cdb27e0..beb8499dd8ed 100644 --- a/arch/loongarch/Kbuild +++ b/arch/loongarch/Kbuild @@ -3,5 +3,7 @@ obj-y += mm/ obj-y += net/ obj-y += vdso/ +obj-$(CONFIG_KVM) += kvm/ + # for cleaning subdir- += boot diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index e14396a2ddcb..d889a0b97bc1 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -129,6 +129,7 @@ config LOONGARCH select HAVE_KPROBES select HAVE_KPROBES_ON_FTRACE select HAVE_KRETPROBES + select HAVE_KVM select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI select HAVE_PCI @@ -263,6 +264,9 @@ config AS_HAS_LASX_EXTENSION config AS_HAS_LBT_EXTENSION def_bool $(as-instr,movscr2gr \$a0$(comma)\$scr0) +config AS_HAS_LVZ_EXTENSION + def_bool $(as-instr,hvcl 0) + menu "Kernel type and options" source "kernel/Kconfig.hz" @@ -676,3 +680,5 @@ source "kernel/power/Kconfig" source "drivers/acpi/Kconfig" endmenu + +source "arch/loongarch/kvm/Kconfig" diff --git a/arch/loongarch/configs/loongson3_defconfig b/arch/loongarch/configs/loongson3_defconfig index a3b52aaa83b3..33795e4a5bd6 100644 --- a/arch/loongarch/configs/loongson3_defconfig +++ b/arch/loongarch/configs/loongson3_defconfig @@ -66,6 +66,8 @@ CONFIG_EFI_ZBOOT=y CONFIG_EFI_GENERIC_STUB_INITRD_CMDLINE_LOADER=y CONFIG_EFI_CAPSULE_LOADER=m CONFIG_EFI_TEST=m +CONFIG_VIRTUALIZATION=y +CONFIG_KVM=m CONFIG_JUMP_LABEL=y CONFIG_MODULES=y CONFIG_MODULE_FORCE_LOAD=y diff 
--git a/arch/loongarch/include/asm/elf.h b/arch/loongarch/include/asm/elf.h index 7af0cebf28d7..b9a4ab54285c 100644 --- a/arch/loongarch/include/asm/elf.h +++ b/arch/loongarch/include/asm/elf.h @@ -111,6 +111,15 @@ #define R_LARCH_TLS_GD_HI20 98 #define R_LARCH_32_PCREL 99 #define R_LARCH_RELAX 100 +#define R_LARCH_DELETE 101 +#define R_LARCH_ALIGN 102 +#define R_LARCH_PCREL20_S2 103 +#define R_LARCH_CFA 104 +#define R_LARCH_ADD6 105 +#define R_LARCH_SUB6 106 +#define R_LARCH_ADD_ULEB128 107 +#define R_LARCH_SUB_ULEB128 108 +#define R_LARCH_64_PCREL 109 #ifndef ELF_ARCH diff --git a/arch/loongarch/include/asm/inst.h b/arch/loongarch/include/asm/inst.h index 71e1ed4165c8..008a88ead60d 100644 --- a/arch/loongarch/include/asm/inst.h +++ b/arch/loongarch/include/asm/inst.h @@ -65,6 +65,14 @@ enum reg2_op { revbd_op = 0x0f, revh2w_op = 0x10, revhd_op = 0x11, + iocsrrdb_op = 0x19200, + iocsrrdh_op = 0x19201, + iocsrrdw_op = 0x19202, + iocsrrdd_op = 0x19203, + iocsrwrb_op = 0x19204, + iocsrwrh_op = 0x19205, + iocsrwrw_op = 0x19206, + iocsrwrd_op = 0x19207, }; enum reg2i5_op { @@ -318,6 +326,13 @@ struct reg2bstrd_format { unsigned int opcode : 10; }; +struct reg2csr_format { + unsigned int rd : 5; + unsigned int rj : 5; + unsigned int csr : 14; + unsigned int opcode : 8; +}; + struct reg3_format { unsigned int rd : 5; unsigned int rj : 5; @@ -346,6 +361,7 @@ union loongarch_instruction { struct reg2i14_format reg2i14_format; struct reg2i16_format reg2i16_format; struct reg2bstrd_format reg2bstrd_format; + struct reg2csr_format reg2csr_format; struct reg3_format reg3_format; struct reg3sa2_format reg3sa2_format; }; diff --git a/arch/loongarch/include/asm/io.h b/arch/loongarch/include/asm/io.h index 0dcb36b32cb2..c486c2341b66 100644 --- a/arch/loongarch/include/asm/io.h +++ b/arch/loongarch/include/asm/io.h @@ -52,10 +52,9 @@ static inline void __iomem *ioremap_prot(phys_addr_t offset, unsigned long size, * @offset: bus address of the memory * @size: size of the resource to map */ -extern pgprot_t pgprot_wc; - #define ioremap_wc(offset, size) \ - ioremap_prot((offset), (size), pgprot_val(pgprot_wc)) + ioremap_prot((offset), (size), \ + pgprot_val(wc_enabled ? 
PAGE_KERNEL_WUC : PAGE_KERNEL_SUC)) #define ioremap_cache(offset, size) \ ioremap_prot((offset), (size), pgprot_val(PAGE_KERNEL)) diff --git a/arch/loongarch/include/asm/kvm_csr.h b/arch/loongarch/include/asm/kvm_csr.h new file mode 100644 index 000000000000..724ca8b7b401 --- /dev/null +++ b/arch/loongarch/include/asm/kvm_csr.h @@ -0,0 +1,211 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +#ifndef __ASM_LOONGARCH_KVM_CSR_H__ +#define __ASM_LOONGARCH_KVM_CSR_H__ + +#include <linux/uaccess.h> +#include <linux/kvm_host.h> +#include <asm/loongarch.h> +#include <asm/kvm_vcpu.h> + +#define gcsr_read(csr) \ +({ \ + register unsigned long __v; \ + __asm__ __volatile__( \ + " gcsrrd %[val], %[reg]\n\t" \ + : [val] "=r" (__v) \ + : [reg] "i" (csr) \ + : "memory"); \ + __v; \ +}) + +#define gcsr_write(v, csr) \ +({ \ + register unsigned long __v = v; \ + __asm__ __volatile__ ( \ + " gcsrwr %[val], %[reg]\n\t" \ + : [val] "+r" (__v) \ + : [reg] "i" (csr) \ + : "memory"); \ +}) + +#define gcsr_xchg(v, m, csr) \ +({ \ + register unsigned long __v = v; \ + __asm__ __volatile__( \ + " gcsrxchg %[val], %[mask], %[reg]\n\t" \ + : [val] "+r" (__v) \ + : [mask] "r" (m), [reg] "i" (csr) \ + : "memory"); \ + __v; \ +}) + +/* Guest CSRS read and write */ +#define read_gcsr_crmd() gcsr_read(LOONGARCH_CSR_CRMD) +#define write_gcsr_crmd(val) gcsr_write(val, LOONGARCH_CSR_CRMD) +#define read_gcsr_prmd() gcsr_read(LOONGARCH_CSR_PRMD) +#define write_gcsr_prmd(val) gcsr_write(val, LOONGARCH_CSR_PRMD) +#define read_gcsr_euen() gcsr_read(LOONGARCH_CSR_EUEN) +#define write_gcsr_euen(val) gcsr_write(val, LOONGARCH_CSR_EUEN) +#define read_gcsr_misc() gcsr_read(LOONGARCH_CSR_MISC) +#define write_gcsr_misc(val) gcsr_write(val, LOONGARCH_CSR_MISC) +#define read_gcsr_ecfg() gcsr_read(LOONGARCH_CSR_ECFG) +#define write_gcsr_ecfg(val) gcsr_write(val, LOONGARCH_CSR_ECFG) +#define read_gcsr_estat() gcsr_read(LOONGARCH_CSR_ESTAT) +#define write_gcsr_estat(val) gcsr_write(val, LOONGARCH_CSR_ESTAT) +#define read_gcsr_era() gcsr_read(LOONGARCH_CSR_ERA) +#define write_gcsr_era(val) gcsr_write(val, LOONGARCH_CSR_ERA) +#define read_gcsr_badv() gcsr_read(LOONGARCH_CSR_BADV) +#define write_gcsr_badv(val) gcsr_write(val, LOONGARCH_CSR_BADV) +#define read_gcsr_badi() gcsr_read(LOONGARCH_CSR_BADI) +#define write_gcsr_badi(val) gcsr_write(val, LOONGARCH_CSR_BADI) +#define read_gcsr_eentry() gcsr_read(LOONGARCH_CSR_EENTRY) +#define write_gcsr_eentry(val) gcsr_write(val, LOONGARCH_CSR_EENTRY) + +#define read_gcsr_asid() gcsr_read(LOONGARCH_CSR_ASID) +#define write_gcsr_asid(val) gcsr_write(val, LOONGARCH_CSR_ASID) +#define read_gcsr_pgdl() gcsr_read(LOONGARCH_CSR_PGDL) +#define write_gcsr_pgdl(val) gcsr_write(val, LOONGARCH_CSR_PGDL) +#define read_gcsr_pgdh() gcsr_read(LOONGARCH_CSR_PGDH) +#define write_gcsr_pgdh(val) gcsr_write(val, LOONGARCH_CSR_PGDH) +#define write_gcsr_pgd(val) gcsr_write(val, LOONGARCH_CSR_PGD) +#define read_gcsr_pgd() gcsr_read(LOONGARCH_CSR_PGD) +#define read_gcsr_pwctl0() gcsr_read(LOONGARCH_CSR_PWCTL0) +#define write_gcsr_pwctl0(val) gcsr_write(val, LOONGARCH_CSR_PWCTL0) +#define read_gcsr_pwctl1() gcsr_read(LOONGARCH_CSR_PWCTL1) +#define write_gcsr_pwctl1(val) gcsr_write(val, LOONGARCH_CSR_PWCTL1) +#define read_gcsr_stlbpgsize() gcsr_read(LOONGARCH_CSR_STLBPGSIZE) +#define write_gcsr_stlbpgsize(val) gcsr_write(val, LOONGARCH_CSR_STLBPGSIZE) +#define read_gcsr_rvacfg() gcsr_read(LOONGARCH_CSR_RVACFG) +#define write_gcsr_rvacfg(val) gcsr_write(val, 
LOONGARCH_CSR_RVACFG) + +#define read_gcsr_cpuid() gcsr_read(LOONGARCH_CSR_CPUID) +#define write_gcsr_cpuid(val) gcsr_write(val, LOONGARCH_CSR_CPUID) +#define read_gcsr_prcfg1() gcsr_read(LOONGARCH_CSR_PRCFG1) +#define write_gcsr_prcfg1(val) gcsr_write(val, LOONGARCH_CSR_PRCFG1) +#define read_gcsr_prcfg2() gcsr_read(LOONGARCH_CSR_PRCFG2) +#define write_gcsr_prcfg2(val) gcsr_write(val, LOONGARCH_CSR_PRCFG2) +#define read_gcsr_prcfg3() gcsr_read(LOONGARCH_CSR_PRCFG3) +#define write_gcsr_prcfg3(val) gcsr_write(val, LOONGARCH_CSR_PRCFG3) + +#define read_gcsr_kscratch0() gcsr_read(LOONGARCH_CSR_KS0) +#define write_gcsr_kscratch0(val) gcsr_write(val, LOONGARCH_CSR_KS0) +#define read_gcsr_kscratch1() gcsr_read(LOONGARCH_CSR_KS1) +#define write_gcsr_kscratch1(val) gcsr_write(val, LOONGARCH_CSR_KS1) +#define read_gcsr_kscratch2() gcsr_read(LOONGARCH_CSR_KS2) +#define write_gcsr_kscratch2(val) gcsr_write(val, LOONGARCH_CSR_KS2) +#define read_gcsr_kscratch3() gcsr_read(LOONGARCH_CSR_KS3) +#define write_gcsr_kscratch3(val) gcsr_write(val, LOONGARCH_CSR_KS3) +#define read_gcsr_kscratch4() gcsr_read(LOONGARCH_CSR_KS4) +#define write_gcsr_kscratch4(val) gcsr_write(val, LOONGARCH_CSR_KS4) +#define read_gcsr_kscratch5() gcsr_read(LOONGARCH_CSR_KS5) +#define write_gcsr_kscratch5(val) gcsr_write(val, LOONGARCH_CSR_KS5) +#define read_gcsr_kscratch6() gcsr_read(LOONGARCH_CSR_KS6) +#define write_gcsr_kscratch6(val) gcsr_write(val, LOONGARCH_CSR_KS6) +#define read_gcsr_kscratch7() gcsr_read(LOONGARCH_CSR_KS7) +#define write_gcsr_kscratch7(val) gcsr_write(val, LOONGARCH_CSR_KS7) + +#define read_gcsr_timerid() gcsr_read(LOONGARCH_CSR_TMID) +#define write_gcsr_timerid(val) gcsr_write(val, LOONGARCH_CSR_TMID) +#define read_gcsr_timercfg() gcsr_read(LOONGARCH_CSR_TCFG) +#define write_gcsr_timercfg(val) gcsr_write(val, LOONGARCH_CSR_TCFG) +#define read_gcsr_timertick() gcsr_read(LOONGARCH_CSR_TVAL) +#define write_gcsr_timertick(val) gcsr_write(val, LOONGARCH_CSR_TVAL) +#define read_gcsr_timeroffset() gcsr_read(LOONGARCH_CSR_CNTC) +#define write_gcsr_timeroffset(val) gcsr_write(val, LOONGARCH_CSR_CNTC) + +#define read_gcsr_llbctl() gcsr_read(LOONGARCH_CSR_LLBCTL) +#define write_gcsr_llbctl(val) gcsr_write(val, LOONGARCH_CSR_LLBCTL) + +#define read_gcsr_tlbidx() gcsr_read(LOONGARCH_CSR_TLBIDX) +#define write_gcsr_tlbidx(val) gcsr_write(val, LOONGARCH_CSR_TLBIDX) +#define read_gcsr_tlbrentry() gcsr_read(LOONGARCH_CSR_TLBRENTRY) +#define write_gcsr_tlbrentry(val) gcsr_write(val, LOONGARCH_CSR_TLBRENTRY) +#define read_gcsr_tlbrbadv() gcsr_read(LOONGARCH_CSR_TLBRBADV) +#define write_gcsr_tlbrbadv(val) gcsr_write(val, LOONGARCH_CSR_TLBRBADV) +#define read_gcsr_tlbrera() gcsr_read(LOONGARCH_CSR_TLBRERA) +#define write_gcsr_tlbrera(val) gcsr_write(val, LOONGARCH_CSR_TLBRERA) +#define read_gcsr_tlbrsave() gcsr_read(LOONGARCH_CSR_TLBRSAVE) +#define write_gcsr_tlbrsave(val) gcsr_write(val, LOONGARCH_CSR_TLBRSAVE) +#define read_gcsr_tlbrelo0() gcsr_read(LOONGARCH_CSR_TLBRELO0) +#define write_gcsr_tlbrelo0(val) gcsr_write(val, LOONGARCH_CSR_TLBRELO0) +#define read_gcsr_tlbrelo1() gcsr_read(LOONGARCH_CSR_TLBRELO1) +#define write_gcsr_tlbrelo1(val) gcsr_write(val, LOONGARCH_CSR_TLBRELO1) +#define read_gcsr_tlbrehi() gcsr_read(LOONGARCH_CSR_TLBREHI) +#define write_gcsr_tlbrehi(val) gcsr_write(val, LOONGARCH_CSR_TLBREHI) +#define read_gcsr_tlbrprmd() gcsr_read(LOONGARCH_CSR_TLBRPRMD) +#define write_gcsr_tlbrprmd(val) gcsr_write(val, LOONGARCH_CSR_TLBRPRMD) + +#define read_gcsr_directwin0() gcsr_read(LOONGARCH_CSR_DMWIN0) +#define 
write_gcsr_directwin0(val) gcsr_write(val, LOONGARCH_CSR_DMWIN0) +#define read_gcsr_directwin1() gcsr_read(LOONGARCH_CSR_DMWIN1) +#define write_gcsr_directwin1(val) gcsr_write(val, LOONGARCH_CSR_DMWIN1) +#define read_gcsr_directwin2() gcsr_read(LOONGARCH_CSR_DMWIN2) +#define write_gcsr_directwin2(val) gcsr_write(val, LOONGARCH_CSR_DMWIN2) +#define read_gcsr_directwin3() gcsr_read(LOONGARCH_CSR_DMWIN3) +#define write_gcsr_directwin3(val) gcsr_write(val, LOONGARCH_CSR_DMWIN3) + +/* Guest related CSRs */ +#define read_csr_gtlbc() csr_read64(LOONGARCH_CSR_GTLBC) +#define write_csr_gtlbc(val) csr_write64(val, LOONGARCH_CSR_GTLBC) +#define read_csr_trgp() csr_read64(LOONGARCH_CSR_TRGP) +#define read_csr_gcfg() csr_read64(LOONGARCH_CSR_GCFG) +#define write_csr_gcfg(val) csr_write64(val, LOONGARCH_CSR_GCFG) +#define read_csr_gstat() csr_read64(LOONGARCH_CSR_GSTAT) +#define write_csr_gstat(val) csr_write64(val, LOONGARCH_CSR_GSTAT) +#define read_csr_gintc() csr_read64(LOONGARCH_CSR_GINTC) +#define write_csr_gintc(val) csr_write64(val, LOONGARCH_CSR_GINTC) +#define read_csr_gcntc() csr_read64(LOONGARCH_CSR_GCNTC) +#define write_csr_gcntc(val) csr_write64(val, LOONGARCH_CSR_GCNTC) + +#define __BUILD_GCSR_OP(name) __BUILD_CSR_COMMON(gcsr_##name) + +__BUILD_CSR_OP(gcfg) +__BUILD_CSR_OP(gstat) +__BUILD_CSR_OP(gtlbc) +__BUILD_CSR_OP(gintc) +__BUILD_GCSR_OP(llbctl) +__BUILD_GCSR_OP(tlbidx) + +#define set_gcsr_estat(val) \ + gcsr_xchg(val, val, LOONGARCH_CSR_ESTAT) +#define clear_gcsr_estat(val) \ + gcsr_xchg(~(val), val, LOONGARCH_CSR_ESTAT) + +#define kvm_read_hw_gcsr(id) gcsr_read(id) +#define kvm_write_hw_gcsr(id, val) gcsr_write(val, id) + +#define kvm_save_hw_gcsr(csr, gid) (csr->csrs[gid] = gcsr_read(gid)) +#define kvm_restore_hw_gcsr(csr, gid) (gcsr_write(csr->csrs[gid], gid)) + +int kvm_emu_iocsr(larch_inst inst, struct kvm_run *run, struct kvm_vcpu *vcpu); + +static __always_inline unsigned long kvm_read_sw_gcsr(struct loongarch_csrs *csr, int gid) +{ + return csr->csrs[gid]; +} + +static __always_inline void kvm_write_sw_gcsr(struct loongarch_csrs *csr, int gid, unsigned long val) +{ + csr->csrs[gid] = val; +} + +static __always_inline void kvm_set_sw_gcsr(struct loongarch_csrs *csr, + int gid, unsigned long val) +{ + csr->csrs[gid] |= val; +} + +static __always_inline void kvm_change_sw_gcsr(struct loongarch_csrs *csr, + int gid, unsigned long mask, unsigned long val) +{ + unsigned long _mask = mask; + + csr->csrs[gid] &= ~_mask; + csr->csrs[gid] |= val & _mask; +} + +#endif /* __ASM_LOONGARCH_KVM_CSR_H__ */ diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h new file mode 100644 index 000000000000..11328700d4fa --- /dev/null +++ b/arch/loongarch/include/asm/kvm_host.h @@ -0,0 +1,237 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +#ifndef __ASM_LOONGARCH_KVM_HOST_H__ +#define __ASM_LOONGARCH_KVM_HOST_H__ + +#include <linux/cpumask.h> +#include <linux/hrtimer.h> +#include <linux/interrupt.h> +#include <linux/kvm.h> +#include <linux/kvm_types.h> +#include <linux/mutex.h> +#include <linux/spinlock.h> +#include <linux/threads.h> +#include <linux/types.h> + +#include <asm/inst.h> +#include <asm/kvm_mmu.h> +#include <asm/loongarch.h> + +/* Loongarch KVM register ids */ +#define KVM_GET_IOC_CSR_IDX(id) ((id & KVM_CSR_IDX_MASK) >> LOONGARCH_REG_SHIFT) +#define KVM_GET_IOC_CPUCFG_IDX(id) ((id & KVM_CPUCFG_IDX_MASK) >> LOONGARCH_REG_SHIFT) + +#define KVM_MAX_VCPUS 256 +#define 
KVM_MAX_CPUCFG_REGS 21 +/* Memory slots that are not exposed to userspace */ +#define KVM_PRIVATE_MEM_SLOTS 0 + +#define KVM_HALT_POLL_NS_DEFAULT 500000 + +struct kvm_vm_stat { + struct kvm_vm_stat_generic generic; + u64 pages; + u64 hugepages; +}; + +struct kvm_vcpu_stat { + struct kvm_vcpu_stat_generic generic; + u64 int_exits; + u64 idle_exits; + u64 cpucfg_exits; + u64 signal_exits; +}; + +struct kvm_arch_memory_slot { +}; + +struct kvm_context { + unsigned long vpid_cache; + struct kvm_vcpu *last_vcpu; +}; + +struct kvm_world_switch { + int (*exc_entry)(void); + int (*enter_guest)(struct kvm_run *run, struct kvm_vcpu *vcpu); + unsigned long page_order; +}; + +#define MAX_PGTABLE_LEVELS 4 + +struct kvm_arch { + /* Guest physical mm */ + kvm_pte_t *pgd; + unsigned long gpa_size; + unsigned long invalid_ptes[MAX_PGTABLE_LEVELS]; + unsigned int pte_shifts[MAX_PGTABLE_LEVELS]; + unsigned int root_level; + + s64 time_offset; + struct kvm_context __percpu *vmcs; +}; + +#define CSR_MAX_NUMS 0x800 + +struct loongarch_csrs { + unsigned long csrs[CSR_MAX_NUMS]; +}; + +/* Resume Flags */ +#define RESUME_HOST 0 +#define RESUME_GUEST 1 + +enum emulation_result { + EMULATE_DONE, /* no further processing */ + EMULATE_DO_MMIO, /* kvm_run filled with MMIO request */ + EMULATE_DO_IOCSR, /* handle IOCSR request */ + EMULATE_FAIL, /* can't emulate this instruction */ + EMULATE_EXCEPT, /* A guest exception has been generated */ +}; + +#define KVM_LARCH_FPU (0x1 << 0) +#define KVM_LARCH_SWCSR_LATEST (0x1 << 1) +#define KVM_LARCH_HWCSR_USABLE (0x1 << 2) + +struct kvm_vcpu_arch { + /* + * Switch pointer-to-function type to unsigned long + * for loading the value into a register directly. + */ + unsigned long host_eentry; + unsigned long guest_eentry; + + /* Pointers stored here for easy accessing from assembly code */ + int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu); + + /* Host registers preserved across guest mode execution */ + unsigned long host_sp; + unsigned long host_tp; + unsigned long host_pgd; + + /* Host CSRs are used when handling exits from guest */ + unsigned long badi; + unsigned long badv; + unsigned long host_ecfg; + unsigned long host_estat; + unsigned long host_percpu; + + /* GPRs */ + unsigned long gprs[32]; + unsigned long pc; + + /* Which auxiliary state is loaded (KVM_LARCH_*) */ + unsigned int aux_inuse; + + /* FPU state */ + struct loongarch_fpu fpu FPU_ALIGN; + + /* CSR state */ + struct loongarch_csrs *csr; + + /* GPR used as IO source/target */ + u32 io_gpr; + + /* KVM register to control count timer */ + u32 count_ctl; + struct hrtimer swtimer; + + /* Bitmask of intr that are pending */ + unsigned long irq_pending; + /* Bitmask of pending intr to be cleared */ + unsigned long irq_clear; + + /* Bitmask of exceptions that are pending */ + unsigned long exception_pending; + unsigned int esubcode; + + /* Cache for pages needed inside spinlock regions */ + struct kvm_mmu_memory_cache mmu_page_cache; + + /* vcpu's vpid */ + u64 vpid; + + /* Frequency of stable timer in MHz */ + u64 timer_mhz; + ktime_t expire; + + /* Last CPU the vCPU state was loaded on */ + int last_sched_cpu; + /* mp state */ + struct kvm_mp_state mp_state; + /* cpucfg */ + u32 cpucfg[KVM_MAX_CPUCFG_REGS]; +}; + +static inline unsigned long readl_sw_gcsr(struct loongarch_csrs *csr, int reg) +{ + return csr->csrs[reg]; +} + +static inline void writel_sw_gcsr(struct loongarch_csrs *csr, int reg, unsigned long val) +{ + csr->csrs[reg] = val; +} + +/* Debug: dump vcpu state */ +int 
kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu); + +/* MMU handling */ +void kvm_flush_tlb_all(void); +void kvm_flush_tlb_gpa(struct kvm_vcpu *vcpu, unsigned long gpa); +int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long badv, bool write); + +#define KVM_ARCH_WANT_MMU_NOTIFIER +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); +int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, bool blockable); +int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); + +static inline void update_pc(struct kvm_vcpu_arch *arch) +{ + arch->pc += 4; +} + +/* + * kvm_is_ifetch_fault() - Find whether a TLBL exception is due to ifetch fault. + * @vcpu: Virtual CPU. + * + * Returns: Whether the TLBL exception was likely due to an instruction + * fetch fault rather than a data load fault. + */ +static inline bool kvm_is_ifetch_fault(struct kvm_vcpu_arch *arch) +{ + return arch->pc == arch->badv; +} + +/* Misc */ +static inline void kvm_arch_hardware_unsetup(void) {} +static inline void kvm_arch_sync_events(struct kvm *kvm) {} +static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {} +static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) {} +void kvm_check_vpid(struct kvm_vcpu *vcpu); +enum hrtimer_restart kvm_swtimer_wakeup(struct hrtimer *timer); +void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, const struct kvm_memory_slot *memslot); +void kvm_init_vmcs(struct kvm *kvm); +void kvm_exc_entry(void); +int kvm_enter_guest(struct kvm_run *run, struct kvm_vcpu *vcpu); + +extern unsigned long vpid_mask; +extern const unsigned long kvm_exception_size; +extern const unsigned long kvm_enter_guest_size; +extern struct kvm_world_switch *kvm_loongarch_ops; + +#define SW_GCSR (1 << 0) +#define HW_GCSR (1 << 1) +#define INVALID_GCSR (1 << 2) + +int get_gcsr_flag(int csr); +void set_hw_gcsr(int csr_id, unsigned long val); + +#endif /* __ASM_LOONGARCH_KVM_HOST_H__ */ diff --git a/arch/loongarch/include/asm/kvm_mmu.h b/arch/loongarch/include/asm/kvm_mmu.h new file mode 100644 index 000000000000..099bafc6f797 --- /dev/null +++ b/arch/loongarch/include/asm/kvm_mmu.h @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +#ifndef __ASM_LOONGARCH_KVM_MMU_H__ +#define __ASM_LOONGARCH_KVM_MMU_H__ + +#include <linux/kvm_host.h> +#include <asm/pgalloc.h> +#include <asm/tlb.h> + +/* + * KVM_MMU_CACHE_MIN_PAGES is the number of GPA page table translation levels + * for which pages need to be cached. 
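 + * With 3-level page tables (CONFIG_PGTABLE_LEVELS == 3), for example, this + * is 2: one page for each level below the preallocated root PGD that a new + * mapping may have to allocate.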
+ */ +#define KVM_MMU_CACHE_MIN_PAGES (CONFIG_PGTABLE_LEVELS - 1) + +#define _KVM_FLUSH_PGTABLE 0x1 +#define _KVM_HAS_PGMASK 0x2 +#define kvm_pfn_pte(pfn, prot) (((pfn) << PFN_PTE_SHIFT) | pgprot_val(prot)) +#define kvm_pte_pfn(x) ((phys_addr_t)((x & _PFN_MASK) >> PFN_PTE_SHIFT)) + +typedef unsigned long kvm_pte_t; +typedef struct kvm_ptw_ctx kvm_ptw_ctx; +typedef int (*kvm_pte_ops)(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx); + +struct kvm_ptw_ctx { + kvm_pte_ops ops; + unsigned long flag; + + /* for kvm_arch_mmu_enable_log_dirty_pt_masked use */ + unsigned long mask; + unsigned long gfn; + + /* page walk mmu info */ + unsigned int level; + unsigned long pgtable_shift; + unsigned long invalid_entry; + unsigned long *invalid_ptes; + unsigned int *pte_shifts; + void *opaque; + + /* free pte table page list */ + struct list_head list; +}; + +kvm_pte_t *kvm_pgd_alloc(void); + +static inline void kvm_set_pte(kvm_pte_t *ptep, kvm_pte_t val) +{ + WRITE_ONCE(*ptep, val); +} + +static inline int kvm_pte_write(kvm_pte_t pte) { return pte & _PAGE_WRITE; } +static inline int kvm_pte_dirty(kvm_pte_t pte) { return pte & _PAGE_DIRTY; } +static inline int kvm_pte_young(kvm_pte_t pte) { return pte & _PAGE_ACCESSED; } +static inline int kvm_pte_huge(kvm_pte_t pte) { return pte & _PAGE_HUGE; } + +static inline kvm_pte_t kvm_pte_mkyoung(kvm_pte_t pte) +{ + return pte | _PAGE_ACCESSED; +} + +static inline kvm_pte_t kvm_pte_mkold(kvm_pte_t pte) +{ + return pte & ~_PAGE_ACCESSED; +} + +static inline kvm_pte_t kvm_pte_mkdirty(kvm_pte_t pte) +{ + return pte | _PAGE_DIRTY; +} + +static inline kvm_pte_t kvm_pte_mkclean(kvm_pte_t pte) +{ + return pte & ~_PAGE_DIRTY; +} + +static inline kvm_pte_t kvm_pte_mkhuge(kvm_pte_t pte) +{ + return pte | _PAGE_HUGE; +} + +static inline kvm_pte_t kvm_pte_mksmall(kvm_pte_t pte) +{ + return pte & ~_PAGE_HUGE; +} + +static inline int kvm_need_flush(kvm_ptw_ctx *ctx) +{ + return ctx->flag & _KVM_FLUSH_PGTABLE; +} + +static inline kvm_pte_t *kvm_pgtable_offset(kvm_ptw_ctx *ctx, kvm_pte_t *table, + phys_addr_t addr) +{ + + return table + ((addr >> ctx->pgtable_shift) & (PTRS_PER_PTE - 1)); +} + +static inline phys_addr_t kvm_pgtable_addr_end(kvm_ptw_ctx *ctx, + phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t boundary, size; + + size = 0x1UL << ctx->pgtable_shift; + boundary = (addr + size) & ~(size - 1); + return (boundary - 1 < end - 1) ? 
boundary : end; +} + +static inline int kvm_pte_present(kvm_ptw_ctx *ctx, kvm_pte_t *entry) +{ + if (!ctx || ctx->level == 0) + return !!(*entry & _PAGE_PRESENT); + + return *entry != ctx->invalid_entry; +} + +static inline int kvm_pte_none(kvm_ptw_ctx *ctx, kvm_pte_t *entry) +{ + return *entry == ctx->invalid_entry; +} + +static inline void kvm_ptw_enter(kvm_ptw_ctx *ctx) +{ + ctx->level--; + ctx->pgtable_shift = ctx->pte_shifts[ctx->level]; + ctx->invalid_entry = ctx->invalid_ptes[ctx->level]; +} + +static inline void kvm_ptw_exit(kvm_ptw_ctx *ctx) +{ + ctx->level++; + ctx->pgtable_shift = ctx->pte_shifts[ctx->level]; + ctx->invalid_entry = ctx->invalid_ptes[ctx->level]; +} + +#endif /* __ASM_LOONGARCH_KVM_MMU_H__ */ diff --git a/arch/loongarch/include/asm/kvm_types.h b/arch/loongarch/include/asm/kvm_types.h new file mode 100644 index 000000000000..2fe1d4bdff66 --- /dev/null +++ b/arch/loongarch/include/asm/kvm_types.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +#ifndef _ASM_LOONGARCH_KVM_TYPES_H +#define _ASM_LOONGARCH_KVM_TYPES_H + +#define KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE 40 + +#endif /* _ASM_LOONGARCH_KVM_TYPES_H */ diff --git a/arch/loongarch/include/asm/kvm_vcpu.h b/arch/loongarch/include/asm/kvm_vcpu.h new file mode 100644 index 000000000000..553cfa2b2b1c --- /dev/null +++ b/arch/loongarch/include/asm/kvm_vcpu.h @@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +#ifndef __ASM_LOONGARCH_KVM_VCPU_H__ +#define __ASM_LOONGARCH_KVM_VCPU_H__ + +#include <linux/kvm_host.h> +#include <asm/loongarch.h> + +/* Controlled by 0x5 guest estat */ +#define CPU_SIP0 (_ULCAST_(1)) +#define CPU_SIP1 (_ULCAST_(1) << 1) +#define CPU_PMU (_ULCAST_(1) << 10) +#define CPU_TIMER (_ULCAST_(1) << 11) +#define CPU_IPI (_ULCAST_(1) << 12) + +/* Controlled by 0x52 guest exception VIP aligned to estat bit 5~12 */ +#define CPU_IP0 (_ULCAST_(1)) +#define CPU_IP1 (_ULCAST_(1) << 1) +#define CPU_IP2 (_ULCAST_(1) << 2) +#define CPU_IP3 (_ULCAST_(1) << 3) +#define CPU_IP4 (_ULCAST_(1) << 4) +#define CPU_IP5 (_ULCAST_(1) << 5) +#define CPU_IP6 (_ULCAST_(1) << 6) +#define CPU_IP7 (_ULCAST_(1) << 7) + +#define MNSEC_PER_SEC (NSEC_PER_SEC >> 20) + +/* KVM_IRQ_LINE irq field index values */ +#define KVM_LOONGSON_IRQ_TYPE_SHIFT 24 +#define KVM_LOONGSON_IRQ_TYPE_MASK 0xff +#define KVM_LOONGSON_IRQ_VCPU_SHIFT 16 +#define KVM_LOONGSON_IRQ_VCPU_MASK 0xff +#define KVM_LOONGSON_IRQ_NUM_SHIFT 0 +#define KVM_LOONGSON_IRQ_NUM_MASK 0xffff + +typedef union loongarch_instruction larch_inst; +typedef int (*exit_handle_fn)(struct kvm_vcpu *); + +int kvm_emu_mmio_read(struct kvm_vcpu *vcpu, larch_inst inst); +int kvm_emu_mmio_write(struct kvm_vcpu *vcpu, larch_inst inst); +int kvm_complete_mmio_read(struct kvm_vcpu *vcpu, struct kvm_run *run); +int kvm_complete_iocsr_read(struct kvm_vcpu *vcpu, struct kvm_run *run); +int kvm_emu_idle(struct kvm_vcpu *vcpu); +int kvm_pending_timer(struct kvm_vcpu *vcpu); +int kvm_handle_fault(struct kvm_vcpu *vcpu, int fault); +void kvm_deliver_intr(struct kvm_vcpu *vcpu); +void kvm_deliver_exception(struct kvm_vcpu *vcpu); + +void kvm_own_fpu(struct kvm_vcpu *vcpu); +void kvm_lose_fpu(struct kvm_vcpu *vcpu); +void kvm_save_fpu(struct loongarch_fpu *fpu); +void kvm_restore_fpu(struct loongarch_fpu *fpu); +void kvm_restore_fcsr(struct loongarch_fpu *fpu); + +void kvm_acquire_timer(struct kvm_vcpu *vcpu); +void 
kvm_init_timer(struct kvm_vcpu *vcpu, unsigned long hz); +void kvm_reset_timer(struct kvm_vcpu *vcpu); +void kvm_save_timer(struct kvm_vcpu *vcpu); +void kvm_restore_timer(struct kvm_vcpu *vcpu); + +int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq); + +/* + * Loongarch KVM guest interrupt handling + */ +static inline void kvm_queue_irq(struct kvm_vcpu *vcpu, unsigned int irq) +{ + set_bit(irq, &vcpu->arch.irq_pending); + clear_bit(irq, &vcpu->arch.irq_clear); +} + +static inline void kvm_dequeue_irq(struct kvm_vcpu *vcpu, unsigned int irq) +{ + clear_bit(irq, &vcpu->arch.irq_pending); + set_bit(irq, &vcpu->arch.irq_clear); +} + +static inline int kvm_queue_exception(struct kvm_vcpu *vcpu, + unsigned int code, unsigned int subcode) +{ + /* only one exception can be injected */ + if (!vcpu->arch.exception_pending) { + set_bit(code, &vcpu->arch.exception_pending); + vcpu->arch.esubcode = subcode; + return 0; + } else + return -1; +} + +#endif /* __ASM_LOONGARCH_KVM_VCPU_H__ */ diff --git a/arch/loongarch/include/asm/linkage.h b/arch/loongarch/include/asm/linkage.h index 81b0c4cfbf4f..e2eca1a25b4e 100644 --- a/arch/loongarch/include/asm/linkage.h +++ b/arch/loongarch/include/asm/linkage.h @@ -33,4 +33,12 @@ .cfi_endproc; \ SYM_END(name, SYM_T_FUNC) +#define SYM_CODE_START(name) \ + SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \ + .cfi_startproc; + +#define SYM_CODE_END(name) \ + .cfi_endproc; \ + SYM_END(name, SYM_T_NONE) + #endif diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h index 33531d432b49..9b4957cefa8a 100644 --- a/arch/loongarch/include/asm/loongarch.h +++ b/arch/loongarch/include/asm/loongarch.h @@ -226,6 +226,7 @@ #define LOONGARCH_CSR_ECFG 0x4 /* Exception config */ #define CSR_ECFG_VS_SHIFT 16 #define CSR_ECFG_VS_WIDTH 3 +#define CSR_ECFG_VS_SHIFT_END (CSR_ECFG_VS_SHIFT + CSR_ECFG_VS_WIDTH - 1) #define CSR_ECFG_VS (_ULCAST_(0x7) << CSR_ECFG_VS_SHIFT) #define CSR_ECFG_IM_SHIFT 0 #define CSR_ECFG_IM_WIDTH 14 @@ -314,13 +315,14 @@ #define CSR_TLBLO1_V (_ULCAST_(0x1) << CSR_TLBLO1_V_SHIFT) #define LOONGARCH_CSR_GTLBC 0x15 /* Guest TLB control */ -#define CSR_GTLBC_RID_SHIFT 16 -#define CSR_GTLBC_RID_WIDTH 8 -#define CSR_GTLBC_RID (_ULCAST_(0xff) << CSR_GTLBC_RID_SHIFT) +#define CSR_GTLBC_TGID_SHIFT 16 +#define CSR_GTLBC_TGID_WIDTH 8 +#define CSR_GTLBC_TGID_SHIFT_END (CSR_GTLBC_TGID_SHIFT + CSR_GTLBC_TGID_WIDTH - 1) +#define CSR_GTLBC_TGID (_ULCAST_(0xff) << CSR_GTLBC_TGID_SHIFT) #define CSR_GTLBC_TOTI_SHIFT 13 #define CSR_GTLBC_TOTI (_ULCAST_(0x1) << CSR_GTLBC_TOTI_SHIFT) -#define CSR_GTLBC_USERID_SHIFT 12 -#define CSR_GTLBC_USERID (_ULCAST_(0x1) << CSR_GTLBC_USERID_SHIFT) +#define CSR_GTLBC_USETGID_SHIFT 12 +#define CSR_GTLBC_USETGID (_ULCAST_(0x1) << CSR_GTLBC_USETGID_SHIFT) #define CSR_GTLBC_GMTLBSZ_SHIFT 0 #define CSR_GTLBC_GMTLBSZ_WIDTH 6 #define CSR_GTLBC_GMTLBSZ (_ULCAST_(0x3f) << CSR_GTLBC_GMTLBSZ_SHIFT) @@ -475,6 +477,7 @@ #define LOONGARCH_CSR_GSTAT 0x50 /* Guest status */ #define CSR_GSTAT_GID_SHIFT 16 #define CSR_GSTAT_GID_WIDTH 8 +#define CSR_GSTAT_GID_SHIFT_END (CSR_GSTAT_GID_SHIFT + CSR_GSTAT_GID_WIDTH - 1) #define CSR_GSTAT_GID (_ULCAST_(0xff) << CSR_GSTAT_GID_SHIFT) #define CSR_GSTAT_GIDBIT_SHIFT 4 #define CSR_GSTAT_GIDBIT_WIDTH 6 @@ -525,6 +528,12 @@ #define CSR_GCFG_MATC_GUEST (_ULCAST_(0x0) << CSR_GCFG_MATC_SHITF) #define CSR_GCFG_MATC_ROOT (_ULCAST_(0x1) << CSR_GCFG_MATC_SHITF) #define CSR_GCFG_MATC_NEST (_ULCAST_(0x2) << CSR_GCFG_MATC_SHITF) +#define CSR_GCFG_MATP_NEST_SHIFT 2 +#define 
CSR_GCFG_MATP_NEST (_ULCAST_(0x1) << CSR_GCFG_MATP_NEST_SHIFT) +#define CSR_GCFG_MATP_ROOT_SHIFT 1 +#define CSR_GCFG_MATP_ROOT (_ULCAST_(0x1) << CSR_GCFG_MATP_ROOT_SHIFT) +#define CSR_GCFG_MATP_GUEST_SHIFT 0 +#define CSR_GCFG_MATP_GUEST (_ULCAST_(0x1) << CSR_GCFG_MATP_GUEST_SHIFT) #define LOONGARCH_CSR_GINTC 0x52 /* Guest interrupt control */ #define CSR_GINTC_HC_SHIFT 16 diff --git a/arch/loongarch/include/asm/pgtable-bits.h b/arch/loongarch/include/asm/pgtable-bits.h index 35348d4c4209..21319c1e045c 100644 --- a/arch/loongarch/include/asm/pgtable-bits.h +++ b/arch/loongarch/include/asm/pgtable-bits.h @@ -105,13 +105,15 @@ static inline pgprot_t pgprot_noncached(pgprot_t _prot) return __pgprot(prot); } +extern bool wc_enabled; + #define pgprot_writecombine pgprot_writecombine static inline pgprot_t pgprot_writecombine(pgprot_t _prot) { unsigned long prot = pgprot_val(_prot); - prot = (prot & ~_CACHE_MASK) | _CACHE_WUC; + prot = (prot & ~_CACHE_MASK) | (wc_enabled ? _CACHE_WUC : _CACHE_SUC); return __pgprot(prot); } diff --git a/arch/loongarch/include/uapi/asm/kvm.h b/arch/loongarch/include/uapi/asm/kvm.h new file mode 100644 index 000000000000..c6ad2ee6106c --- /dev/null +++ b/arch/loongarch/include/uapi/asm/kvm.h @@ -0,0 +1,108 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +#ifndef __UAPI_ASM_LOONGARCH_KVM_H +#define __UAPI_ASM_LOONGARCH_KVM_H + +#include <linux/types.h> + +/* + * KVM LoongArch specific structures and definitions. + * + * Some parts derived from the x86 version of this file. + */ + +#define __KVM_HAVE_READONLY_MEM + +#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 +#define KVM_DIRTY_LOG_PAGE_OFFSET 64 + +/* + * for KVM_GET_REGS and KVM_SET_REGS + */ +struct kvm_regs { + /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ + __u64 gpr[32]; + __u64 pc; +}; + +/* + * for KVM_GET_FPU and KVM_SET_FPU + */ +struct kvm_fpu { + __u32 fcsr; + __u64 fcc; /* 8x8 */ + struct kvm_fpureg { + __u64 val64[4]; + } fpr[32]; +}; + +/* + * For LoongArch, we use KVM_SET_ONE_REG and KVM_GET_ONE_REG to access various + * registers. The id field is broken down as follows: + * + * bits[63..52] - As per linux/kvm.h + * bits[51..32] - Must be zero. + * bits[31..16] - Register set. + * + * Register set = 0: GP registers from kvm_regs (see definitions below). + * + * Register set = 1: CSR registers. + * + * Register set = 2: KVM specific registers (see definitions below). + * + * Register set = 3: FPU / SIMD registers (see definitions below). + * + * Other register sets may be added in the future. Each set would + * have its own identifier in bits[31..16]. + */ + +#define KVM_REG_LOONGARCH_GPR (KVM_REG_LOONGARCH | 0x00000ULL) +#define KVM_REG_LOONGARCH_CSR (KVM_REG_LOONGARCH | 0x10000ULL) +#define KVM_REG_LOONGARCH_KVM (KVM_REG_LOONGARCH | 0x20000ULL) +#define KVM_REG_LOONGARCH_FPSIMD (KVM_REG_LOONGARCH | 0x30000ULL) +#define KVM_REG_LOONGARCH_CPUCFG (KVM_REG_LOONGARCH | 0x40000ULL) +#define KVM_REG_LOONGARCH_MASK (KVM_REG_LOONGARCH | 0x70000ULL) +#define KVM_CSR_IDX_MASK 0x7fff +#define KVM_CPUCFG_IDX_MASK 0x7fff + +/* + * KVM_REG_LOONGARCH_KVM - KVM specific control registers. 
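 + * Two such registers are currently defined below: COUNTER and VCPU_RESET.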
+ */ + +#define KVM_REG_LOONGARCH_COUNTER (KVM_REG_LOONGARCH_KVM | KVM_REG_SIZE_U64 | 1) +#define KVM_REG_LOONGARCH_VCPU_RESET (KVM_REG_LOONGARCH_KVM | KVM_REG_SIZE_U64 | 2) + +#define LOONGARCH_REG_SHIFT 3 +#define LOONGARCH_REG_64(TYPE, REG) (TYPE | KVM_REG_SIZE_U64 | (REG << LOONGARCH_REG_SHIFT)) +#define KVM_IOC_CSRID(REG) LOONGARCH_REG_64(KVM_REG_LOONGARCH_CSR, REG) +#define KVM_IOC_CPUCFG(REG) LOONGARCH_REG_64(KVM_REG_LOONGARCH_CPUCFG, REG) + +struct kvm_debug_exit_arch { +}; + +/* for KVM_SET_GUEST_DEBUG */ +struct kvm_guest_debug_arch { +}; + +/* definition of registers in kvm_run */ +struct kvm_sync_regs { +}; + +/* dummy definition */ +struct kvm_sregs { +}; + +struct kvm_iocsr_entry { + __u32 addr; + __u32 pad; + __u64 data; +}; + +#define KVM_NR_IRQCHIPS 1 +#define KVM_IRQCHIP_NUM_PINS 64 +#define KVM_MAX_CORES 256 + +#endif /* __UAPI_ASM_LOONGARCH_KVM_H */ diff --git a/arch/loongarch/kernel/asm-offsets.c b/arch/loongarch/kernel/asm-offsets.c index 8da0726777ed..173fe514fc9e 100644 --- a/arch/loongarch/kernel/asm-offsets.c +++ b/arch/loongarch/kernel/asm-offsets.c @@ -9,6 +9,7 @@ #include <linux/mm.h> #include <linux/kbuild.h> #include <linux/suspend.h> +#include <linux/kvm_host.h> #include <asm/cpu-info.h> #include <asm/ptrace.h> #include <asm/processor.h> @@ -289,3 +290,34 @@ void output_fgraph_ret_regs_defines(void) BLANK(); } #endif + +void output_kvm_defines(void) +{ + COMMENT("KVM/LoongArch Specific offsets."); + + OFFSET(VCPU_FCC, kvm_vcpu_arch, fpu.fcc); + OFFSET(VCPU_FCSR0, kvm_vcpu_arch, fpu.fcsr); + BLANK(); + + OFFSET(KVM_VCPU_ARCH, kvm_vcpu, arch); + OFFSET(KVM_VCPU_KVM, kvm_vcpu, kvm); + OFFSET(KVM_VCPU_RUN, kvm_vcpu, run); + BLANK(); + + OFFSET(KVM_ARCH_HSP, kvm_vcpu_arch, host_sp); + OFFSET(KVM_ARCH_HTP, kvm_vcpu_arch, host_tp); + OFFSET(KVM_ARCH_HPGD, kvm_vcpu_arch, host_pgd); + OFFSET(KVM_ARCH_HANDLE_EXIT, kvm_vcpu_arch, handle_exit); + OFFSET(KVM_ARCH_HEENTRY, kvm_vcpu_arch, host_eentry); + OFFSET(KVM_ARCH_GEENTRY, kvm_vcpu_arch, guest_eentry); + OFFSET(KVM_ARCH_GPC, kvm_vcpu_arch, pc); + OFFSET(KVM_ARCH_GGPR, kvm_vcpu_arch, gprs); + OFFSET(KVM_ARCH_HBADI, kvm_vcpu_arch, badi); + OFFSET(KVM_ARCH_HBADV, kvm_vcpu_arch, badv); + OFFSET(KVM_ARCH_HECFG, kvm_vcpu_arch, host_ecfg); + OFFSET(KVM_ARCH_HESTAT, kvm_vcpu_arch, host_estat); + OFFSET(KVM_ARCH_HPERCPU, kvm_vcpu_arch, host_percpu); + + OFFSET(KVM_GPGD, kvm, arch.pgd); + BLANK(); +} diff --git a/arch/loongarch/kernel/entry.S b/arch/loongarch/kernel/entry.S index 65518bb8f472..1ec8e4c4cc2b 100644 --- a/arch/loongarch/kernel/entry.S +++ b/arch/loongarch/kernel/entry.S @@ -18,7 +18,7 @@ .text .cfi_sections .debug_frame .align 5 -SYM_FUNC_START(handle_syscall) +SYM_CODE_START(handle_syscall) csrrd t0, PERCPU_BASE_KS la.pcrel t1, kernelsp add.d t1, t1, t0 @@ -71,7 +71,7 @@ SYM_FUNC_START(handle_syscall) bl do_syscall RESTORE_ALL_AND_RET -SYM_FUNC_END(handle_syscall) +SYM_CODE_END(handle_syscall) _ASM_NOKPROBE(handle_syscall) SYM_CODE_START(ret_from_fork) diff --git a/arch/loongarch/kernel/genex.S b/arch/loongarch/kernel/genex.S index 78f066384657..2bb3aa2dcfcb 100644 --- a/arch/loongarch/kernel/genex.S +++ b/arch/loongarch/kernel/genex.S @@ -31,7 +31,7 @@ SYM_FUNC_START(__arch_cpu_idle) 1: jr ra SYM_FUNC_END(__arch_cpu_idle) -SYM_FUNC_START(handle_vint) +SYM_CODE_START(handle_vint) BACKUP_T0T1 SAVE_ALL la_abs t1, __arch_cpu_idle @@ -46,11 +46,11 @@ SYM_FUNC_START(handle_vint) la_abs t0, do_vint jirl ra, t0, 0 RESTORE_ALL_AND_RET -SYM_FUNC_END(handle_vint) +SYM_CODE_END(handle_vint) -SYM_FUNC_START(except_vec_cex) 
+SYM_CODE_START(except_vec_cex) b cache_parity_error -SYM_FUNC_END(except_vec_cex) +SYM_CODE_END(except_vec_cex) .macro build_prep_badv csrrd t0, LOONGARCH_CSR_BADV @@ -66,7 +66,7 @@ SYM_FUNC_END(except_vec_cex) .macro BUILD_HANDLER exception handler prep .align 5 - SYM_FUNC_START(handle_\exception) + SYM_CODE_START(handle_\exception) 666: BACKUP_T0T1 SAVE_ALL @@ -76,7 +76,7 @@ SYM_FUNC_END(except_vec_cex) jirl ra, t0, 0 668: RESTORE_ALL_AND_RET - SYM_FUNC_END(handle_\exception) + SYM_CODE_END(handle_\exception) SYM_DATA(unwind_hint_\exception, .word 668b - 666b) .endm @@ -93,7 +93,7 @@ SYM_FUNC_END(except_vec_cex) BUILD_HANDLER watch watch none BUILD_HANDLER reserved reserved none /* others */ -SYM_FUNC_START(handle_sys) +SYM_CODE_START(handle_sys) la_abs t0, handle_syscall jr t0 -SYM_FUNC_END(handle_sys) +SYM_CODE_END(handle_sys) diff --git a/arch/loongarch/kernel/module.c b/arch/loongarch/kernel/module.c index b8b86088b2dd..b13b2858fe39 100644 --- a/arch/loongarch/kernel/module.c +++ b/arch/loongarch/kernel/module.c @@ -367,6 +367,24 @@ static int apply_r_larch_got_pc(struct module *mod, return apply_r_larch_pcala(mod, location, got, rela_stack, rela_stack_top, type); } +static int apply_r_larch_32_pcrel(struct module *mod, u32 *location, Elf_Addr v, + s64 *rela_stack, size_t *rela_stack_top, unsigned int type) +{ + ptrdiff_t offset = (void *)v - (void *)location; + + *(u32 *)location = offset; + return 0; +} + +static int apply_r_larch_64_pcrel(struct module *mod, u32 *location, Elf_Addr v, + s64 *rela_stack, size_t *rela_stack_top, unsigned int type) +{ + ptrdiff_t offset = (void *)v - (void *)location; + + *(u64 *)location = offset; + return 0; +} + /* * reloc_handlers_rela() - Apply a particular relocation to a module * @mod: the module to apply the reloc to @@ -382,7 +400,7 @@ typedef int (*reloc_rela_handler)(struct module *mod, u32 *location, Elf_Addr v, /* The handlers for known reloc types */ static reloc_rela_handler reloc_rela_handlers[] = { - [R_LARCH_NONE ... R_LARCH_RELAX] = apply_r_larch_error, + [R_LARCH_NONE ... R_LARCH_64_PCREL] = apply_r_larch_error, [R_LARCH_NONE] = apply_r_larch_none, [R_LARCH_32] = apply_r_larch_32, @@ -396,6 +414,8 @@ static reloc_rela_handler reloc_rela_handlers[] = { [R_LARCH_SOP_POP_32_S_10_5 ... R_LARCH_SOP_POP_32_U] = apply_r_larch_sop_imm_field, [R_LARCH_ADD32 ... 
R_LARCH_SUB64] = apply_r_larch_add_sub, [R_LARCH_PCALA_HI20...R_LARCH_PCALA64_HI12] = apply_r_larch_pcala, + [R_LARCH_32_PCREL] = apply_r_larch_32_pcrel, + [R_LARCH_64_PCREL] = apply_r_larch_64_pcrel, }; int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, diff --git a/arch/loongarch/kernel/numa.c b/arch/loongarch/kernel/numa.c index c7d33c489e04..6e65ff12d5c7 100644 --- a/arch/loongarch/kernel/numa.c +++ b/arch/loongarch/kernel/numa.c @@ -436,7 +436,7 @@ void __init paging_init(void) void __init mem_init(void) { - high_memory = (void *) __va(get_num_physpages() << PAGE_SHIFT); + high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT); memblock_free_all(); } diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index 7783f0a3d742..aed65915e932 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -161,19 +161,19 @@ static void __init smbios_parse(void) } #ifdef CONFIG_ARCH_WRITECOMBINE -pgprot_t pgprot_wc = PAGE_KERNEL_WUC; +bool wc_enabled = true; #else -pgprot_t pgprot_wc = PAGE_KERNEL_SUC; +bool wc_enabled = false; #endif -EXPORT_SYMBOL(pgprot_wc); +EXPORT_SYMBOL(wc_enabled); static int __init setup_writecombine(char *p) { if (!strcmp(p, "on")) - pgprot_wc = PAGE_KERNEL_WUC; + wc_enabled = true; else if (!strcmp(p, "off")) - pgprot_wc = PAGE_KERNEL_SUC; + wc_enabled = false; else pr_warn("Unknown writecombine setting \"%s\".\n", p); diff --git a/arch/loongarch/kvm/Kconfig b/arch/loongarch/kvm/Kconfig new file mode 100644 index 000000000000..fda425babfb2 --- /dev/null +++ b/arch/loongarch/kvm/Kconfig @@ -0,0 +1,40 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# KVM configuration +# + +source "virt/kvm/Kconfig" + +menuconfig VIRTUALIZATION + bool "Virtualization" + help + Say Y here to get to see options for using your Linux host to run + other operating systems inside virtual machines (guests). + This option alone does not add any kernel code. + + If you say N, all options in this submenu will be skipped and + disabled. + +if VIRTUALIZATION + +config KVM + tristate "Kernel-based Virtual Machine (KVM) support" + depends on AS_HAS_LVZ_EXTENSION + depends on HAVE_KVM + select HAVE_KVM_DIRTY_RING_ACQ_REL + select HAVE_KVM_EVENTFD + select HAVE_KVM_VCPU_ASYNC_IOCTL + select KVM_GENERIC_DIRTYLOG_READ_PROTECT + select KVM_GENERIC_HARDWARE_ENABLING + select KVM_MMIO + select KVM_XFER_TO_GUEST_WORK + select MMU_NOTIFIER + select PREEMPT_NOTIFIERS + help + Support hosting virtualized guest machines using + hardware virtualization extensions. You will need + a processor equipped with virtualization extensions. + + If unsure, say N. 
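 + + To compile this as a module, choose M here: the + module will be called kvm.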
+ +endif # VIRTUALIZATION diff --git a/arch/loongarch/kvm/Makefile b/arch/loongarch/kvm/Makefile new file mode 100644 index 000000000000..244467d7792a --- /dev/null +++ b/arch/loongarch/kvm/Makefile @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for LoongArch KVM support +# + +ccflags-y += -I $(srctree)/$(src) + +include $(srctree)/virt/kvm/Makefile.kvm + +obj-$(CONFIG_KVM) += kvm.o + +kvm-y += exit.o +kvm-y += interrupt.o +kvm-y += main.o +kvm-y += mmu.o +kvm-y += switch.o +kvm-y += timer.o +kvm-y += tlb.o +kvm-y += vcpu.o +kvm-y += vm.o + +CFLAGS_exit.o += $(call cc-option,-Wno-override-init,) diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c new file mode 100644 index 000000000000..ce8de3fa472c --- /dev/null +++ b/arch/loongarch/kvm/exit.c @@ -0,0 +1,696 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +#include <linux/err.h> +#include <linux/errno.h> +#include <linux/kvm_host.h> +#include <linux/module.h> +#include <linux/preempt.h> +#include <linux/vmalloc.h> +#include <asm/fpu.h> +#include <asm/inst.h> +#include <asm/loongarch.h> +#include <asm/mmzone.h> +#include <asm/numa.h> +#include <asm/time.h> +#include <asm/tlb.h> +#include <asm/kvm_csr.h> +#include <asm/kvm_vcpu.h> +#include "trace.h" + +static unsigned long kvm_emu_read_csr(struct kvm_vcpu *vcpu, int csrid) +{ + unsigned long val = 0; + struct loongarch_csrs *csr = vcpu->arch.csr; + + /* + * From LoongArch Reference Manual Volume 1 Chapter 4.2.1 + * For undefined CSR id, return value is 0 + */ + if (get_gcsr_flag(csrid) & SW_GCSR) + val = kvm_read_sw_gcsr(csr, csrid); + else + pr_warn_once("Unsupported csrrd 0x%x with pc %lx\n", csrid, vcpu->arch.pc); + + return val; +} + +static unsigned long kvm_emu_write_csr(struct kvm_vcpu *vcpu, int csrid, unsigned long val) +{ + unsigned long old = 0; + struct loongarch_csrs *csr = vcpu->arch.csr; + + if (get_gcsr_flag(csrid) & SW_GCSR) { + old = kvm_read_sw_gcsr(csr, csrid); + kvm_write_sw_gcsr(csr, csrid, val); + } else + pr_warn_once("Unsupported csrwr 0x%x with pc %lx\n", csrid, vcpu->arch.pc); + + return old; +} + +static unsigned long kvm_emu_xchg_csr(struct kvm_vcpu *vcpu, int csrid, + unsigned long csr_mask, unsigned long val) +{ + unsigned long old = 0; + struct loongarch_csrs *csr = vcpu->arch.csr; + + if (get_gcsr_flag(csrid) & SW_GCSR) { + old = kvm_read_sw_gcsr(csr, csrid); + val = (old & ~csr_mask) | (val & csr_mask); + kvm_write_sw_gcsr(csr, csrid, val); + old = old & csr_mask; + } else + pr_warn_once("Unsupported csrxchg 0x%x with pc %lx\n", csrid, vcpu->arch.pc); + + return old; +} + +static int kvm_handle_csr(struct kvm_vcpu *vcpu, larch_inst inst) +{ + unsigned int rd, rj, csrid; + unsigned long csr_mask, val = 0; + + /* + * CSR value mask imm + * rj = 0 means csrrd + * rj = 1 means csrwr + * rj != 0,1 means csrxchg + */ + rd = inst.reg2csr_format.rd; + rj = inst.reg2csr_format.rj; + csrid = inst.reg2csr_format.csr; + + /* Process CSR ops */ + switch (rj) { + case 0: /* process csrrd */ + val = kvm_emu_read_csr(vcpu, csrid); + vcpu->arch.gprs[rd] = val; + break; + case 1: /* process csrwr */ + val = vcpu->arch.gprs[rd]; + val = kvm_emu_write_csr(vcpu, csrid, val); + vcpu->arch.gprs[rd] = val; + break; + default: /* process csrxchg */ + val = vcpu->arch.gprs[rd]; + csr_mask = vcpu->arch.gprs[rj]; + val = kvm_emu_xchg_csr(vcpu, csrid, csr_mask, val); + vcpu->arch.gprs[rd] = val; + } + + return EMULATE_DONE; +} + +int kvm_emu_iocsr(larch_inst inst, struct kvm_run 
*run, struct kvm_vcpu *vcpu) +{ + int ret; + unsigned long val; + u32 addr, rd, rj, opcode; + + /* + * Each IOCSR instruction has a different opcode + */ + rd = inst.reg2_format.rd; + rj = inst.reg2_format.rj; + opcode = inst.reg2_format.opcode; + addr = vcpu->arch.gprs[rj]; + ret = EMULATE_DO_IOCSR; + run->iocsr_io.phys_addr = addr; + run->iocsr_io.is_write = 0; + + /* LoongArch is little-endian */ + switch (opcode) { + case iocsrrdb_op: + run->iocsr_io.len = 1; + break; + case iocsrrdh_op: + run->iocsr_io.len = 2; + break; + case iocsrrdw_op: + run->iocsr_io.len = 4; + break; + case iocsrrdd_op: + run->iocsr_io.len = 8; + break; + case iocsrwrb_op: + run->iocsr_io.len = 1; + run->iocsr_io.is_write = 1; + break; + case iocsrwrh_op: + run->iocsr_io.len = 2; + run->iocsr_io.is_write = 1; + break; + case iocsrwrw_op: + run->iocsr_io.len = 4; + run->iocsr_io.is_write = 1; + break; + case iocsrwrd_op: + run->iocsr_io.len = 8; + run->iocsr_io.is_write = 1; + break; + default: + ret = EMULATE_FAIL; + break; + } + + if (ret == EMULATE_DO_IOCSR) { + if (run->iocsr_io.is_write) { + val = vcpu->arch.gprs[rd]; + memcpy(run->iocsr_io.data, &val, run->iocsr_io.len); + } + vcpu->arch.io_gpr = rd; + } + + return ret; +} + +int kvm_complete_iocsr_read(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + enum emulation_result er = EMULATE_DONE; + unsigned long *gpr = &vcpu->arch.gprs[vcpu->arch.io_gpr]; + + switch (run->iocsr_io.len) { + case 1: + *gpr = *(s8 *)run->iocsr_io.data; + break; + case 2: + *gpr = *(s16 *)run->iocsr_io.data; + break; + case 4: + *gpr = *(s32 *)run->iocsr_io.data; + break; + case 8: + *gpr = *(s64 *)run->iocsr_io.data; + break; + default: + kvm_err("Bad IOCSR length: %d, addr is 0x%lx\n", + run->iocsr_io.len, vcpu->arch.badv); + er = EMULATE_FAIL; + break; + } + + return er; +} + +int kvm_emu_idle(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.idle_exits; + trace_kvm_exit_idle(vcpu, KVM_TRACE_EXIT_IDLE); + + if (!kvm_arch_vcpu_runnable(vcpu)) { + /* + * Switch to the software timer before halt-polling/blocking as + * the guest's timer may be a break event for the vCPU, and the + * hypervisor timer runs only when the CPU is in guest mode. + * Switch before halt-polling so that KVM recognizes an expired + * timer before blocking. 
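 + * kvm_restore_timer() performs the reverse switch when the vCPU runs + * again.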
+ */ + kvm_save_timer(vcpu); + kvm_vcpu_block(vcpu); + } + + return EMULATE_DONE; +} + +static int kvm_trap_handle_gspr(struct kvm_vcpu *vcpu) +{ + int rd, rj; + unsigned int index; + unsigned long curr_pc; + larch_inst inst; + enum emulation_result er = EMULATE_DONE; + struct kvm_run *run = vcpu->run; + + /* Fetch the instruction */ + inst.word = vcpu->arch.badi; + curr_pc = vcpu->arch.pc; + update_pc(&vcpu->arch); + + trace_kvm_exit_gspr(vcpu, inst.word); + er = EMULATE_FAIL; + switch (((inst.word >> 24) & 0xff)) { + case 0x0: /* CPUCFG GSPR */ + if (inst.reg2_format.opcode == 0x1B) { + rd = inst.reg2_format.rd; + rj = inst.reg2_format.rj; + ++vcpu->stat.cpucfg_exits; + index = vcpu->arch.gprs[rj]; + er = EMULATE_DONE; + /* + * By LoongArch Reference Manual 2.2.10.5 + * return value is 0 for undefined cpucfg index + */ + if (index < KVM_MAX_CPUCFG_REGS) + vcpu->arch.gprs[rd] = vcpu->arch.cpucfg[index]; + else + vcpu->arch.gprs[rd] = 0; + } + break; + case 0x4: /* CSR{RD,WR,XCHG} GSPR */ + er = kvm_handle_csr(vcpu, inst); + break; + case 0x6: /* Cache, Idle and IOCSR GSPR */ + switch (((inst.word >> 22) & 0x3ff)) { + case 0x18: /* Cache GSPR */ + er = EMULATE_DONE; + trace_kvm_exit_cache(vcpu, KVM_TRACE_EXIT_CACHE); + break; + case 0x19: /* Idle/IOCSR GSPR */ + switch (((inst.word >> 15) & 0x1ffff)) { + case 0xc90: /* IOCSR GSPR */ + er = kvm_emu_iocsr(inst, run, vcpu); + break; + case 0xc91: /* Idle GSPR */ + er = kvm_emu_idle(vcpu); + break; + default: + er = EMULATE_FAIL; + break; + } + break; + default: + er = EMULATE_FAIL; + break; + } + break; + default: + er = EMULATE_FAIL; + break; + } + + /* Rollback PC only if emulation was unsuccessful */ + if (er == EMULATE_FAIL) { + kvm_err("[%#lx]%s: unsupported gspr instruction 0x%08x\n", + curr_pc, __func__, inst.word); + + kvm_arch_vcpu_dump_regs(vcpu); + vcpu->arch.pc = curr_pc; + } + + return er; +} + +/* + * Trigger GSPR: + * 1) Execute CPUCFG instruction; + * 2) Execute CACOP/IDLE instructions; + * 3) Access to unimplemented CSRs/IOCSRs. + */ +static int kvm_handle_gspr(struct kvm_vcpu *vcpu) +{ + int ret = RESUME_GUEST; + enum emulation_result er = EMULATE_DONE; + + er = kvm_trap_handle_gspr(vcpu); + + if (er == EMULATE_DONE) { + ret = RESUME_GUEST; + } else if (er == EMULATE_DO_MMIO) { + vcpu->run->exit_reason = KVM_EXIT_MMIO; + ret = RESUME_HOST; + } else if (er == EMULATE_DO_IOCSR) { + vcpu->run->exit_reason = KVM_EXIT_LOONGARCH_IOCSR; + ret = RESUME_HOST; + } else { + kvm_queue_exception(vcpu, EXCCODE_INE, 0); + ret = RESUME_GUEST; + } + + return ret; +} + +int kvm_emu_mmio_read(struct kvm_vcpu *vcpu, larch_inst inst) +{ + int ret; + unsigned int op8, opcode, rd; + struct kvm_run *run = vcpu->run; + + run->mmio.phys_addr = vcpu->arch.badv; + vcpu->mmio_needed = 2; /* signed */ + op8 = (inst.word >> 24) & 0xff; + ret = EMULATE_DO_MMIO; + + switch (op8) { + case 0x24 ... 0x27: /* ldptr.w/d process */ + rd = inst.reg2i14_format.rd; + opcode = inst.reg2i14_format.opcode; + + switch (opcode) { + case ldptrw_op: + run->mmio.len = 4; + break; + case ldptrd_op: + run->mmio.len = 8; + break; + default: + break; + } + break; + case 0x28 ... 
0x2e: /* ld.b/h/w/d, ld.bu/hu/wu process */ + rd = inst.reg2i12_format.rd; + opcode = inst.reg2i12_format.opcode; + + switch (opcode) { + case ldb_op: + run->mmio.len = 1; + break; + case ldbu_op: + vcpu->mmio_needed = 1; /* unsigned */ + run->mmio.len = 1; + break; + case ldh_op: + run->mmio.len = 2; + break; + case ldhu_op: + vcpu->mmio_needed = 1; /* unsigned */ + run->mmio.len = 2; + break; + case ldw_op: + run->mmio.len = 4; + break; + case ldwu_op: + vcpu->mmio_needed = 1; /* unsigned */ + run->mmio.len = 4; + break; + case ldd_op: + run->mmio.len = 8; + break; + default: + ret = EMULATE_FAIL; + break; + } + break; + case 0x38: /* ldx.b/h/w/d, ldx.bu/hu/wu process */ + rd = inst.reg3_format.rd; + opcode = inst.reg3_format.opcode; + + switch (opcode) { + case ldxb_op: + run->mmio.len = 1; + break; + case ldxbu_op: + run->mmio.len = 1; + vcpu->mmio_needed = 1; /* unsigned */ + break; + case ldxh_op: + run->mmio.len = 2; + break; + case ldxhu_op: + run->mmio.len = 2; + vcpu->mmio_needed = 1; /* unsigned */ + break; + case ldxw_op: + run->mmio.len = 4; + break; + case ldxwu_op: + run->mmio.len = 4; + vcpu->mmio_needed = 1; /* unsigned */ + break; + case ldxd_op: + run->mmio.len = 8; + break; + default: + ret = EMULATE_FAIL; + break; + } + break; + default: + ret = EMULATE_FAIL; + } + + if (ret == EMULATE_DO_MMIO) { + /* Set for kvm_complete_mmio_read() use */ + vcpu->arch.io_gpr = rd; + run->mmio.is_write = 0; + vcpu->mmio_is_write = 0; + } else { + kvm_err("Read not supported Inst=0x%08x @%lx BadVaddr:%#lx\n", + inst.word, vcpu->arch.pc, vcpu->arch.badv); + kvm_arch_vcpu_dump_regs(vcpu); + vcpu->mmio_needed = 0; + } + + return ret; +} + +int kvm_complete_mmio_read(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + enum emulation_result er = EMULATE_DONE; + unsigned long *gpr = &vcpu->arch.gprs[vcpu->arch.io_gpr]; + + /* Update with new PC */ + update_pc(&vcpu->arch); + switch (run->mmio.len) { + case 1: + if (vcpu->mmio_needed == 2) + *gpr = *(s8 *)run->mmio.data; + else + *gpr = *(u8 *)run->mmio.data; + break; + case 2: + if (vcpu->mmio_needed == 2) + *gpr = *(s16 *)run->mmio.data; + else + *gpr = *(u16 *)run->mmio.data; + break; + case 4: + if (vcpu->mmio_needed == 2) + *gpr = *(s32 *)run->mmio.data; + else + *gpr = *(u32 *)run->mmio.data; + break; + case 8: + *gpr = *(s64 *)run->mmio.data; + break; + default: + kvm_err("Bad MMIO length: %d, addr is 0x%lx\n", + run->mmio.len, vcpu->arch.badv); + er = EMULATE_FAIL; + break; + } + + return er; +} + +int kvm_emu_mmio_write(struct kvm_vcpu *vcpu, larch_inst inst) +{ + int ret; + unsigned int rd, op8, opcode; + unsigned long curr_pc, rd_val = 0; + struct kvm_run *run = vcpu->run; + void *data = run->mmio.data; + + /* + * Update PC and hold onto current PC in case there is + * an error and we want to rollback the PC + */ + curr_pc = vcpu->arch.pc; + update_pc(&vcpu->arch); + + op8 = (inst.word >> 24) & 0xff; + run->mmio.phys_addr = vcpu->arch.badv; + ret = EMULATE_DO_MMIO; + switch (op8) { + case 0x24 ... 0x27: /* stptr.w/d process */ + rd = inst.reg2i14_format.rd; + opcode = inst.reg2i14_format.opcode; + + switch (opcode) { + case stptrw_op: + run->mmio.len = 4; + *(unsigned int *)data = vcpu->arch.gprs[rd]; + break; + case stptrd_op: + run->mmio.len = 8; + *(unsigned long *)data = vcpu->arch.gprs[rd]; + break; + default: + ret = EMULATE_FAIL; + break; + } + break; + case 0x28 ... 
0x2e: /* st.b/h/w/d process */ + rd = inst.reg2i12_format.rd; + opcode = inst.reg2i12_format.opcode; + rd_val = vcpu->arch.gprs[rd]; + + switch (opcode) { + case stb_op: + run->mmio.len = 1; + *(unsigned char *)data = rd_val; + break; + case sth_op: + run->mmio.len = 2; + *(unsigned short *)data = rd_val; + break; + case stw_op: + run->mmio.len = 4; + *(unsigned int *)data = rd_val; + break; + case std_op: + run->mmio.len = 8; + *(unsigned long *)data = rd_val; + break; + default: + ret = EMULATE_FAIL; + break; + } + break; + case 0x38: /* stx.b/h/w/d process */ + rd = inst.reg3_format.rd; + opcode = inst.reg3_format.opcode; + + switch (opcode) { + case stxb_op: + run->mmio.len = 1; + *(unsigned char *)data = vcpu->arch.gprs[rd]; + break; + case stxh_op: + run->mmio.len = 2; + *(unsigned short *)data = vcpu->arch.gprs[rd]; + break; + case stxw_op: + run->mmio.len = 4; + *(unsigned int *)data = vcpu->arch.gprs[rd]; + break; + case stxd_op: + run->mmio.len = 8; + *(unsigned long *)data = vcpu->arch.gprs[rd]; + break; + default: + ret = EMULATE_FAIL; + break; + } + break; + default: + ret = EMULATE_FAIL; + } + + if (ret == EMULATE_DO_MMIO) { + run->mmio.is_write = 1; + vcpu->mmio_needed = 1; + vcpu->mmio_is_write = 1; + } else { + vcpu->arch.pc = curr_pc; + kvm_err("Write not supported Inst=0x%08x @%lx BadVaddr:%#lx\n", + inst.word, vcpu->arch.pc, vcpu->arch.badv); + kvm_arch_vcpu_dump_regs(vcpu); + /* Rollback PC if emulation was unsuccessful */ + } + + return ret; +} + +static int kvm_handle_rdwr_fault(struct kvm_vcpu *vcpu, bool write) +{ + int ret; + larch_inst inst; + enum emulation_result er = EMULATE_DONE; + struct kvm_run *run = vcpu->run; + unsigned long badv = vcpu->arch.badv; + + ret = kvm_handle_mm_fault(vcpu, badv, write); + if (ret) { + /* Treat as MMIO */ + inst.word = vcpu->arch.badi; + if (write) { + er = kvm_emu_mmio_write(vcpu, inst); + } else { + /* A code fetch fault doesn't count as an MMIO */ + if (kvm_is_ifetch_fault(&vcpu->arch)) { + kvm_queue_exception(vcpu, EXCCODE_ADE, EXSUBCODE_ADEF); + return RESUME_GUEST; + } + + er = kvm_emu_mmio_read(vcpu, inst); + } + } + + if (er == EMULATE_DONE) { + ret = RESUME_GUEST; + } else if (er == EMULATE_DO_MMIO) { + run->exit_reason = KVM_EXIT_MMIO; + ret = RESUME_HOST; + } else { + kvm_queue_exception(vcpu, EXCCODE_ADE, EXSUBCODE_ADEM); + ret = RESUME_GUEST; + } + + return ret; +} + +static int kvm_handle_read_fault(struct kvm_vcpu *vcpu) +{ + return kvm_handle_rdwr_fault(vcpu, false); +} + +static int kvm_handle_write_fault(struct kvm_vcpu *vcpu) +{ + return kvm_handle_rdwr_fault(vcpu, true); +} + +/** + * kvm_handle_fpu_disabled() - Guest used the FPU while it is disabled at host + * @vcpu: Virtual CPU context. + * + * Handle the case when the guest attempts to use the FPU before it has been + * allowed by the root context. + */ +static int kvm_handle_fpu_disabled(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + + /* + * If guest FPU not present, the FPU operation should have been + * treated as a reserved instruction! + * If FPU already in use, we shouldn't get this at all. 
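 + * Reaching this handler with KVM_LARCH_FPU already set in aux_inuse is + * therefore an internal error, hence the WARN_ON() below.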
+ */ + if (WARN_ON(vcpu->arch.aux_inuse & KVM_LARCH_FPU)) { + kvm_err("%s internal error\n", __func__); + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return RESUME_HOST; + } + + kvm_own_fpu(vcpu); + + return RESUME_GUEST; +} + +/* + * LoongArch KVM callback handling for unimplemented guest exits + */ +static int kvm_fault_ni(struct kvm_vcpu *vcpu) +{ + unsigned int ecode, inst; + unsigned long estat, badv; + + /* Fetch the instruction */ + inst = vcpu->arch.badi; + badv = vcpu->arch.badv; + estat = vcpu->arch.host_estat; + ecode = (estat & CSR_ESTAT_EXC) >> CSR_ESTAT_EXC_SHIFT; + kvm_err("ECode: %d PC=%#lx Inst=0x%08x BadVaddr=%#lx ESTAT=%#lx\n", + ecode, vcpu->arch.pc, inst, badv, read_gcsr_estat()); + kvm_arch_vcpu_dump_regs(vcpu); + kvm_queue_exception(vcpu, EXCCODE_INE, 0); + + return RESUME_GUEST; +} + +static exit_handle_fn kvm_fault_tables[EXCCODE_INT_START] = { + [0 ... EXCCODE_INT_START - 1] = kvm_fault_ni, + [EXCCODE_TLBI] = kvm_handle_read_fault, + [EXCCODE_TLBL] = kvm_handle_read_fault, + [EXCCODE_TLBS] = kvm_handle_write_fault, + [EXCCODE_TLBM] = kvm_handle_write_fault, + [EXCCODE_FPDIS] = kvm_handle_fpu_disabled, + [EXCCODE_GSPR] = kvm_handle_gspr, +}; + +int kvm_handle_fault(struct kvm_vcpu *vcpu, int fault) +{ + return kvm_fault_tables[fault](vcpu); +} diff --git a/arch/loongarch/kvm/interrupt.c b/arch/loongarch/kvm/interrupt.c new file mode 100644 index 000000000000..4c3f22de4b40 --- /dev/null +++ b/arch/loongarch/kvm/interrupt.c @@ -0,0 +1,183 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +#include <linux/err.h> +#include <linux/errno.h> +#include <asm/kvm_csr.h> +#include <asm/kvm_vcpu.h> + +static unsigned int priority_to_irq[EXCCODE_INT_NUM] = { + [INT_TI] = CPU_TIMER, + [INT_IPI] = CPU_IPI, + [INT_SWI0] = CPU_SIP0, + [INT_SWI1] = CPU_SIP1, + [INT_HWI0] = CPU_IP0, + [INT_HWI1] = CPU_IP1, + [INT_HWI2] = CPU_IP2, + [INT_HWI3] = CPU_IP3, + [INT_HWI4] = CPU_IP4, + [INT_HWI5] = CPU_IP5, + [INT_HWI6] = CPU_IP6, + [INT_HWI7] = CPU_IP7, +}; + +static int kvm_irq_deliver(struct kvm_vcpu *vcpu, unsigned int priority) +{ + unsigned int irq = 0; + + clear_bit(priority, &vcpu->arch.irq_pending); + if (priority < EXCCODE_INT_NUM) + irq = priority_to_irq[priority]; + + switch (priority) { + case INT_TI: + case INT_IPI: + case INT_SWI0: + case INT_SWI1: + set_gcsr_estat(irq); + break; + + case INT_HWI0 ... INT_HWI7: + set_csr_gintc(irq); + break; + + default: + break; + } + + return 1; +} + +static int kvm_irq_clear(struct kvm_vcpu *vcpu, unsigned int priority) +{ + unsigned int irq = 0; + + clear_bit(priority, &vcpu->arch.irq_clear); + if (priority < EXCCODE_INT_NUM) + irq = priority_to_irq[priority]; + + switch (priority) { + case INT_TI: + case INT_IPI: + case INT_SWI0: + case INT_SWI1: + clear_gcsr_estat(irq); + break; + + case INT_HWI0 ... 
INT_HWI7: + clear_csr_gintc(irq); + break; + + default: + break; + } + + return 1; +} + +void kvm_deliver_intr(struct kvm_vcpu *vcpu) +{ + unsigned int priority; + unsigned long *pending = &vcpu->arch.irq_pending; + unsigned long *pending_clr = &vcpu->arch.irq_clear; + + if (!(*pending) && !(*pending_clr)) + return; + + if (*pending_clr) { + priority = __ffs(*pending_clr); + while (priority <= INT_IPI) { + kvm_irq_clear(vcpu, priority); + priority = find_next_bit(pending_clr, + BITS_PER_BYTE * sizeof(*pending_clr), + priority + 1); + } + } + + if (*pending) { + priority = __ffs(*pending); + while (priority <= INT_IPI) { + kvm_irq_deliver(vcpu, priority); + priority = find_next_bit(pending, + BITS_PER_BYTE * sizeof(*pending), + priority + 1); + } + } +} + +int kvm_pending_timer(struct kvm_vcpu *vcpu) +{ + return test_bit(INT_TI, &vcpu->arch.irq_pending); +} + +/* + * Only the illegal instruction and illegal Address Error exceptions are + * supported here; other exceptions are injected by hardware in kvm mode + */ +static void _kvm_deliver_exception(struct kvm_vcpu *vcpu, + unsigned int code, unsigned int subcode) +{ + unsigned long val, vec_size; + + /* + * BADV is added for EXCCODE_ADE exception + * Use PC register (GVA address) if it is an instruction exception, + * else use BADV from host side (GPA address) for a data exception + */ + if (code == EXCCODE_ADE) { + if (subcode == EXSUBCODE_ADEF) + val = vcpu->arch.pc; + else + val = vcpu->arch.badv; + kvm_write_hw_gcsr(LOONGARCH_CSR_BADV, val); + } + + /* Set exception instruction */ + kvm_write_hw_gcsr(LOONGARCH_CSR_BADI, vcpu->arch.badi); + + /* + * Save CRMD in PRMD + * Set IRQ disabled and PLV0 with CRMD + */ + val = kvm_read_hw_gcsr(LOONGARCH_CSR_CRMD); + kvm_write_hw_gcsr(LOONGARCH_CSR_PRMD, val); + val = val & ~(CSR_CRMD_PLV | CSR_CRMD_IE); + kvm_write_hw_gcsr(LOONGARCH_CSR_CRMD, val); + + /* Set exception PC address */ + kvm_write_hw_gcsr(LOONGARCH_CSR_ERA, vcpu->arch.pc); + + /* + * Set exception code + * Exception and interrupt can be injected at the same time + * Hardware will handle the exception first and then the external interrupt + * Exception code is Ecode in ESTAT[16:21] + * Interrupt code in ESTAT[0:12] + */ + val = kvm_read_hw_gcsr(LOONGARCH_CSR_ESTAT); + val = (val & ~CSR_ESTAT_EXC) | code; + kvm_write_hw_gcsr(LOONGARCH_CSR_ESTAT, val); + + /* Calculate exception entry address */ + val = kvm_read_hw_gcsr(LOONGARCH_CSR_ECFG); + vec_size = (val & CSR_ECFG_VS) >> CSR_ECFG_VS_SHIFT; + if (vec_size) + vec_size = (1 << vec_size) * 4; + val = kvm_read_hw_gcsr(LOONGARCH_CSR_EENTRY); + vcpu->arch.pc = val + code * vec_size; +} + +void kvm_deliver_exception(struct kvm_vcpu *vcpu) +{ + unsigned int code; + unsigned long *pending = &vcpu->arch.exception_pending; + + if (*pending) { + code = __ffs(*pending); + _kvm_deliver_exception(vcpu, code, vcpu->arch.esubcode); + *pending = 0; + vcpu->arch.esubcode = 0; + } +} diff --git a/arch/loongarch/kvm/main.c b/arch/loongarch/kvm/main.c new file mode 100644 index 000000000000..1c1d5199500e --- /dev/null +++ b/arch/loongarch/kvm/main.c @@ -0,0 +1,420 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +#include <linux/err.h> +#include <linux/module.h> +#include <linux/kvm_host.h> +#include <asm/cacheflush.h> +#include <asm/cpufeature.h> +#include <asm/kvm_csr.h> +#include "trace.h" + +unsigned long vpid_mask; +struct kvm_world_switch *kvm_loongarch_ops; +static int gcsr_flag[CSR_MAX_NUMS]; +static struct kvm_context __percpu *vmcs; + +int 
get_gcsr_flag(int csr) +{ + if (csr < CSR_MAX_NUMS) + return gcsr_flag[csr]; + + return INVALID_GCSR; +} + +static inline void set_gcsr_sw_flag(int csr) +{ + if (csr < CSR_MAX_NUMS) + gcsr_flag[csr] |= SW_GCSR; +} + +static inline void set_gcsr_hw_flag(int csr) +{ + if (csr < CSR_MAX_NUMS) + gcsr_flag[csr] |= HW_GCSR; +} + +/* + * The default value of gcsr_flag[CSR] is 0, and we use this + * function to set the flag to 1 (SW_GCSR) or 2 (HW_GCSR) if the + * gcsr is software or hardware. It will be used by get/set_gcsr, + * if gcsr_flag is HW we should use gcsrrd/gcsrwr to access it, + * else use software csr to emulate it. + */ +static void kvm_init_gcsr_flag(void) +{ + set_gcsr_hw_flag(LOONGARCH_CSR_CRMD); + set_gcsr_hw_flag(LOONGARCH_CSR_PRMD); + set_gcsr_hw_flag(LOONGARCH_CSR_EUEN); + set_gcsr_hw_flag(LOONGARCH_CSR_MISC); + set_gcsr_hw_flag(LOONGARCH_CSR_ECFG); + set_gcsr_hw_flag(LOONGARCH_CSR_ESTAT); + set_gcsr_hw_flag(LOONGARCH_CSR_ERA); + set_gcsr_hw_flag(LOONGARCH_CSR_BADV); + set_gcsr_hw_flag(LOONGARCH_CSR_BADI); + set_gcsr_hw_flag(LOONGARCH_CSR_EENTRY); + set_gcsr_hw_flag(LOONGARCH_CSR_TLBIDX); + set_gcsr_hw_flag(LOONGARCH_CSR_TLBEHI); + set_gcsr_hw_flag(LOONGARCH_CSR_TLBELO0); + set_gcsr_hw_flag(LOONGARCH_CSR_TLBELO1); + set_gcsr_hw_flag(LOONGARCH_CSR_ASID); + set_gcsr_hw_flag(LOONGARCH_CSR_PGDL); + set_gcsr_hw_flag(LOONGARCH_CSR_PGDH); + set_gcsr_hw_flag(LOONGARCH_CSR_PGD); + set_gcsr_hw_flag(LOONGARCH_CSR_PWCTL0); + set_gcsr_hw_flag(LOONGARCH_CSR_PWCTL1); + set_gcsr_hw_flag(LOONGARCH_CSR_STLBPGSIZE); + set_gcsr_hw_flag(LOONGARCH_CSR_RVACFG); + set_gcsr_hw_flag(LOONGARCH_CSR_CPUID); + set_gcsr_hw_flag(LOONGARCH_CSR_PRCFG1); + set_gcsr_hw_flag(LOONGARCH_CSR_PRCFG2); + set_gcsr_hw_flag(LOONGARCH_CSR_PRCFG3); + set_gcsr_hw_flag(LOONGARCH_CSR_KS0); + set_gcsr_hw_flag(LOONGARCH_CSR_KS1); + set_gcsr_hw_flag(LOONGARCH_CSR_KS2); + set_gcsr_hw_flag(LOONGARCH_CSR_KS3); + set_gcsr_hw_flag(LOONGARCH_CSR_KS4); + set_gcsr_hw_flag(LOONGARCH_CSR_KS5); + set_gcsr_hw_flag(LOONGARCH_CSR_KS6); + set_gcsr_hw_flag(LOONGARCH_CSR_KS7); + set_gcsr_hw_flag(LOONGARCH_CSR_TMID); + set_gcsr_hw_flag(LOONGARCH_CSR_TCFG); + set_gcsr_hw_flag(LOONGARCH_CSR_TVAL); + set_gcsr_hw_flag(LOONGARCH_CSR_TINTCLR); + set_gcsr_hw_flag(LOONGARCH_CSR_CNTC); + set_gcsr_hw_flag(LOONGARCH_CSR_LLBCTL); + set_gcsr_hw_flag(LOONGARCH_CSR_TLBRENTRY); + set_gcsr_hw_flag(LOONGARCH_CSR_TLBRBADV); + set_gcsr_hw_flag(LOONGARCH_CSR_TLBRERA); + set_gcsr_hw_flag(LOONGARCH_CSR_TLBRSAVE); + set_gcsr_hw_flag(LOONGARCH_CSR_TLBRELO0); + set_gcsr_hw_flag(LOONGARCH_CSR_TLBRELO1); + set_gcsr_hw_flag(LOONGARCH_CSR_TLBREHI); + set_gcsr_hw_flag(LOONGARCH_CSR_TLBRPRMD); + set_gcsr_hw_flag(LOONGARCH_CSR_DMWIN0); + set_gcsr_hw_flag(LOONGARCH_CSR_DMWIN1); + set_gcsr_hw_flag(LOONGARCH_CSR_DMWIN2); + set_gcsr_hw_flag(LOONGARCH_CSR_DMWIN3); + + set_gcsr_sw_flag(LOONGARCH_CSR_IMPCTL1); + set_gcsr_sw_flag(LOONGARCH_CSR_IMPCTL2); + set_gcsr_sw_flag(LOONGARCH_CSR_MERRCTL); + set_gcsr_sw_flag(LOONGARCH_CSR_MERRINFO1); + set_gcsr_sw_flag(LOONGARCH_CSR_MERRINFO2); + set_gcsr_sw_flag(LOONGARCH_CSR_MERRENTRY); + set_gcsr_sw_flag(LOONGARCH_CSR_MERRERA); + set_gcsr_sw_flag(LOONGARCH_CSR_MERRSAVE); + set_gcsr_sw_flag(LOONGARCH_CSR_CTAG); + set_gcsr_sw_flag(LOONGARCH_CSR_DEBUG); + set_gcsr_sw_flag(LOONGARCH_CSR_DERA); + set_gcsr_sw_flag(LOONGARCH_CSR_DESAVE); + + set_gcsr_sw_flag(LOONGARCH_CSR_FWPC); + set_gcsr_sw_flag(LOONGARCH_CSR_FWPS); + set_gcsr_sw_flag(LOONGARCH_CSR_MWPC); + set_gcsr_sw_flag(LOONGARCH_CSR_MWPS); + + set_gcsr_sw_flag(LOONGARCH_CSR_DB0ADDR); + 
set_gcsr_sw_flag(LOONGARCH_CSR_DB0MASK); + set_gcsr_sw_flag(LOONGARCH_CSR_DB0CTRL); + set_gcsr_sw_flag(LOONGARCH_CSR_DB0ASID); + set_gcsr_sw_flag(LOONGARCH_CSR_DB1ADDR); + set_gcsr_sw_flag(LOONGARCH_CSR_DB1MASK); + set_gcsr_sw_flag(LOONGARCH_CSR_DB1CTRL); + set_gcsr_sw_flag(LOONGARCH_CSR_DB1ASID); + set_gcsr_sw_flag(LOONGARCH_CSR_DB2ADDR); + set_gcsr_sw_flag(LOONGARCH_CSR_DB2MASK); + set_gcsr_sw_flag(LOONGARCH_CSR_DB2CTRL); + set_gcsr_sw_flag(LOONGARCH_CSR_DB2ASID); + set_gcsr_sw_flag(LOONGARCH_CSR_DB3ADDR); + set_gcsr_sw_flag(LOONGARCH_CSR_DB3MASK); + set_gcsr_sw_flag(LOONGARCH_CSR_DB3CTRL); + set_gcsr_sw_flag(LOONGARCH_CSR_DB3ASID); + set_gcsr_sw_flag(LOONGARCH_CSR_DB4ADDR); + set_gcsr_sw_flag(LOONGARCH_CSR_DB4MASK); + set_gcsr_sw_flag(LOONGARCH_CSR_DB4CTRL); + set_gcsr_sw_flag(LOONGARCH_CSR_DB4ASID); + set_gcsr_sw_flag(LOONGARCH_CSR_DB5ADDR); + set_gcsr_sw_flag(LOONGARCH_CSR_DB5MASK); + set_gcsr_sw_flag(LOONGARCH_CSR_DB5CTRL); + set_gcsr_sw_flag(LOONGARCH_CSR_DB5ASID); + set_gcsr_sw_flag(LOONGARCH_CSR_DB6ADDR); + set_gcsr_sw_flag(LOONGARCH_CSR_DB6MASK); + set_gcsr_sw_flag(LOONGARCH_CSR_DB6CTRL); + set_gcsr_sw_flag(LOONGARCH_CSR_DB6ASID); + set_gcsr_sw_flag(LOONGARCH_CSR_DB7ADDR); + set_gcsr_sw_flag(LOONGARCH_CSR_DB7MASK); + set_gcsr_sw_flag(LOONGARCH_CSR_DB7CTRL); + set_gcsr_sw_flag(LOONGARCH_CSR_DB7ASID); + + set_gcsr_sw_flag(LOONGARCH_CSR_IB0ADDR); + set_gcsr_sw_flag(LOONGARCH_CSR_IB0MASK); + set_gcsr_sw_flag(LOONGARCH_CSR_IB0CTRL); + set_gcsr_sw_flag(LOONGARCH_CSR_IB0ASID); + set_gcsr_sw_flag(LOONGARCH_CSR_IB1ADDR); + set_gcsr_sw_flag(LOONGARCH_CSR_IB1MASK); + set_gcsr_sw_flag(LOONGARCH_CSR_IB1CTRL); + set_gcsr_sw_flag(LOONGARCH_CSR_IB1ASID); + set_gcsr_sw_flag(LOONGARCH_CSR_IB2ADDR); + set_gcsr_sw_flag(LOONGARCH_CSR_IB2MASK); + set_gcsr_sw_flag(LOONGARCH_CSR_IB2CTRL); + set_gcsr_sw_flag(LOONGARCH_CSR_IB2ASID); + set_gcsr_sw_flag(LOONGARCH_CSR_IB3ADDR); + set_gcsr_sw_flag(LOONGARCH_CSR_IB3MASK); + set_gcsr_sw_flag(LOONGARCH_CSR_IB3CTRL); + set_gcsr_sw_flag(LOONGARCH_CSR_IB3ASID); + set_gcsr_sw_flag(LOONGARCH_CSR_IB4ADDR); + set_gcsr_sw_flag(LOONGARCH_CSR_IB4MASK); + set_gcsr_sw_flag(LOONGARCH_CSR_IB4CTRL); + set_gcsr_sw_flag(LOONGARCH_CSR_IB4ASID); + set_gcsr_sw_flag(LOONGARCH_CSR_IB5ADDR); + set_gcsr_sw_flag(LOONGARCH_CSR_IB5MASK); + set_gcsr_sw_flag(LOONGARCH_CSR_IB5CTRL); + set_gcsr_sw_flag(LOONGARCH_CSR_IB5ASID); + set_gcsr_sw_flag(LOONGARCH_CSR_IB6ADDR); + set_gcsr_sw_flag(LOONGARCH_CSR_IB6MASK); + set_gcsr_sw_flag(LOONGARCH_CSR_IB6CTRL); + set_gcsr_sw_flag(LOONGARCH_CSR_IB6ASID); + set_gcsr_sw_flag(LOONGARCH_CSR_IB7ADDR); + set_gcsr_sw_flag(LOONGARCH_CSR_IB7MASK); + set_gcsr_sw_flag(LOONGARCH_CSR_IB7CTRL); + set_gcsr_sw_flag(LOONGARCH_CSR_IB7ASID); + + set_gcsr_sw_flag(LOONGARCH_CSR_PERFCTRL0); + set_gcsr_sw_flag(LOONGARCH_CSR_PERFCNTR0); + set_gcsr_sw_flag(LOONGARCH_CSR_PERFCTRL1); + set_gcsr_sw_flag(LOONGARCH_CSR_PERFCNTR1); + set_gcsr_sw_flag(LOONGARCH_CSR_PERFCTRL2); + set_gcsr_sw_flag(LOONGARCH_CSR_PERFCNTR2); + set_gcsr_sw_flag(LOONGARCH_CSR_PERFCTRL3); + set_gcsr_sw_flag(LOONGARCH_CSR_PERFCNTR3); +} + +static void kvm_update_vpid(struct kvm_vcpu *vcpu, int cpu) +{ + unsigned long vpid; + struct kvm_context *context; + + context = per_cpu_ptr(vcpu->kvm->arch.vmcs, cpu); + vpid = context->vpid_cache + 1; + if (!(vpid & vpid_mask)) { + /* finish round of vpid loop */ + if (unlikely(!vpid)) + vpid = vpid_mask + 1; + + ++vpid; /* vpid 0 reserved for root */ + + /* start new vpid cycle */ + kvm_flush_tlb_all(); + } + + context->vpid_cache = vpid; + vcpu->arch.vpid = vpid; +} + 
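+/* + * Check that the vCPU's VPID is still valid on this CPU before entering + * guest context, allocating a fresh one if the cached VPID is stale. + */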
+void kvm_check_vpid(struct kvm_vcpu *vcpu) +{ + int cpu; + bool migrated; + unsigned long ver, old, vpid; + struct kvm_context *context; + + cpu = smp_processor_id(); + /* + * Are we entering guest context on a different CPU to last time? + * If so, the vCPU's guest TLB state on this CPU may be stale. + */ + context = per_cpu_ptr(vcpu->kvm->arch.vmcs, cpu); + migrated = (vcpu->cpu != cpu); + + /* + * Check if our vpid is of an older version + * + * We also discard the stored vpid if we've executed on + * another CPU, as the guest mappings may have changed without + * hypervisor knowledge. + */ + ver = vcpu->arch.vpid & ~vpid_mask; + old = context->vpid_cache & ~vpid_mask; + if (migrated || (ver != old)) { + kvm_update_vpid(vcpu, cpu); + trace_kvm_vpid_change(vcpu, vcpu->arch.vpid); + vcpu->cpu = cpu; + } + + /* Restore GSTAT(0x50).vpid */ + vpid = (vcpu->arch.vpid & vpid_mask) << CSR_GSTAT_GID_SHIFT; + change_csr_gstat(vpid_mask << CSR_GSTAT_GID_SHIFT, vpid); +} + +void kvm_init_vmcs(struct kvm *kvm) +{ + kvm->arch.vmcs = vmcs; +} + +long kvm_arch_dev_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + return -ENOIOCTLCMD; +} + +int kvm_arch_hardware_enable(void) +{ + unsigned long env, gcfg = 0; + + env = read_csr_gcfg(); + + /* First init gcfg, gstat, gintc, gtlbc. All guests use the same config */ + write_csr_gcfg(0); + write_csr_gstat(0); + write_csr_gintc(0); + clear_csr_gtlbc(CSR_GTLBC_USETGID | CSR_GTLBC_TOTI); + + /* + * Enable virtualization features granting guest direct control of + * certain features: + * GCI=2: Trap on init or unimplemented cache instructions. + * TORU=0: Trap on Root Unimplemented. + * CACTRL=1: Root control cache. + * TOP=0: Trap on Privilege. + * TOE=0: Trap on Exception. + * TIT=0: Trap on Timer. + */ + if (env & CSR_GCFG_GCIP_ALL) + gcfg |= CSR_GCFG_GCI_SECURE; + if (env & CSR_GCFG_MATC_ROOT) + gcfg |= CSR_GCFG_MATC_ROOT; + + gcfg |= CSR_GCFG_TIT; + write_csr_gcfg(gcfg); + + kvm_flush_tlb_all(); + + /* Enable using TGID */ + set_csr_gtlbc(CSR_GTLBC_USETGID); + kvm_debug("GCFG:%lx GSTAT:%lx GINTC:%lx GTLBC:%lx", + read_csr_gcfg(), read_csr_gstat(), read_csr_gintc(), read_csr_gtlbc()); + + return 0; +} + +void kvm_arch_hardware_disable(void) +{ + write_csr_gcfg(0); + write_csr_gstat(0); + write_csr_gintc(0); + clear_csr_gtlbc(CSR_GTLBC_USETGID | CSR_GTLBC_TOTI); + + /* Flush any remaining guest TLB entries */ + kvm_flush_tlb_all(); +} + +static int kvm_loongarch_env_init(void) +{ + int cpu, order; + void *addr; + struct kvm_context *context; + + vmcs = alloc_percpu(struct kvm_context); + if (!vmcs) { + pr_err("kvm: failed to allocate percpu kvm_context\n"); + return -ENOMEM; + } + + kvm_loongarch_ops = kzalloc(sizeof(*kvm_loongarch_ops), GFP_KERNEL); + if (!kvm_loongarch_ops) { + free_percpu(vmcs); + vmcs = NULL; + return -ENOMEM; + } + + /* + * PGD register is shared between root kernel and kvm hypervisor. + * So world switch entry should be in DMW area rather than TLB area + * to avoid page fault re-entry. + * + * In the future, if hardware page table walking is supported, we won't + * need to copy world switch code to DMW area. 
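 + * + * The pages allocated below with __get_free_pages() come from the + * kernel's direct mapping, which is covered by DMW.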
+ */ + order = get_order(kvm_exception_size + kvm_enter_guest_size); + addr = (void *)__get_free_pages(GFP_KERNEL, order); + if (!addr) { + free_percpu(vmcs); + vmcs = NULL; + kfree(kvm_loongarch_ops); + kvm_loongarch_ops = NULL; + return -ENOMEM; + } + + memcpy(addr, kvm_exc_entry, kvm_exception_size); + memcpy(addr + kvm_exception_size, kvm_enter_guest, kvm_enter_guest_size); + flush_icache_range((unsigned long)addr, (unsigned long)addr + kvm_exception_size + kvm_enter_guest_size); + kvm_loongarch_ops->exc_entry = addr; + kvm_loongarch_ops->enter_guest = addr + kvm_exception_size; + kvm_loongarch_ops->page_order = order; + + vpid_mask = read_csr_gstat(); + vpid_mask = (vpid_mask & CSR_GSTAT_GIDBIT) >> CSR_GSTAT_GIDBIT_SHIFT; + if (vpid_mask) + vpid_mask = GENMASK(vpid_mask - 1, 0); + + for_each_possible_cpu(cpu) { + context = per_cpu_ptr(vmcs, cpu); + context->vpid_cache = vpid_mask + 1; + context->last_vcpu = NULL; + } + + kvm_init_gcsr_flag(); + + return 0; +} + +static void kvm_loongarch_env_exit(void) +{ + unsigned long addr; + + if (vmcs) + free_percpu(vmcs); + + if (kvm_loongarch_ops) { + if (kvm_loongarch_ops->exc_entry) { + addr = (unsigned long)kvm_loongarch_ops->exc_entry; + free_pages(addr, kvm_loongarch_ops->page_order); + } + kfree(kvm_loongarch_ops); + } +} + +static int kvm_loongarch_init(void) +{ + int r; + + if (!cpu_has_lvz) { + kvm_info("Hardware virtualization not available\n"); + return -ENODEV; + } + r = kvm_loongarch_env_init(); + if (r) + return r; + + return kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); +} + +static void kvm_loongarch_exit(void) +{ + kvm_exit(); + kvm_loongarch_env_exit(); +} + +module_init(kvm_loongarch_init); +module_exit(kvm_loongarch_exit); + +#ifdef MODULE +static const struct cpu_feature kvm_feature[] = { + { .feature = cpu_feature(LOONGARCH_LVZ) }, + {}, +}; +MODULE_DEVICE_TABLE(cpu, kvm_feature); +#endif diff --git a/arch/loongarch/kvm/mmu.c b/arch/loongarch/kvm/mmu.c new file mode 100644 index 000000000000..80480df5f550 --- /dev/null +++ b/arch/loongarch/kvm/mmu.c @@ -0,0 +1,914 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +#include <linux/highmem.h> +#include <linux/hugetlb.h> +#include <linux/kvm_host.h> +#include <linux/page-flags.h> +#include <linux/uaccess.h> +#include <asm/mmu_context.h> +#include <asm/pgalloc.h> +#include <asm/tlb.h> +#include <asm/kvm_mmu.h> + +static inline void kvm_ptw_prepare(struct kvm *kvm, kvm_ptw_ctx *ctx) +{ + ctx->level = kvm->arch.root_level; + /* pte table */ + ctx->invalid_ptes = kvm->arch.invalid_ptes; + ctx->pte_shifts = kvm->arch.pte_shifts; + ctx->pgtable_shift = ctx->pte_shifts[ctx->level]; + ctx->invalid_entry = ctx->invalid_ptes[ctx->level]; + ctx->opaque = kvm; +} + +/* + * Mark a range of guest physical address space old (all accesses fault) in the + * VM's GPA page table to allow detection of commonly used pages. + */ +static int kvm_mkold_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx) +{ + if (kvm_pte_young(*pte)) { + *pte = kvm_pte_mkold(*pte); + return 1; + } + + return 0; +} + +/* + * Mark a range of guest physical address space clean (writes fault) in the VM's + * GPA page table to allow dirty page tracking. 
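In kvm_loongarch_env_init() above, vpid_mask is derived from the GID field width advertised in GSTAT; a width of 8, for example, becomes 0xff. A standalone check with GENMASK() expanded by hand, assuming a 64-bit unsigned long as on LoongArch64:

#include <stdio.h>

/* GENMASK(h, l) from <linux/bits.h>, expanded for a standalone demo */
#define DEMO_GENMASK(h, l) \
	(((~0UL) >> (64 - 1 - (h))) & (~0UL << (l)))

int main(void)
{
	/* suppose GSTAT.GIDBIT reports an 8-bit guest ID field */
	unsigned long gidbit = 8;
	unsigned long vpid_mask = gidbit ? DEMO_GENMASK(gidbit - 1, 0) : 0;

	printf("vpid_mask = %#lx\n", vpid_mask);	/* prints 0xff */
	return 0;
}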
+ */
+static int kvm_mkclean_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx)
+{
+	gfn_t offset;
+	kvm_pte_t val;
+
+	val = *pte;
+	/*
+	 * For kvm_arch_mmu_enable_log_dirty_pt_masked() with a mask, start
+	 * and end may cross a huge page: for the first huge page @addr equals
+	 * @start, while for each following huge page @addr is the base
+	 * address of that huge page rather than start or end.
+	 */
+	if ((ctx->flag & _KVM_HAS_PGMASK) && !kvm_pte_huge(val)) {
+		offset = (addr >> PAGE_SHIFT) - ctx->gfn;
+		if (!(BIT(offset) & ctx->mask))
+			return 0;
+	}
+
+	/*
+	 * No need to split the huge page now, just set the write-protect
+	 * pte bit; the huge page is split on the next write fault.
+	 */
+	if (kvm_pte_dirty(val)) {
+		*pte = kvm_pte_mkclean(val);
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Clear pte entry
+ */
+static int kvm_flush_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx)
+{
+	struct kvm *kvm;
+
+	kvm = ctx->opaque;
+	if (ctx->level)
+		kvm->stat.hugepages--;
+	else
+		kvm->stat.pages--;
+
+	*pte = ctx->invalid_entry;
+
+	return 1;
+}
+
+/*
+ * kvm_pgd_alloc() - Allocate and initialise a KVM GPA page directory.
+ *
+ * Allocate a blank KVM GPA page directory (PGD) for representing guest physical
+ * to host physical page mappings.
+ *
+ * Returns:	Pointer to new KVM GPA page directory.
+ *		NULL on allocation failure.
+ */
+kvm_pte_t *kvm_pgd_alloc(void)
+{
+	kvm_pte_t *pgd;
+
+	pgd = (kvm_pte_t *)__get_free_pages(GFP_KERNEL, 0);
+	if (pgd)
+		pgd_init((void *)pgd);
+
+	return pgd;
+}
+
+static void _kvm_pte_init(void *addr, unsigned long val)
+{
+	unsigned long *p, *end;
+
+	p = (unsigned long *)addr;
+	end = p + PTRS_PER_PTE;
+	do {
+		p[0] = val;
+		p[1] = val;
+		p[2] = val;
+		p[3] = val;
+		p[4] = val;
+		p += 8;
+		p[-3] = val;
+		p[-2] = val;
+		p[-1] = val;
+	} while (p != end);
+}
+
+/*
+ * Caller must hold kvm->mm_lock
+ *
+ * Walk the page tables of kvm to find the PTE corresponding to the
+ * address @addr. If page tables don't exist for @addr, they will be created
+ * from the MMU cache if @cache is not NULL.
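Together with kvm_arch_mmu_enable_log_dirty_pt_masked() further below, the mask handling amounts to: bound the walk by the first and last set bits, then write-protect only the gfns whose bit is set. A standalone sketch, with GCC builtins standing in for the kernel's __ffs()/__fls():

#include <stdbool.h>
#include <stdint.h>

/* Walk bounds: first and one-past-last dirty page of a 64-page chunk */
static void clean_range(uint64_t base_gfn, uint64_t mask,
			uint64_t *start, uint64_t *end)
{
	*start = base_gfn + __builtin_ctzll(mask);          /* ~ __ffs(mask) */
	*end   = base_gfn + 63 - __builtin_clzll(mask) + 1; /* ~ __fls(mask) + 1 */
}

/* Per-page gate applied by kvm_mkclean_pte() for small pages */
static bool gfn_selected(uint64_t gfn, uint64_t base_gfn, uint64_t mask)
{
	return mask & (UINT64_C(1) << (gfn - base_gfn));
}

/*
 * E.g. mask 0x38 bounds the walk to gfns base+3 .. base+5, and exactly
 * those three gfns pass gfn_selected().
 */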
+ */ +static kvm_pte_t *kvm_populate_gpa(struct kvm *kvm, + struct kvm_mmu_memory_cache *cache, + unsigned long addr, int level) +{ + kvm_ptw_ctx ctx; + kvm_pte_t *entry, *child; + + kvm_ptw_prepare(kvm, &ctx); + child = kvm->arch.pgd; + while (ctx.level > level) { + entry = kvm_pgtable_offset(&ctx, child, addr); + if (kvm_pte_none(&ctx, entry)) { + if (!cache) + return NULL; + + child = kvm_mmu_memory_cache_alloc(cache); + _kvm_pte_init(child, ctx.invalid_ptes[ctx.level - 1]); + kvm_set_pte(entry, __pa(child)); + } else if (kvm_pte_huge(*entry)) { + return entry; + } else + child = (kvm_pte_t *)__va(PHYSADDR(*entry)); + kvm_ptw_enter(&ctx); + } + + entry = kvm_pgtable_offset(&ctx, child, addr); + + return entry; +} + +/* + * Page walker for VM shadow mmu at last level + * The last level is small pte page or huge pmd page + */ +static int kvm_ptw_leaf(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx) +{ + int ret; + phys_addr_t next, start, size; + struct list_head *list; + kvm_pte_t *entry, *child; + + ret = 0; + start = addr; + child = (kvm_pte_t *)__va(PHYSADDR(*dir)); + entry = kvm_pgtable_offset(ctx, child, addr); + do { + next = addr + (0x1UL << ctx->pgtable_shift); + if (!kvm_pte_present(ctx, entry)) + continue; + + ret |= ctx->ops(entry, addr, ctx); + } while (entry++, addr = next, addr < end); + + if (kvm_need_flush(ctx)) { + size = 0x1UL << (ctx->pgtable_shift + PAGE_SHIFT - 3); + if (start + size == end) { + list = (struct list_head *)child; + list_add_tail(list, &ctx->list); + *dir = ctx->invalid_ptes[ctx->level + 1]; + } + } + + return ret; +} + +/* + * Page walker for VM shadow mmu at page table dir level + */ +static int kvm_ptw_dir(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx) +{ + int ret; + phys_addr_t next, start, size; + struct list_head *list; + kvm_pte_t *entry, *child; + + ret = 0; + start = addr; + child = (kvm_pte_t *)__va(PHYSADDR(*dir)); + entry = kvm_pgtable_offset(ctx, child, addr); + do { + next = kvm_pgtable_addr_end(ctx, addr, end); + if (!kvm_pte_present(ctx, entry)) + continue; + + if (kvm_pte_huge(*entry)) { + ret |= ctx->ops(entry, addr, ctx); + continue; + } + + kvm_ptw_enter(ctx); + if (ctx->level == 0) + ret |= kvm_ptw_leaf(entry, addr, next, ctx); + else + ret |= kvm_ptw_dir(entry, addr, next, ctx); + kvm_ptw_exit(ctx); + } while (entry++, addr = next, addr < end); + + if (kvm_need_flush(ctx)) { + size = 0x1UL << (ctx->pgtable_shift + PAGE_SHIFT - 3); + if (start + size == end) { + list = (struct list_head *)child; + list_add_tail(list, &ctx->list); + *dir = ctx->invalid_ptes[ctx->level + 1]; + } + } + + return ret; +} + +/* + * Page walker for VM shadow mmu at page root table + */ +static int kvm_ptw_top(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx) +{ + int ret; + phys_addr_t next; + kvm_pte_t *entry; + + ret = 0; + entry = kvm_pgtable_offset(ctx, dir, addr); + do { + next = kvm_pgtable_addr_end(ctx, addr, end); + if (!kvm_pte_present(ctx, entry)) + continue; + + kvm_ptw_enter(ctx); + ret |= kvm_ptw_dir(entry, addr, next, ctx); + kvm_ptw_exit(ctx); + } while (entry++, addr = next, addr < end); + + return ret; +} + +/* + * kvm_flush_range() - Flush a range of guest physical addresses. + * @kvm: KVM pointer. + * @start_gfn: Guest frame number of first page in GPA range to flush. + * @end_gfn: Guest frame number of last page in GPA range to flush. + * @lock: Whether to hold mmu_lock or not + * + * Flushes a range of GPA mappings from the GPA page tables. 
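The kvm_ptw_top()/kvm_ptw_dir()/kvm_ptw_leaf() trio above is one recursive descent specialized per level. Stripped to its core, the pattern looks like this standalone sketch, which assumes 512-entry tables and child pointers stored directly in non-leaf entries:

#include <stdint.h>

typedef uint64_t ent_t;
typedef int (*ent_op)(ent_t *ent, uint64_t addr);

/*
 * Visit present leaf entries covering [addr, end). shift[level] is the
 * number of address bits one entry at that level spans, mirroring
 * ctx->pte_shifts[] in the patch; 512 entries per table is an assumption.
 */
static int walk(ent_t *table, uint64_t addr, uint64_t end, int level,
		const int *shift, ent_op op)
{
	int ret = 0;

	while (addr < end) {
		uint64_t span = UINT64_C(1) << shift[level];
		uint64_t next = (addr + span) & ~(span - 1);
		ent_t *ent = &table[(addr >> shift[level]) & 511];

		if (next > end)
			next = end;
		if (*ent) {
			if (level == 0)
				ret |= op(ent, addr);	/* leaf callback */
			else			/* descend into child table */
				ret |= walk((ent_t *)(uintptr_t)*ent, addr,
					    next, level - 1, shift, op);
		}
		addr = next;
	}
	return ret;
}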
+ */ +static void kvm_flush_range(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn, int lock) +{ + int ret; + kvm_ptw_ctx ctx; + struct list_head *pos, *temp; + + ctx.ops = kvm_flush_pte; + ctx.flag = _KVM_FLUSH_PGTABLE; + kvm_ptw_prepare(kvm, &ctx); + INIT_LIST_HEAD(&ctx.list); + + if (lock) { + spin_lock(&kvm->mmu_lock); + ret = kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT, + end_gfn << PAGE_SHIFT, &ctx); + spin_unlock(&kvm->mmu_lock); + } else + ret = kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT, + end_gfn << PAGE_SHIFT, &ctx); + + /* Flush vpid for each vCPU individually */ + if (ret) + kvm_flush_remote_tlbs(kvm); + + /* + * free pte table page after mmu_lock + * the pte table page is linked together with ctx.list + */ + list_for_each_safe(pos, temp, &ctx.list) { + list_del(pos); + free_page((unsigned long)pos); + } +} + +/* + * kvm_mkclean_gpa_pt() - Make a range of guest physical addresses clean. + * @kvm: KVM pointer. + * @start_gfn: Guest frame number of first page in GPA range to flush. + * @end_gfn: Guest frame number of last page in GPA range to flush. + * + * Make a range of GPA mappings clean so that guest writes will fault and + * trigger dirty page logging. + * + * The caller must hold the @kvm->mmu_lock spinlock. + * + * Returns: Whether any GPA mappings were modified, which would require + * derived mappings (GVA page tables & TLB enties) to be + * invalidated. + */ +static int kvm_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn) +{ + kvm_ptw_ctx ctx; + + ctx.ops = kvm_mkclean_pte; + ctx.flag = 0; + kvm_ptw_prepare(kvm, &ctx); + return kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT, end_gfn << PAGE_SHIFT, &ctx); +} + +/* + * kvm_arch_mmu_enable_log_dirty_pt_masked() - write protect dirty pages + * @kvm: The KVM pointer + * @slot: The memory slot associated with mask + * @gfn_offset: The gfn offset in memory slot + * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory + * slot to be write protected + * + * Walks bits set in mask write protects the associated pte's. Caller must + * acquire @kvm->mmu_lock. + */ +void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) +{ + kvm_ptw_ctx ctx; + gfn_t base_gfn = slot->base_gfn + gfn_offset; + gfn_t start = base_gfn + __ffs(mask); + gfn_t end = base_gfn + __fls(mask) + 1; + + ctx.ops = kvm_mkclean_pte; + ctx.flag = _KVM_HAS_PGMASK; + ctx.mask = mask; + ctx.gfn = base_gfn; + kvm_ptw_prepare(kvm, &ctx); + + kvm_ptw_top(kvm->arch.pgd, start << PAGE_SHIFT, end << PAGE_SHIFT, &ctx); +} + +void kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_memory_slot *old, + const struct kvm_memory_slot *new, + enum kvm_mr_change change) +{ + int needs_flush; + + /* + * If dirty page logging is enabled, write protect all pages in the slot + * ready for dirty logging. + * + * There is no need to do this in any of the following cases: + * CREATE: No dirty mappings will already exist. 
+ * MOVE/DELETE: The old mappings will already have been cleaned up by + * kvm_arch_flush_shadow_memslot() + */ + if (change == KVM_MR_FLAGS_ONLY && + (!(old->flags & KVM_MEM_LOG_DIRTY_PAGES) && + new->flags & KVM_MEM_LOG_DIRTY_PAGES)) { + spin_lock(&kvm->mmu_lock); + /* Write protect GPA page table entries */ + needs_flush = kvm_mkclean_gpa_pt(kvm, new->base_gfn, + new->base_gfn + new->npages); + spin_unlock(&kvm->mmu_lock); + if (needs_flush) + kvm_flush_remote_tlbs(kvm); + } +} + +void kvm_arch_flush_shadow_all(struct kvm *kvm) +{ + kvm_flush_range(kvm, 0, kvm->arch.gpa_size >> PAGE_SHIFT, 0); +} + +void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + /* + * The slot has been made invalid (ready for moving or deletion), so we + * need to ensure that it can no longer be accessed by any guest vCPUs. + */ + kvm_flush_range(kvm, slot->base_gfn, slot->base_gfn + slot->npages, 1); +} + +bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) +{ + kvm_ptw_ctx ctx; + + ctx.flag = 0; + ctx.ops = kvm_flush_pte; + kvm_ptw_prepare(kvm, &ctx); + INIT_LIST_HEAD(&ctx.list); + + return kvm_ptw_top(kvm->arch.pgd, range->start << PAGE_SHIFT, + range->end << PAGE_SHIFT, &ctx); +} + +bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) +{ + unsigned long prot_bits; + kvm_pte_t *ptep; + kvm_pfn_t pfn = pte_pfn(range->arg.pte); + gpa_t gpa = range->start << PAGE_SHIFT; + + ptep = kvm_populate_gpa(kvm, NULL, gpa, 0); + if (!ptep) + return false; + + /* Replacing an absent or old page doesn't need flushes */ + if (!kvm_pte_present(NULL, ptep) || !kvm_pte_young(*ptep)) { + kvm_set_pte(ptep, 0); + return false; + } + + /* Fill new pte if write protected or page migrated */ + prot_bits = _PAGE_PRESENT | __READABLE; + prot_bits |= _CACHE_MASK & pte_val(range->arg.pte); + + /* + * Set _PAGE_WRITE or _PAGE_DIRTY iff old and new pte both support + * _PAGE_WRITE for map_page_fast if next page write fault + * _PAGE_DIRTY since gpa has already recorded as dirty page + */ + prot_bits |= __WRITEABLE & *ptep & pte_val(range->arg.pte); + kvm_set_pte(ptep, kvm_pfn_pte(pfn, __pgprot(prot_bits))); + + return true; +} + +bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) +{ + kvm_ptw_ctx ctx; + + ctx.flag = 0; + ctx.ops = kvm_mkold_pte; + kvm_ptw_prepare(kvm, &ctx); + + return kvm_ptw_top(kvm->arch.pgd, range->start << PAGE_SHIFT, + range->end << PAGE_SHIFT, &ctx); +} + +bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) +{ + gpa_t gpa = range->start << PAGE_SHIFT; + kvm_pte_t *ptep = kvm_populate_gpa(kvm, NULL, gpa, 0); + + if (ptep && kvm_pte_present(NULL, ptep) && kvm_pte_young(*ptep)) + return true; + + return false; +} + +/* + * kvm_map_page_fast() - Fast path GPA fault handler. + * @vcpu: vCPU pointer. + * @gpa: Guest physical address of fault. + * @write: Whether the fault was due to a write. + * + * Perform fast path GPA fault handling, doing all that can be done without + * calling into KVM. This handles marking old pages young (for idle page + * tracking), and dirtying of clean pages (for dirty page logging). + * + * Returns: 0 on success, in which case we can update derived mappings and + * resume guest execution. + * -EFAULT on failure due to absent GPA mapping or write to + * read-only page, in which case KVM must be consulted. 
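In kvm_set_spte_gfn() above, a permission survives only when both the stale mapping and the replacement pte grant it. The merge rule in isolation, with a hypothetical bit layout:

#include <stdint.h>

#define P_PRESENT	(1u << 0)
#define P_READ		(1u << 1)
#define P_WRITE		(1u << 2)
#define P_DIRTY		(1u << 3)
#define P_CACHE		(3u << 4)		/* cache-attribute field */
#define P_WRITEABLE	(P_WRITE | P_DIRTY)	/* mirrors __WRITEABLE */

/* Merge protection bits when replacing a mapping (COW, page migration) */
static uint32_t merge_prot(uint32_t old_pte, uint32_t new_pte)
{
	uint32_t prot = P_PRESENT | P_READ;

	prot |= new_pte & P_CACHE;		/* cache attr from the new pte */
	prot |= P_WRITEABLE & old_pte & new_pte; /* both must agree to write */
	return prot;
}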
+ */ +static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, bool write) +{ + int ret = 0; + kvm_pfn_t pfn = 0; + kvm_pte_t *ptep, changed, new; + gfn_t gfn = gpa >> PAGE_SHIFT; + struct kvm *kvm = vcpu->kvm; + struct kvm_memory_slot *slot; + + spin_lock(&kvm->mmu_lock); + + /* Fast path - just check GPA page table for an existing entry */ + ptep = kvm_populate_gpa(kvm, NULL, gpa, 0); + if (!ptep || !kvm_pte_present(NULL, ptep)) { + ret = -EFAULT; + goto out; + } + + /* Track access to pages marked old */ + new = *ptep; + if (!kvm_pte_young(new)) + new = kvm_pte_mkyoung(new); + /* call kvm_set_pfn_accessed() after unlock */ + + if (write && !kvm_pte_dirty(new)) { + if (!kvm_pte_write(new)) { + ret = -EFAULT; + goto out; + } + + if (kvm_pte_huge(new)) { + /* + * Do not set write permission when dirty logging is + * enabled for HugePages + */ + slot = gfn_to_memslot(kvm, gfn); + if (kvm_slot_dirty_track_enabled(slot)) { + ret = -EFAULT; + goto out; + } + } + + /* Track dirtying of writeable pages */ + new = kvm_pte_mkdirty(new); + } + + changed = new ^ (*ptep); + if (changed) { + kvm_set_pte(ptep, new); + pfn = kvm_pte_pfn(new); + } + spin_unlock(&kvm->mmu_lock); + + /* + * Fixme: pfn may be freed after mmu_lock + * kvm_try_get_pfn(pfn)/kvm_release_pfn pair to prevent this? + */ + if (kvm_pte_young(changed)) + kvm_set_pfn_accessed(pfn); + + if (kvm_pte_dirty(changed)) { + mark_page_dirty(kvm, gfn); + kvm_set_pfn_dirty(pfn); + } + return ret; +out: + spin_unlock(&kvm->mmu_lock); + return ret; +} + +static bool fault_supports_huge_mapping(struct kvm_memory_slot *memslot, + unsigned long hva, unsigned long map_size, bool write) +{ + size_t size; + gpa_t gpa_start; + hva_t uaddr_start, uaddr_end; + + /* Disable dirty logging on HugePages */ + if (kvm_slot_dirty_track_enabled(memslot) && write) + return false; + + size = memslot->npages * PAGE_SIZE; + gpa_start = memslot->base_gfn << PAGE_SHIFT; + uaddr_start = memslot->userspace_addr; + uaddr_end = uaddr_start + size; + + /* + * Pages belonging to memslots that don't have the same alignment + * within a PMD for userspace and GPA cannot be mapped with stage-2 + * PMD entries, because we'll end up mapping the wrong pages. + * + * Consider a layout like the following: + * + * memslot->userspace_addr: + * +-----+--------------------+--------------------+---+ + * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz| + * +-----+--------------------+--------------------+---+ + * + * memslot->base_gfn << PAGE_SIZE: + * +---+--------------------+--------------------+-----+ + * |abc|def Stage-2 block | Stage-2 block |tvxyz| + * +---+--------------------+--------------------+-----+ + * + * If we create those stage-2 blocks, we'll end up with this incorrect + * mapping: + * d -> f + * e -> g + * f -> h + */ + if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1))) + return false; + + /* + * Next, let's make sure we're not trying to map anything not covered + * by the memslot. This means we have to prohibit block size mappings + * for the beginning and end of a non-block aligned and non-block sized + * memory slot (illustrated by the head and tail parts of the + * userspace view above containing pages 'abcde' and 'xyz', + * respectively). + * + * Note that it doesn't matter if we do the check using the + * userspace_addr or the base_gfn, as both are equally aligned (per + * the check above) and equally sized. 
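The alignment test in fault_supports_huge_mapping() reduces to a congruence check: GPA and HVA must share the same offset within a block. An equivalent standalone form, where the XOR folds the two masked compares into one:

#include <stdbool.h>
#include <stdint.h>

/*
 * A block mapping is possible only when (gpa & (size-1)) == (hva & (size-1)).
 * E.g. a 2 MiB-aligned gpa paired with an hva offset by 0x1000 fails.
 */
static bool congruent_for_block(uint64_t gpa_start, uint64_t hva_start,
				uint64_t map_size)
{
	return ((gpa_start ^ hva_start) & (map_size - 1)) == 0;
}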
+ */
+	return (hva & ~(map_size - 1)) >= uaddr_start &&
+	       (hva & ~(map_size - 1)) + map_size <= uaddr_end;
+}
+
+/*
+ * Lookup the mapping level for @gfn in the current mm.
+ *
+ * WARNING! Use of host_pfn_mapping_level() requires the caller and the end
+ * consumer to be tied into KVM's handlers for MMU notifier events!
+ *
+ * There are several ways to safely use this helper:
+ *
+ * - Check mmu_invalidate_retry_hva() after grabbing the mapping level, before
+ *   consuming it. In this case, mmu_lock doesn't need to be held during the
+ *   lookup, but it does need to be held while checking the MMU notifier.
+ *
+ * - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation
+ *   event for the hva. This can be done by explicitly checking the MMU notifier
+ *   or by ensuring that KVM already has a valid mapping that covers the hva.
+ *
+ * - Do not use the result to install new mappings, e.g. use the host mapping
+ *   level only to decide whether or not to zap an entry. In this case, it's
+ *   not required to hold mmu_lock (though it's highly likely the caller will
+ *   want to hold mmu_lock anyways, e.g. to modify SPTEs).
+ *
+ * Note! The lookup can still race with modifications to host page tables, but
+ * the above "rules" ensure KVM will not _consume_ the result of the walk if a
+ * race with the primary MMU occurs.
+ */
+static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
+				  const struct kvm_memory_slot *slot)
+{
+	int level = 0;
+	unsigned long hva;
+	unsigned long flags;
+	pgd_t pgd;
+	p4d_t p4d;
+	pud_t pud;
+	pmd_t pmd;
+
+	/*
+	 * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
+	 * is not solely for performance, it's also necessary to avoid the
+	 * "writable" check in __gfn_to_hva_many(), which will always fail on
+	 * read-only memslots due to gfn_to_hva() assuming writes. Earlier
+	 * page fault steps have already verified the guest isn't writing a
+	 * read-only memslot.
+	 */
+	hva = __gfn_to_hva_memslot(slot, gfn);
+
+	/*
+	 * Disable IRQs to prevent concurrent tear down of host page tables,
+	 * e.g. if the primary MMU promotes a P*D to a huge page and then frees
+	 * the original page table.
+	 */
+	local_irq_save(flags);
+
+	/*
+	 * Read each entry once. As above, a non-leaf entry can be promoted to
+	 * a huge page _during_ this walk. Re-reading the entry could send the
+	 * walk into the weeds, e.g. p*d_large() returns false (sees the old
+	 * value) and then p*d_offset() walks into the target huge page instead
+	 * of the old page table (sees the new value).
+ */ + pgd = READ_ONCE(*pgd_offset(kvm->mm, hva)); + if (pgd_none(pgd)) + goto out; + + p4d = READ_ONCE(*p4d_offset(&pgd, hva)); + if (p4d_none(p4d) || !p4d_present(p4d)) + goto out; + + pud = READ_ONCE(*pud_offset(&p4d, hva)); + if (pud_none(pud) || !pud_present(pud)) + goto out; + + pmd = READ_ONCE(*pmd_offset(&pud, hva)); + if (pmd_none(pmd) || !pmd_present(pmd)) + goto out; + + if (kvm_pte_huge(pmd_val(pmd))) + level = 1; + +out: + local_irq_restore(flags); + return level; +} + +/* + * Split huge page + */ +static kvm_pte_t *kvm_split_huge(struct kvm_vcpu *vcpu, kvm_pte_t *ptep, gfn_t gfn) +{ + int i; + kvm_pte_t val, *child; + struct kvm *kvm = vcpu->kvm; + struct kvm_mmu_memory_cache *memcache; + + memcache = &vcpu->arch.mmu_page_cache; + child = kvm_mmu_memory_cache_alloc(memcache); + val = kvm_pte_mksmall(*ptep); + for (i = 0; i < PTRS_PER_PTE; i++) { + kvm_set_pte(child + i, val); + val += PAGE_SIZE; + } + + /* The later kvm_flush_tlb_gpa() will flush hugepage tlb */ + kvm_set_pte(ptep, __pa(child)); + + kvm->stat.hugepages--; + kvm->stat.pages += PTRS_PER_PTE; + + return child + (gfn & (PTRS_PER_PTE - 1)); +} + +/* + * kvm_map_page() - Map a guest physical page. + * @vcpu: vCPU pointer. + * @gpa: Guest physical address of fault. + * @write: Whether the fault was due to a write. + * + * Handle GPA faults by creating a new GPA mapping (or updating an existing + * one). + * + * This takes care of marking pages young or dirty (idle/dirty page tracking), + * asking KVM for the corresponding PFN, and creating a mapping in the GPA page + * tables. Derived mappings (GVA page tables and TLBs) must be handled by the + * caller. + * + * Returns: 0 on success + * -EFAULT if there is no memory region at @gpa or a write was + * attempted to a read-only memory region. This is usually handled + * as an MMIO access. + */ +static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, bool write) +{ + bool writeable; + int srcu_idx, err, retry_no = 0, level; + unsigned long hva, mmu_seq, prot_bits; + kvm_pfn_t pfn; + kvm_pte_t *ptep, new_pte; + gfn_t gfn = gpa >> PAGE_SHIFT; + struct kvm *kvm = vcpu->kvm; + struct kvm_memory_slot *memslot; + struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; + + /* Try the fast path to handle old / clean pages */ + srcu_idx = srcu_read_lock(&kvm->srcu); + err = kvm_map_page_fast(vcpu, gpa, write); + if (!err) + goto out; + + memslot = gfn_to_memslot(kvm, gfn); + hva = gfn_to_hva_memslot_prot(memslot, gfn, &writeable); + if (kvm_is_error_hva(hva) || (write && !writeable)) { + err = -EFAULT; + goto out; + } + + /* We need a minimum of cached pages ready for page table creation */ + err = kvm_mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES); + if (err) + goto out; + +retry: + /* + * Used to check for invalidations in progress, of the pfn that is + * returned by pfn_to_pfn_prot below. + */ + mmu_seq = kvm->mmu_invalidate_seq; + /* + * Ensure the read of mmu_invalidate_seq isn't reordered with PTE reads in + * gfn_to_pfn_prot() (which calls get_user_pages()), so that we don't + * risk the page we get a reference to getting unmapped before we have a + * chance to grab the mmu_lock without mmu_invalidate_retry() noticing. + * + * This smp_rmb() pairs with the effective smp_wmb() of the combination + * of the pte_unmap_unlock() after the PTE is zapped, and the + * spin_lock() in kvm_mmu_invalidate_invalidate_<page|range_end>() before + * mmu_invalidate_seq is incremented. 
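The seq/recheck dance in kvm_map_page() above generalizes to a small protocol. A standalone model with a single atomic counter; real KVM additionally tracks in-progress invalidation ranges, which this sketch omits, and the helper names are hypothetical:

#include <stdatomic.h>
#include <stdbool.h>

static _Atomic unsigned long invalidate_seq;

/* Invalidation side: bump the sequence whenever mappings change */
static void invalidate_begin(void)
{
	atomic_fetch_add(&invalidate_seq, 1);
}

/* Fault side: sample seq, do the (sleepable) lookup, then revalidate */
static bool try_install(unsigned long (*lookup)(void), unsigned long *out)
{
	unsigned long seq = atomic_load(&invalidate_seq);

	*out = lookup();	/* may race with an invalidation */

	/* in KVM this check runs under mmu_lock before installing the pte */
	return seq == atomic_load(&invalidate_seq);
}

/* The caller drops the looked-up pfn and retries while this returns false. */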
+ */ + smp_rmb(); + + /* Slow path - ask KVM core whether we can access this GPA */ + pfn = gfn_to_pfn_prot(kvm, gfn, write, &writeable); + if (is_error_noslot_pfn(pfn)) { + err = -EFAULT; + goto out; + } + + /* Check if an invalidation has taken place since we got pfn */ + spin_lock(&kvm->mmu_lock); + if (mmu_invalidate_retry_hva(kvm, mmu_seq, hva)) { + /* + * This can happen when mappings are changed asynchronously, but + * also synchronously if a COW is triggered by + * gfn_to_pfn_prot(). + */ + spin_unlock(&kvm->mmu_lock); + kvm_release_pfn_clean(pfn); + if (retry_no > 100) { + retry_no = 0; + schedule(); + } + retry_no++; + goto retry; + } + + /* + * For emulated devices such virtio device, actual cache attribute is + * determined by physical machine. + * For pass through physical device, it should be uncachable + */ + prot_bits = _PAGE_PRESENT | __READABLE; + if (pfn_valid(pfn)) + prot_bits |= _CACHE_CC; + else + prot_bits |= _CACHE_SUC; + + if (writeable) { + prot_bits |= _PAGE_WRITE; + if (write) + prot_bits |= __WRITEABLE; + } + + /* Disable dirty logging on HugePages */ + level = 0; + if (!fault_supports_huge_mapping(memslot, hva, PMD_SIZE, write)) { + level = 0; + } else { + level = host_pfn_mapping_level(kvm, gfn, memslot); + if (level == 1) { + gfn = gfn & ~(PTRS_PER_PTE - 1); + pfn = pfn & ~(PTRS_PER_PTE - 1); + } + } + + /* Ensure page tables are allocated */ + ptep = kvm_populate_gpa(kvm, memcache, gpa, level); + new_pte = kvm_pfn_pte(pfn, __pgprot(prot_bits)); + if (level == 1) { + new_pte = kvm_pte_mkhuge(new_pte); + /* + * previous pmd entry is invalid_pte_table + * there is invalid tlb with small page + * need flush these invalid tlbs for current vcpu + */ + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + ++kvm->stat.hugepages; + } else if (kvm_pte_huge(*ptep) && write) + ptep = kvm_split_huge(vcpu, ptep, gfn); + else + ++kvm->stat.pages; + kvm_set_pte(ptep, new_pte); + spin_unlock(&kvm->mmu_lock); + + if (prot_bits & _PAGE_DIRTY) { + mark_page_dirty_in_slot(kvm, memslot, gfn); + kvm_set_pfn_dirty(pfn); + } + + kvm_set_pfn_accessed(pfn); + kvm_release_pfn_clean(pfn); +out: + srcu_read_unlock(&kvm->srcu, srcu_idx); + return err; +} + +int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long gpa, bool write) +{ + int ret; + + ret = kvm_map_page(vcpu, gpa, write); + if (ret) + return ret; + + /* Invalidate this entry in the TLB */ + kvm_flush_tlb_gpa(vcpu, gpa); + + return 0; +} + +void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) +{ +} + +int kvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, + struct kvm_memory_slot *new, enum kvm_mr_change change) +{ + return 0; +} + +void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, + const struct kvm_memory_slot *memslot) +{ + kvm_flush_remote_tlbs(kvm); +} diff --git a/arch/loongarch/kvm/switch.S b/arch/loongarch/kvm/switch.S new file mode 100644 index 000000000000..0ed9040307b7 --- /dev/null +++ b/arch/loongarch/kvm/switch.S @@ -0,0 +1,250 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +#include <linux/linkage.h> +#include <asm/asm.h> +#include <asm/asmmacro.h> +#include <asm/loongarch.h> +#include <asm/regdef.h> +#include <asm/stackframe.h> + +#define HGPR_OFFSET(x) (PT_R0 + 8*x) +#define GGPR_OFFSET(x) (KVM_ARCH_GGPR + 8*x) + +.macro kvm_save_host_gpr base + .irp n,1,2,3,22,23,24,25,26,27,28,29,30,31 + st.d $r\n, \base, HGPR_OFFSET(\n) + .endr +.endm + +.macro kvm_restore_host_gpr base + 
	.irp n,1,2,3,22,23,24,25,26,27,28,29,30,31
+	ld.d	$r\n, \base, HGPR_OFFSET(\n)
+	.endr
+.endm
+
+/*
+ * Save and restore all GPRs except the base register, whose default
+ * value is a2.
+ */
+.macro kvm_save_guest_gprs base
+	.irp n,1,2,3,4,5,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
+	st.d	$r\n, \base, GGPR_OFFSET(\n)
+	.endr
+.endm
+
+.macro kvm_restore_guest_gprs base
+	.irp n,1,2,3,4,5,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
+	ld.d	$r\n, \base, GGPR_OFFSET(\n)
+	.endr
+.endm
+
+/*
+ * Prepare to switch to the guest: save host regs and restore guest regs.
+ * a2: kvm_vcpu_arch, don't touch it until 'ertn'
+ * t0, t1: temp registers
+ */
+.macro kvm_switch_to_guest
+	/* Set host ECFG.VS=0, all exceptions share one exception entry */
+	csrrd		t0, LOONGARCH_CSR_ECFG
+	bstrins.w	t0, zero, CSR_ECFG_VS_SHIFT_END, CSR_ECFG_VS_SHIFT
+	csrwr		t0, LOONGARCH_CSR_ECFG
+
+	/* Load up the new EENTRY */
+	ld.d	t0, a2, KVM_ARCH_GEENTRY
+	csrwr	t0, LOONGARCH_CSR_EENTRY
+
+	/* Set Guest ERA */
+	ld.d	t0, a2, KVM_ARCH_GPC
+	csrwr	t0, LOONGARCH_CSR_ERA
+
+	/* Save host PGDL */
+	csrrd	t0, LOONGARCH_CSR_PGDL
+	st.d	t0, a2, KVM_ARCH_HPGD
+
+	/* Switch to kvm */
+	ld.d	t1, a2, KVM_VCPU_KVM - KVM_VCPU_ARCH
+
+	/* Load guest PGDL */
+	li.w	t0, KVM_GPGD
+	ldx.d	t0, t1, t0
+	csrwr	t0, LOONGARCH_CSR_PGDL
+
+	/* Mix GID and RID */
+	csrrd		t1, LOONGARCH_CSR_GSTAT
+	bstrpick.w	t1, t1, CSR_GSTAT_GID_SHIFT_END, CSR_GSTAT_GID_SHIFT
+	csrrd		t0, LOONGARCH_CSR_GTLBC
+	bstrins.w	t0, t1, CSR_GTLBC_TGID_SHIFT_END, CSR_GTLBC_TGID_SHIFT
+	csrwr		t0, LOONGARCH_CSR_GTLBC
+
+	/*
+	 * Enable interrupts in root mode for the coming ertn so that host
+	 * interrupts can be responded to while the VM runs.
+	 * Guest CRMD comes from the separate GCSR_CRMD register.
+	 */
+	ori	t0, zero, CSR_PRMD_PIE
+	csrxchg	t0, t0, LOONGARCH_CSR_PRMD
+
+	/* Set the PVM bit so that ertn switches to guest context */
+	ori	t0, zero, CSR_GSTAT_PVM
+	csrxchg	t0, t0, LOONGARCH_CSR_GSTAT
+
+	/* Load Guest GPRs */
+	kvm_restore_guest_gprs a2
+	/* Load KVM_ARCH register */
+	ld.d	a2, a2, (KVM_ARCH_GGPR + 8 * REG_A2)
+
+	ertn	/* Switch to guest: GSTAT.PGM = 1, ERRCTL.ISERR = 0, TLBRPRMD.ISTLBR = 0 */
+.endm
+
+	/*
+	 * Exception entry for general exceptions from guest mode:
+	 * - IRQs are disabled
+	 * - kernel privilege in root mode
+	 * - page mode kept unchanged from the previous PRMD in root mode
+	 * - Fixme: TLB exceptions cannot happen here, since the TLB-related
+	 *   registers (such as the pgd table/vmid registers) are still in
+	 *   guest mode; this will be fixed once hardware page walk is enabled.
+	 * Load kvm_vcpu from the reserved CSR KVM_VCPU_KS, and save a2 to KVM_TEMP_KS.
+	 */
+	.text
+	.cfi_sections	.debug_frame
+SYM_CODE_START(kvm_exc_entry)
+	csrwr	a2, KVM_TEMP_KS
+	csrrd	a2, KVM_VCPU_KS
+	addi.d	a2, a2, KVM_VCPU_ARCH
+
+	/* After saving the GPRs, any GPR is free to use */
+	kvm_save_guest_gprs a2
+	/* Save guest A2 */
+	csrrd	t0, KVM_TEMP_KS
+	st.d	t0, a2, (KVM_ARCH_GGPR + 8 * REG_A2)
+
+	/* A2 is kvm_vcpu_arch, A1 is free to use */
+	csrrd	s1, KVM_VCPU_KS
+	ld.d	s0, s1, KVM_VCPU_RUN
+
+	csrrd	t0, LOONGARCH_CSR_ESTAT
+	st.d	t0, a2, KVM_ARCH_HESTAT
+	csrrd	t0, LOONGARCH_CSR_ERA
+	st.d	t0, a2, KVM_ARCH_GPC
+	csrrd	t0, LOONGARCH_CSR_BADV
+	st.d	t0, a2, KVM_ARCH_HBADV
+	csrrd	t0, LOONGARCH_CSR_BADI
+	st.d	t0, a2, KVM_ARCH_HBADI
+
+	/* Restore host ECFG.VS */
+	csrrd	t0, LOONGARCH_CSR_ECFG
+	ld.d	t1, a2, KVM_ARCH_HECFG
+	or	t0, t0, t1
+	csrwr	t0, LOONGARCH_CSR_ECFG
+
+	/* Restore host EENTRY */
+	ld.d	t0, a2, KVM_ARCH_HEENTRY
+	csrwr	t0, LOONGARCH_CSR_EENTRY
+
+	/* Restore host pgd table */
+	ld.d	t0, a2, KVM_ARCH_HPGD
+	csrwr	t0, LOONGARCH_CSR_PGDL
+
+	/*
+	 * Clear the PVM bit so the next ertn returns to root mode by default
+	 */
+	ori	t0, zero, CSR_GSTAT_PVM
+	csrxchg	zero, t0, LOONGARCH_CSR_GSTAT
+
+	/*
+	 * Clear the GTLBC.TGID field:
+	 *       0: future tlb instructions update the root tlb
+	 *  others: future tlb instructions update guest (gpa to hpa) entries
+	 */
+	csrrd	t0, LOONGARCH_CSR_GTLBC
+	bstrins.w	t0, zero, CSR_GTLBC_TGID_SHIFT_END, CSR_GTLBC_TGID_SHIFT
+	csrwr	t0, LOONGARCH_CSR_GTLBC
+	ld.d	tp, a2, KVM_ARCH_HTP
+	ld.d	sp, a2, KVM_ARCH_HSP
+	/* restore per cpu register */
+	ld.d	u0, a2, KVM_ARCH_HPERCPU
+	addi.d	sp, sp, -PT_SIZE
+
+	/* Prepare to handle the exception */
+	or	a0, s0, zero
+	or	a1, s1, zero
+	ld.d	t8, a2, KVM_ARCH_HANDLE_EXIT
+	jirl	ra, t8, 0
+
+	or	a2, s1, zero
+	addi.d	a2, a2, KVM_VCPU_ARCH
+
+	/* Resume host when ret <= 0 */
+	blez	a0, ret_to_host
+
+	/*
+	 * Return to guest
+	 * Save the per cpu register again, we may have been switched to
+	 * another cpu
+	 */
+	st.d	u0, a2, KVM_ARCH_HPERCPU
+
+	/* Save kvm_vcpu to kscratch */
+	csrwr	s1, KVM_VCPU_KS
+	kvm_switch_to_guest
+
+ret_to_host:
+	ld.d	a2, a2, KVM_ARCH_HSP
+	addi.d	a2, a2, -PT_SIZE
+	kvm_restore_host_gpr a2
+	jr	ra
+
+SYM_INNER_LABEL(kvm_exc_entry_end, SYM_L_LOCAL)
+SYM_CODE_END(kvm_exc_entry)
+
+/*
+ * int kvm_enter_guest(struct kvm_run *run, struct kvm_vcpu *vcpu)
+ *
+ * Register parameters:
+ *  a0: kvm_run* run
+ *  a1: kvm_vcpu* vcpu
+ */
+SYM_FUNC_START(kvm_enter_guest)
+	/* Allocate space at the stack bottom */
+	addi.d	a2, sp, -PT_SIZE
+	/* Save host GPRs */
+	kvm_save_host_gpr a2
+
+	/* Save host CRMD, PRMD to stack */
+	csrrd	a3, LOONGARCH_CSR_CRMD
+	st.d	a3, a2, PT_CRMD
+	csrrd	a3, LOONGARCH_CSR_PRMD
+	st.d	a3, a2, PT_PRMD
+
+	addi.d	a2, a1, KVM_VCPU_ARCH
+	st.d	sp, a2, KVM_ARCH_HSP
+	st.d	tp, a2, KVM_ARCH_HTP
+	/* Save the per cpu register */
+	st.d	u0, a2, KVM_ARCH_HPERCPU
+
+	/* Save kvm_vcpu to kscratch */
+	csrwr	a1, KVM_VCPU_KS
+	kvm_switch_to_guest
+SYM_INNER_LABEL(kvm_enter_guest_end, SYM_L_LOCAL)
+SYM_FUNC_END(kvm_enter_guest)
+
+SYM_FUNC_START(kvm_save_fpu)
+	fpu_save_csr	a0 t1
+	fpu_save_double	a0 t1
+	fpu_save_cc	a0 t1 t2
+	jr	ra
+SYM_FUNC_END(kvm_save_fpu)
+
+SYM_FUNC_START(kvm_restore_fpu)
+	fpu_restore_double	a0 t1
+	fpu_restore_csr		a0 t1 t2
+	fpu_restore_cc		a0 t1 t2
+	jr	ra
+SYM_FUNC_END(kvm_restore_fpu)
+
+	.section ".rodata"
+SYM_DATA(kvm_exception_size, .quad kvm_exc_entry_end - kvm_exc_entry)
+SYM_DATA(kvm_enter_guest_size, .quad kvm_enter_guest_end - kvm_enter_guest)
diff --git a/arch/loongarch/kvm/timer.c b/arch/loongarch/kvm/timer.c
new file mode 100644
index 000000000000..284bf553fefe
--- /dev/null
+++ b/arch/loongarch/kvm/timer.c
@@ -0,0 +1,197 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2020-2023 Loongson Technology Corporation Limited
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_csr.h>
+#include <asm/kvm_vcpu.h>
+
+/*
+ * ktime_to_tick() - Scale ktime_t to timer tick value.
+ */
+static inline u64 ktime_to_tick(struct kvm_vcpu *vcpu, ktime_t now)
+{
+	u64 delta;
+
+	delta = ktime_to_ns(now);
+	return div_u64(delta * vcpu->arch.timer_mhz, MNSEC_PER_SEC);
+}
+
+static inline u64 tick_to_ns(struct kvm_vcpu *vcpu, u64 tick)
+{
+	return div_u64(tick * MNSEC_PER_SEC, vcpu->arch.timer_mhz);
+}
+
+/*
+ * Push the timer forward on timeout.
+ * Handle an hrtimer event by pushing the hrtimer forward one period.
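Numerically, ktime_to_tick()/tick_to_ns() scale by timer_mhz = timer_hz >> 20, a binary "mega". This sketch assumes MNSEC_PER_SEC is NSEC_PER_SEC >> 20 so the two shifts cancel (check asm/time.h for the real definition), and shows the truncation error at a 100 MHz constant timer:

#include <stdio.h>
#include <stdint.h>

#define DEMO_NSEC_PER_SEC  1000000000ULL
#define DEMO_MNSEC_PER_SEC (DEMO_NSEC_PER_SEC >> 20)	/* assumption */

int main(void)
{
	uint64_t timer_hz  = 100000000;		/* 100 MHz constant timer */
	uint64_t timer_mhz = timer_hz >> 20;	/* 95 ticks per binary-us */
	uint64_t ns        = 1000000;		/* 1 ms */

	uint64_t ticks = ns * timer_mhz / DEMO_MNSEC_PER_SEC;
	uint64_t back  = ticks * DEMO_MNSEC_PER_SEC / timer_mhz;

	/* ~99685 ticks instead of the exact 100000; ~999997 ns back */
	printf("1 ms ~ %llu ticks (~%llu ns back)\n",
	       (unsigned long long)ticks, (unsigned long long)back);
	return 0;
}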
+ */ +static enum hrtimer_restart kvm_count_timeout(struct kvm_vcpu *vcpu) +{ + unsigned long cfg, period; + + /* Add periodic tick to current expire time */ + cfg = kvm_read_sw_gcsr(vcpu->arch.csr, LOONGARCH_CSR_TCFG); + if (cfg & CSR_TCFG_PERIOD) { + period = tick_to_ns(vcpu, cfg & CSR_TCFG_VAL); + hrtimer_add_expires_ns(&vcpu->arch.swtimer, period); + return HRTIMER_RESTART; + } else + return HRTIMER_NORESTART; +} + +/* Low level hrtimer wake routine */ +enum hrtimer_restart kvm_swtimer_wakeup(struct hrtimer *timer) +{ + struct kvm_vcpu *vcpu; + + vcpu = container_of(timer, struct kvm_vcpu, arch.swtimer); + kvm_queue_irq(vcpu, INT_TI); + rcuwait_wake_up(&vcpu->wait); + + return kvm_count_timeout(vcpu); +} + +/* + * Initialise the timer to the specified frequency, zero it + */ +void kvm_init_timer(struct kvm_vcpu *vcpu, unsigned long timer_hz) +{ + vcpu->arch.timer_mhz = timer_hz >> 20; + + /* Starting at 0 */ + kvm_write_sw_gcsr(vcpu->arch.csr, LOONGARCH_CSR_TVAL, 0); +} + +/* + * Restore hard timer state and enable guest to access timer registers + * without trap, should be called with irq disabled + */ +void kvm_acquire_timer(struct kvm_vcpu *vcpu) +{ + unsigned long cfg; + + cfg = read_csr_gcfg(); + if (!(cfg & CSR_GCFG_TIT)) + return; + + /* Enable guest access to hard timer */ + write_csr_gcfg(cfg & ~CSR_GCFG_TIT); + + /* + * Freeze the soft-timer and sync the guest stable timer with it. We do + * this with interrupts disabled to avoid latency. + */ + hrtimer_cancel(&vcpu->arch.swtimer); +} + +/* + * Restore soft timer state from saved context. + */ +void kvm_restore_timer(struct kvm_vcpu *vcpu) +{ + unsigned long cfg, delta, period; + ktime_t expire, now; + struct loongarch_csrs *csr = vcpu->arch.csr; + + /* + * Set guest stable timer cfg csr + */ + cfg = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_TCFG); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_ESTAT); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_TCFG); + if (!(cfg & CSR_TCFG_EN)) { + /* Guest timer is disabled, just restore timer registers */ + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_TVAL); + return; + } + + /* + * Set remainder tick value if not expired + */ + now = ktime_get(); + expire = vcpu->arch.expire; + if (ktime_before(now, expire)) + delta = ktime_to_tick(vcpu, ktime_sub(expire, now)); + else { + if (cfg & CSR_TCFG_PERIOD) { + period = cfg & CSR_TCFG_VAL; + delta = ktime_to_tick(vcpu, ktime_sub(now, expire)); + delta = period - (delta % period); + } else + delta = 0; + /* + * Inject timer here though sw timer should inject timer + * interrupt async already, since sw timer may be cancelled + * during injecting intr async in function kvm_acquire_timer + */ + kvm_queue_irq(vcpu, INT_TI); + } + + write_gcsr_timertick(delta); +} + +/* + * Save guest timer state and switch to software emulation of guest + * timer. The hard timer must already be in use, so preemption should be + * disabled. 
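For an already-expired periodic timer, kvm_restore_timer() above reloads the time remaining in the current period. A worked check of delta = period - (elapsed % period):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t period  = 1000;	/* ticks per period */
	uint64_t elapsed = 2300;	/* ticks since the recorded expiry */

	/* next firing is 700 ticks away (at t = 3000); an exact multiple
	 * of the period yields a full period, never zero */
	uint64_t delta = period - (elapsed % period);

	printf("reload value: %llu\n", (unsigned long long)delta);
	return 0;
}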
+ */
+static void _kvm_save_timer(struct kvm_vcpu *vcpu)
+{
+	unsigned long ticks, delta;
+	ktime_t expire;
+	struct loongarch_csrs *csr = vcpu->arch.csr;
+
+	ticks = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_TVAL);
+	delta = tick_to_ns(vcpu, ticks);
+	expire = ktime_add_ns(ktime_get(), delta);
+	vcpu->arch.expire = expire;
+	if (ticks) {
+		/*
+		 * Update hrtimer to use the new timeout
+		 * HRTIMER_MODE_PINNED is suggested since the vcpu may run on
+		 * the same physical cpu next time
+		 */
+		hrtimer_cancel(&vcpu->arch.swtimer);
+		hrtimer_start(&vcpu->arch.swtimer, expire, HRTIMER_MODE_ABS_PINNED);
+	} else
+		/*
+		 * Inject a timer interrupt so that halt polling can detect
+		 * it and exit
+		 */
+		kvm_queue_irq(vcpu, INT_TI);
+}
+
+/*
+ * Save guest timer state and switch to soft guest timer if hard timer was in
+ * use.
+ */
+void kvm_save_timer(struct kvm_vcpu *vcpu)
+{
+	unsigned long cfg;
+	struct loongarch_csrs *csr = vcpu->arch.csr;
+
+	preempt_disable();
+	cfg = read_csr_gcfg();
+	if (!(cfg & CSR_GCFG_TIT)) {
+		/* Disable guest use of hard timer */
+		write_csr_gcfg(cfg | CSR_GCFG_TIT);
+
+		/* Save hard timer state */
+		kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TCFG);
+		kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TVAL);
+		if (kvm_read_sw_gcsr(csr, LOONGARCH_CSR_TCFG) & CSR_TCFG_EN)
+			_kvm_save_timer(vcpu);
+	}
+
+	/* Save timer-related state to vCPU context */
+	kvm_save_hw_gcsr(csr, LOONGARCH_CSR_ESTAT);
+	preempt_enable();
+}
+
+void kvm_reset_timer(struct kvm_vcpu *vcpu)
+{
+	write_gcsr_timercfg(0);
+	kvm_write_sw_gcsr(vcpu->arch.csr, LOONGARCH_CSR_TCFG, 0);
+	hrtimer_cancel(&vcpu->arch.swtimer);
+}
diff --git a/arch/loongarch/kvm/tlb.c b/arch/loongarch/kvm/tlb.c
new file mode 100644
index 000000000000..02535df6b51f
--- /dev/null
+++ b/arch/loongarch/kvm/tlb.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2020-2023 Loongson Technology Corporation Limited
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/tlb.h>
+#include <asm/kvm_csr.h>
+
+/*
+ * kvm_flush_tlb_all() - Flush all root TLB entries for guests.
+ *
+ * Invalidate all entries including GVA-->GPA and GPA-->HPA mappings.
+ */ +void kvm_flush_tlb_all(void) +{ + unsigned long flags; + + local_irq_save(flags); + invtlb_all(INVTLB_ALLGID, 0, 0); + local_irq_restore(flags); +} + +void kvm_flush_tlb_gpa(struct kvm_vcpu *vcpu, unsigned long gpa) +{ + unsigned long flags; + + local_irq_save(flags); + gpa &= (PAGE_MASK << 1); + invtlb(INVTLB_GID_ADDR, read_csr_gstat() & CSR_GSTAT_GID, gpa); + local_irq_restore(flags); +} diff --git a/arch/loongarch/kvm/trace.h b/arch/loongarch/kvm/trace.h new file mode 100644 index 000000000000..a1e35d655418 --- /dev/null +++ b/arch/loongarch/kvm/trace.h @@ -0,0 +1,162 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVM_H + +#include <linux/tracepoint.h> +#include <asm/kvm_csr.h> + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm + +/* + * Tracepoints for VM enters + */ +DECLARE_EVENT_CLASS(kvm_transition, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu), + TP_STRUCT__entry( + __field(unsigned long, pc) + ), + + TP_fast_assign( + __entry->pc = vcpu->arch.pc; + ), + + TP_printk("PC: 0x%08lx", __entry->pc) +); + +DEFINE_EVENT(kvm_transition, kvm_enter, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu)); + +DEFINE_EVENT(kvm_transition, kvm_reenter, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu)); + +DEFINE_EVENT(kvm_transition, kvm_out, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu)); + +/* Further exit reasons */ +#define KVM_TRACE_EXIT_IDLE 64 +#define KVM_TRACE_EXIT_CACHE 65 + +/* Tracepoints for VM exits */ +#define kvm_trace_symbol_exit_types \ + { KVM_TRACE_EXIT_IDLE, "IDLE" }, \ + { KVM_TRACE_EXIT_CACHE, "CACHE" } + +DECLARE_EVENT_CLASS(kvm_exit, + TP_PROTO(struct kvm_vcpu *vcpu, unsigned int reason), + TP_ARGS(vcpu, reason), + TP_STRUCT__entry( + __field(unsigned long, pc) + __field(unsigned int, reason) + ), + + TP_fast_assign( + __entry->pc = vcpu->arch.pc; + __entry->reason = reason; + ), + + TP_printk("[%s]PC: 0x%08lx", + __print_symbolic(__entry->reason, + kvm_trace_symbol_exit_types), + __entry->pc) +); + +DEFINE_EVENT(kvm_exit, kvm_exit_idle, + TP_PROTO(struct kvm_vcpu *vcpu, unsigned int reason), + TP_ARGS(vcpu, reason)); + +DEFINE_EVENT(kvm_exit, kvm_exit_cache, + TP_PROTO(struct kvm_vcpu *vcpu, unsigned int reason), + TP_ARGS(vcpu, reason)); + +DEFINE_EVENT(kvm_exit, kvm_exit, + TP_PROTO(struct kvm_vcpu *vcpu, unsigned int reason), + TP_ARGS(vcpu, reason)); + +TRACE_EVENT(kvm_exit_gspr, + TP_PROTO(struct kvm_vcpu *vcpu, unsigned int inst_word), + TP_ARGS(vcpu, inst_word), + TP_STRUCT__entry( + __field(unsigned int, inst_word) + ), + + TP_fast_assign( + __entry->inst_word = inst_word; + ), + + TP_printk("Inst word: 0x%08x", __entry->inst_word) +); + +#define KVM_TRACE_AUX_SAVE 0 +#define KVM_TRACE_AUX_RESTORE 1 +#define KVM_TRACE_AUX_ENABLE 2 +#define KVM_TRACE_AUX_DISABLE 3 +#define KVM_TRACE_AUX_DISCARD 4 + +#define KVM_TRACE_AUX_FPU 1 + +#define kvm_trace_symbol_aux_op \ + { KVM_TRACE_AUX_SAVE, "save" }, \ + { KVM_TRACE_AUX_RESTORE, "restore" }, \ + { KVM_TRACE_AUX_ENABLE, "enable" }, \ + { KVM_TRACE_AUX_DISABLE, "disable" }, \ + { KVM_TRACE_AUX_DISCARD, "discard" } + +#define kvm_trace_symbol_aux_state \ + { KVM_TRACE_AUX_FPU, "FPU" } + +TRACE_EVENT(kvm_aux, + TP_PROTO(struct kvm_vcpu *vcpu, unsigned int op, + unsigned int state), + TP_ARGS(vcpu, op, state), + TP_STRUCT__entry( + __field(unsigned long, pc) + __field(u8, op) + __field(u8, state) + ), + + TP_fast_assign( + __entry->pc = vcpu->arch.pc; + 
__entry->op = op; + __entry->state = state; + ), + + TP_printk("%s %s PC: 0x%08lx", + __print_symbolic(__entry->op, + kvm_trace_symbol_aux_op), + __print_symbolic(__entry->state, + kvm_trace_symbol_aux_state), + __entry->pc) +); + +TRACE_EVENT(kvm_vpid_change, + TP_PROTO(struct kvm_vcpu *vcpu, unsigned long vpid), + TP_ARGS(vcpu, vpid), + TP_STRUCT__entry( + __field(unsigned long, vpid) + ), + + TP_fast_assign( + __entry->vpid = vpid; + ), + + TP_printk("VPID: 0x%08lx", __entry->vpid) +); + +#endif /* _TRACE_KVM_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../arch/loongarch/kvm +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c new file mode 100644 index 000000000000..73d0c2b9c1a5 --- /dev/null +++ b/arch/loongarch/kvm/vcpu.c @@ -0,0 +1,939 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +#include <linux/kvm_host.h> +#include <linux/entry-kvm.h> +#include <asm/fpu.h> +#include <asm/loongarch.h> +#include <asm/setup.h> +#include <asm/time.h> + +#define CREATE_TRACE_POINTS +#include "trace.h" + +const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { + KVM_GENERIC_VCPU_STATS(), + STATS_DESC_COUNTER(VCPU, int_exits), + STATS_DESC_COUNTER(VCPU, idle_exits), + STATS_DESC_COUNTER(VCPU, cpucfg_exits), + STATS_DESC_COUNTER(VCPU, signal_exits), +}; + +const struct kvm_stats_header kvm_vcpu_stats_header = { + .name_size = KVM_STATS_NAME_SIZE, + .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc), + .id_offset = sizeof(struct kvm_stats_header), + .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, + .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + + sizeof(kvm_vcpu_stats_desc), +}; + +/* + * kvm_check_requests - check and handle pending vCPU requests + * + * Return: RESUME_GUEST if we should enter the guest + * RESUME_HOST if we should exit to userspace + */ +static int kvm_check_requests(struct kvm_vcpu *vcpu) +{ + if (!kvm_request_pending(vcpu)) + return RESUME_GUEST; + + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) + vcpu->arch.vpid = 0; /* Drop vpid for this vCPU */ + + if (kvm_dirty_ring_check_request(vcpu)) + return RESUME_HOST; + + return RESUME_GUEST; +} + +/* + * Check and handle pending signal and vCPU requests etc + * Run with irq enabled and preempt enabled + * + * Return: RESUME_GUEST if we should enter the guest + * RESUME_HOST if we should exit to userspace + * < 0 if we should exit to userspace, where the return value + * indicates an error + */ +static int kvm_enter_guest_check(struct kvm_vcpu *vcpu) +{ + int ret; + + /* + * Check conditions before entering the guest + */ + ret = xfer_to_guest_mode_handle_work(vcpu); + if (ret < 0) + return ret; + + ret = kvm_check_requests(vcpu); + + return ret; +} + +/* + * Called with irq enabled + * + * Return: RESUME_GUEST if we should enter the guest, and irq disabled + * Others if we should exit to userspace + */ +static int kvm_pre_enter_guest(struct kvm_vcpu *vcpu) +{ + int ret; + + do { + ret = kvm_enter_guest_check(vcpu); + if (ret != RESUME_GUEST) + break; + + /* + * Handle vcpu timer, interrupts, check requests and + * check vmid before vcpu enter guest + */ + local_irq_disable(); + kvm_acquire_timer(vcpu); + kvm_deliver_intr(vcpu); + kvm_deliver_exception(vcpu); + /* Make sure the vcpu mode has been written */ + smp_store_mb(vcpu->mode, IN_GUEST_MODE); + 
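+			/*
+			 * Ordering note: the smp_store_mb() above writes
+			 * vcpu->mode before the request re-check below reads
+			 * the request bits. It pairs with a kicker that sets
+			 * a request first and then reads vcpu->mode, so either
+			 * this path sees the pending request or the kicker
+			 * sees IN_GUEST_MODE and sends the IPI.
+			 */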
kvm_check_vpid(vcpu); + vcpu->arch.host_eentry = csr_read64(LOONGARCH_CSR_EENTRY); + /* Clear KVM_LARCH_SWCSR_LATEST as CSR will change when enter guest */ + vcpu->arch.aux_inuse &= ~KVM_LARCH_SWCSR_LATEST; + + if (kvm_request_pending(vcpu) || xfer_to_guest_mode_work_pending()) { + /* make sure the vcpu mode has been written */ + smp_store_mb(vcpu->mode, OUTSIDE_GUEST_MODE); + local_irq_enable(); + ret = -EAGAIN; + } + } while (ret != RESUME_GUEST); + + return ret; +} + +/* + * Return 1 for resume guest and "<= 0" for resume host. + */ +static int kvm_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) +{ + int ret = RESUME_GUEST; + unsigned long estat = vcpu->arch.host_estat; + u32 intr = estat & 0x1fff; /* Ignore NMI */ + u32 ecode = (estat & CSR_ESTAT_EXC) >> CSR_ESTAT_EXC_SHIFT; + + vcpu->mode = OUTSIDE_GUEST_MODE; + + /* Set a default exit reason */ + run->exit_reason = KVM_EXIT_UNKNOWN; + + guest_timing_exit_irqoff(); + guest_state_exit_irqoff(); + local_irq_enable(); + + trace_kvm_exit(vcpu, ecode); + if (ecode) { + ret = kvm_handle_fault(vcpu, ecode); + } else { + WARN(!intr, "vm exiting with suspicious irq\n"); + ++vcpu->stat.int_exits; + } + + if (ret == RESUME_GUEST) + ret = kvm_pre_enter_guest(vcpu); + + if (ret != RESUME_GUEST) { + local_irq_disable(); + return ret; + } + + guest_timing_enter_irqoff(); + guest_state_enter_irqoff(); + trace_kvm_reenter(vcpu); + + return RESUME_GUEST; +} + +int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) +{ + return !!(vcpu->arch.irq_pending) && + vcpu->arch.mp_state.mp_state == KVM_MP_STATE_RUNNABLE; +} + +int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) +{ + return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; +} + +bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) +{ + return false; +} + +vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) +{ + return VM_FAULT_SIGBUS; +} + +int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, + struct kvm_translation *tr) +{ + return -EINVAL; +} + +int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) +{ + return kvm_pending_timer(vcpu) || + kvm_read_hw_gcsr(LOONGARCH_CSR_ESTAT) & (1 << INT_TI); +} + +int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu) +{ + int i; + + kvm_debug("vCPU Register Dump:\n"); + kvm_debug("\tPC = 0x%08lx\n", vcpu->arch.pc); + kvm_debug("\tExceptions: %08lx\n", vcpu->arch.irq_pending); + + for (i = 0; i < 32; i += 4) { + kvm_debug("\tGPR%02d: %08lx %08lx %08lx %08lx\n", i, + vcpu->arch.gprs[i], vcpu->arch.gprs[i + 1], + vcpu->arch.gprs[i + 2], vcpu->arch.gprs[i + 3]); + } + + kvm_debug("\tCRMD: 0x%08lx, ESTAT: 0x%08lx\n", + kvm_read_hw_gcsr(LOONGARCH_CSR_CRMD), + kvm_read_hw_gcsr(LOONGARCH_CSR_ESTAT)); + + kvm_debug("\tERA: 0x%08lx\n", kvm_read_hw_gcsr(LOONGARCH_CSR_ERA)); + + return 0; +} + +int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + *mp_state = vcpu->arch.mp_state; + + return 0; +} + +int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + int ret = 0; + + switch (mp_state->mp_state) { + case KVM_MP_STATE_RUNNABLE: + vcpu->arch.mp_state = *mp_state; + break; + default: + ret = -EINVAL; + } + + return ret; +} + +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg) +{ + return -EINVAL; +} + +/** + * kvm_migrate_count() - Migrate timer. + * @vcpu: Virtual CPU. + * + * Migrate hrtimer to the current CPU by cancelling and restarting it + * if the hrtimer is active. 
+ * + * Must be called when the vCPU is migrated to a different CPU, so that + * the timer can interrupt the guest at the new CPU, and the timer irq can + * be delivered to the vCPU. + */ +static void kvm_migrate_count(struct kvm_vcpu *vcpu) +{ + if (hrtimer_cancel(&vcpu->arch.swtimer)) + hrtimer_restart(&vcpu->arch.swtimer); +} + +static int _kvm_getcsr(struct kvm_vcpu *vcpu, unsigned int id, u64 *val) +{ + unsigned long gintc; + struct loongarch_csrs *csr = vcpu->arch.csr; + + if (get_gcsr_flag(id) & INVALID_GCSR) + return -EINVAL; + + if (id == LOONGARCH_CSR_ESTAT) { + /* ESTAT IP0~IP7 get from GINTC */ + gintc = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_GINTC) & 0xff; + *val = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_ESTAT) | (gintc << 2); + return 0; + } + + /* + * Get software CSR state since software state is consistent + * with hardware for synchronous ioctl + */ + *val = kvm_read_sw_gcsr(csr, id); + + return 0; +} + +static int _kvm_setcsr(struct kvm_vcpu *vcpu, unsigned int id, u64 val) +{ + int ret = 0, gintc; + struct loongarch_csrs *csr = vcpu->arch.csr; + + if (get_gcsr_flag(id) & INVALID_GCSR) + return -EINVAL; + + if (id == LOONGARCH_CSR_ESTAT) { + /* ESTAT IP0~IP7 inject through GINTC */ + gintc = (val >> 2) & 0xff; + kvm_set_sw_gcsr(csr, LOONGARCH_CSR_GINTC, gintc); + + gintc = val & ~(0xffUL << 2); + kvm_set_sw_gcsr(csr, LOONGARCH_CSR_ESTAT, gintc); + + return ret; + } + + kvm_write_sw_gcsr(csr, id, val); + + return ret; +} + +static int kvm_get_one_reg(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg, u64 *v) +{ + int id, ret = 0; + u64 type = reg->id & KVM_REG_LOONGARCH_MASK; + + switch (type) { + case KVM_REG_LOONGARCH_CSR: + id = KVM_GET_IOC_CSR_IDX(reg->id); + ret = _kvm_getcsr(vcpu, id, v); + break; + case KVM_REG_LOONGARCH_CPUCFG: + id = KVM_GET_IOC_CPUCFG_IDX(reg->id); + if (id >= 0 && id < KVM_MAX_CPUCFG_REGS) + *v = vcpu->arch.cpucfg[id]; + else + ret = -EINVAL; + break; + case KVM_REG_LOONGARCH_KVM: + switch (reg->id) { + case KVM_REG_LOONGARCH_COUNTER: + *v = drdtime() + vcpu->kvm->arch.time_offset; + break; + default: + ret = -EINVAL; + break; + } + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int kvm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) +{ + int ret = 0; + u64 v, size = reg->id & KVM_REG_SIZE_MASK; + + switch (size) { + case KVM_REG_SIZE_U64: + ret = kvm_get_one_reg(vcpu, reg, &v); + if (ret) + return ret; + ret = put_user(v, (u64 __user *)(long)reg->addr); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int kvm_set_one_reg(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg, u64 v) +{ + int id, ret = 0; + u64 type = reg->id & KVM_REG_LOONGARCH_MASK; + + switch (type) { + case KVM_REG_LOONGARCH_CSR: + id = KVM_GET_IOC_CSR_IDX(reg->id); + ret = _kvm_setcsr(vcpu, id, v); + break; + case KVM_REG_LOONGARCH_CPUCFG: + id = KVM_GET_IOC_CPUCFG_IDX(reg->id); + if (id >= 0 && id < KVM_MAX_CPUCFG_REGS) + vcpu->arch.cpucfg[id] = (u32)v; + else + ret = -EINVAL; + break; + case KVM_REG_LOONGARCH_KVM: + switch (reg->id) { + case KVM_REG_LOONGARCH_COUNTER: + /* + * gftoffset is relative with board, not vcpu + * only set for the first time for smp system + */ + if (vcpu->vcpu_id == 0) + vcpu->kvm->arch.time_offset = (signed long)(v - drdtime()); + break; + case KVM_REG_LOONGARCH_VCPU_RESET: + kvm_reset_timer(vcpu); + memset(&vcpu->arch.irq_pending, 0, sizeof(vcpu->arch.irq_pending)); + memset(&vcpu->arch.irq_clear, 0, sizeof(vcpu->arch.irq_clear)); + break; + default: + ret = -EINVAL; + 
			break;
+		}
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+static int kvm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
+{
+	int ret = 0;
+	u64 v, size = reg->id & KVM_REG_SIZE_MASK;
+
+	switch (size) {
+	case KVM_REG_SIZE_U64:
+		ret = get_user(v, (u64 __user *)(long)reg->addr);
+		if (ret)
+			return ret;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return kvm_set_one_reg(vcpu, reg, v);
+}
+
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+	return -ENOIOCTLCMD;
+}
+
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+	return -ENOIOCTLCMD;
+}
+
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(vcpu->arch.gprs); i++)
+		regs->gpr[i] = vcpu->arch.gprs[i];
+
+	regs->pc = vcpu->arch.pc;
+
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+	int i;
+
+	for (i = 1; i < ARRAY_SIZE(vcpu->arch.gprs); i++)
+		vcpu->arch.gprs[i] = regs->gpr[i];
+
+	vcpu->arch.gprs[0] = 0; /* zero is special, and cannot be set. */
+	vcpu->arch.pc = regs->pc;
+
+	return 0;
+}
+
+static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
+				     struct kvm_enable_cap *cap)
+{
+	/* FPU is enabled by default, will support LSX/LASX later. */
+	return -EINVAL;
+}
+
+long kvm_arch_vcpu_ioctl(struct file *filp,
+			 unsigned int ioctl, unsigned long arg)
+{
+	long r;
+	void __user *argp = (void __user *)arg;
+	struct kvm_vcpu *vcpu = filp->private_data;
+
+	/*
+	 * Only software CSR state should be modified here.
+	 *
+	 * If any hardware CSR register is modified, a vcpu_load/vcpu_put pair
+	 * should be used instead. Since the CSR registers are owned by this
+	 * vcpu, when switching to another vcpu the other vcpu must reload its
+	 * CSR registers.
+	 *
+	 * If software CSR state is modified, the KVM_LARCH_HWCSR_USABLE bit
+	 * should be cleared in vcpu->arch.aux_inuse, and vcpu_load will check
+	 * the aux_inuse flag and reload the CSR registers from software.
+ */ + + switch (ioctl) { + case KVM_SET_ONE_REG: + case KVM_GET_ONE_REG: { + struct kvm_one_reg reg; + + r = -EFAULT; + if (copy_from_user(®, argp, sizeof(reg))) + break; + if (ioctl == KVM_SET_ONE_REG) { + r = kvm_set_reg(vcpu, ®); + vcpu->arch.aux_inuse &= ~KVM_LARCH_HWCSR_USABLE; + } else + r = kvm_get_reg(vcpu, ®); + break; + } + case KVM_ENABLE_CAP: { + struct kvm_enable_cap cap; + + r = -EFAULT; + if (copy_from_user(&cap, argp, sizeof(cap))) + break; + r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap); + break; + } + default: + r = -ENOIOCTLCMD; + break; + } + + return r; +} + +int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + int i = 0; + + fpu->fcc = vcpu->arch.fpu.fcc; + fpu->fcsr = vcpu->arch.fpu.fcsr; + for (i = 0; i < NUM_FPU_REGS; i++) + memcpy(&fpu->fpr[i], &vcpu->arch.fpu.fpr[i], FPU_REG_WIDTH / 64); + + return 0; +} + +int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + int i = 0; + + vcpu->arch.fpu.fcc = fpu->fcc; + vcpu->arch.fpu.fcsr = fpu->fcsr; + for (i = 0; i < NUM_FPU_REGS; i++) + memcpy(&vcpu->arch.fpu.fpr[i], &fpu->fpr[i], FPU_REG_WIDTH / 64); + + return 0; +} + +/* Enable FPU and restore context */ +void kvm_own_fpu(struct kvm_vcpu *vcpu) +{ + preempt_disable(); + + /* Enable FPU */ + set_csr_euen(CSR_EUEN_FPEN); + + kvm_restore_fpu(&vcpu->arch.fpu); + vcpu->arch.aux_inuse |= KVM_LARCH_FPU; + trace_kvm_aux(vcpu, KVM_TRACE_AUX_RESTORE, KVM_TRACE_AUX_FPU); + + preempt_enable(); +} + +/* Save context and disable FPU */ +void kvm_lose_fpu(struct kvm_vcpu *vcpu) +{ + preempt_disable(); + + if (vcpu->arch.aux_inuse & KVM_LARCH_FPU) { + kvm_save_fpu(&vcpu->arch.fpu); + vcpu->arch.aux_inuse &= ~KVM_LARCH_FPU; + trace_kvm_aux(vcpu, KVM_TRACE_AUX_SAVE, KVM_TRACE_AUX_FPU); + + /* Disable FPU */ + clear_csr_euen(CSR_EUEN_FPEN); + } + + preempt_enable(); +} + +int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) +{ + int intr = (int)irq->irq; + + if (intr > 0) + kvm_queue_irq(vcpu, intr); + else if (intr < 0) + kvm_dequeue_irq(vcpu, -intr); + else { + kvm_err("%s: invalid interrupt ioctl %d\n", __func__, irq->irq); + return -EINVAL; + } + + kvm_vcpu_kick(vcpu); + + return 0; +} + +long kvm_arch_vcpu_async_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct kvm_vcpu *vcpu = filp->private_data; + + if (ioctl == KVM_INTERRUPT) { + struct kvm_interrupt irq; + + if (copy_from_user(&irq, argp, sizeof(irq))) + return -EFAULT; + + kvm_debug("[%d] %s: irq: %d\n", vcpu->vcpu_id, __func__, irq.irq); + + return kvm_vcpu_ioctl_interrupt(vcpu, &irq); + } + + return -ENOIOCTLCMD; +} + +int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) +{ + return 0; +} + +int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) +{ + unsigned long timer_hz; + struct loongarch_csrs *csr; + + vcpu->arch.vpid = 0; + + hrtimer_init(&vcpu->arch.swtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); + vcpu->arch.swtimer.function = kvm_swtimer_wakeup; + + vcpu->arch.handle_exit = kvm_handle_exit; + vcpu->arch.guest_eentry = (unsigned long)kvm_loongarch_ops->exc_entry; + vcpu->arch.csr = kzalloc(sizeof(struct loongarch_csrs), GFP_KERNEL); + if (!vcpu->arch.csr) + return -ENOMEM; + + /* + * All kvm exceptions share one exception entry, and host <-> guest + * switch also switch ECFG.VS field, keep host ECFG.VS info here. 
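kvm_own_fpu()/kvm_lose_fpu() above are classic lazy context switching keyed off an ownership flag. A stripped-down model of the state machine; the flag value and helper names are illustrative:

#include <stdbool.h>

#define KVM_LARCH_FPU_FLAG (1 << 0)	/* illustrative flag value */

struct vcpu_fpu_model {
	unsigned int aux_inuse;
	bool hw_fpu_enabled;
};

/* kvm_own_fpu(): runs when the guest traps on its first FPU use */
static void own_fpu(struct vcpu_fpu_model *v)
{
	v->hw_fpu_enabled = true;	/* set_csr_euen(CSR_EUEN_FPEN) */
	/* restore guest FPU registers here */
	v->aux_inuse |= KVM_LARCH_FPU_FLAG;
}

/* kvm_lose_fpu(): runs on vcpu_put, before the host reclaims the FPU */
static void lose_fpu(struct vcpu_fpu_model *v)
{
	if (v->aux_inuse & KVM_LARCH_FPU_FLAG) {
		/* save guest FPU registers here */
		v->aux_inuse &= ~KVM_LARCH_FPU_FLAG;
		v->hw_fpu_enabled = false;	/* clear_csr_euen(CSR_EUEN_FPEN) */
	}
}

/*
 * While the flag is clear, guest FPU state lives only in memory and any
 * guest FPU use traps again; while set, it lives in the hardware registers.
 */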
+ */ + vcpu->arch.host_ecfg = (read_csr_ecfg() & CSR_ECFG_VS); + + /* Init */ + vcpu->arch.last_sched_cpu = -1; + + /* + * Initialize guest register state to valid architectural reset state. + */ + timer_hz = calc_const_freq(); + kvm_init_timer(vcpu, timer_hz); + + /* Set Initialize mode for guest */ + csr = vcpu->arch.csr; + kvm_write_sw_gcsr(csr, LOONGARCH_CSR_CRMD, CSR_CRMD_DA); + + /* Set cpuid */ + kvm_write_sw_gcsr(csr, LOONGARCH_CSR_TMID, vcpu->vcpu_id); + + /* Start with no pending virtual guest interrupts */ + csr->csrs[LOONGARCH_CSR_GINTC] = 0; + + return 0; +} + +void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +{ +} + +void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) +{ + int cpu; + struct kvm_context *context; + + hrtimer_cancel(&vcpu->arch.swtimer); + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); + kfree(vcpu->arch.csr); + + /* + * If the vCPU is freed and reused as another vCPU, we don't want the + * matching pointer wrongly hanging around in last_vcpu. + */ + for_each_possible_cpu(cpu) { + context = per_cpu_ptr(vcpu->kvm->arch.vmcs, cpu); + if (context->last_vcpu == vcpu) + context->last_vcpu = NULL; + } +} + +static int _kvm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + bool migrated; + struct kvm_context *context; + struct loongarch_csrs *csr = vcpu->arch.csr; + + /* + * Have we migrated to a different CPU? + * If so, any old guest TLB state may be stale. + */ + migrated = (vcpu->arch.last_sched_cpu != cpu); + + /* + * Was this the last vCPU to run on this CPU? + * If not, any old guest state from this vCPU will have been clobbered. + */ + context = per_cpu_ptr(vcpu->kvm->arch.vmcs, cpu); + if (migrated || (context->last_vcpu != vcpu)) + vcpu->arch.aux_inuse &= ~KVM_LARCH_HWCSR_USABLE; + context->last_vcpu = vcpu; + + /* Restore timer state regardless */ + kvm_restore_timer(vcpu); + + /* Control guest page CCA attribute */ + change_csr_gcfg(CSR_GCFG_MATC_MASK, CSR_GCFG_MATC_ROOT); + + /* Don't bother restoring registers multiple times unless necessary */ + if (vcpu->arch.aux_inuse & KVM_LARCH_HWCSR_USABLE) + return 0; + + write_csr_gcntc((ulong)vcpu->kvm->arch.time_offset); + + /* Restore guest CSR registers */ + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_CRMD); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_PRMD); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_EUEN); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_MISC); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_ECFG); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_ERA); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_BADV); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_BADI); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_EENTRY); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_TLBIDX); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_TLBEHI); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_TLBELO0); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_TLBELO1); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_ASID); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_PGDL); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_PGDH); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_PWCTL0); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_PWCTL1); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_STLBPGSIZE); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_RVACFG); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_CPUID); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_KS0); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_KS1); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_KS2); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_KS3); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_KS4); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_KS5); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_KS6); + 
kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_KS7); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_TMID); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_CNTC); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_TLBRENTRY); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_TLBRBADV); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_TLBRERA); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_TLBRSAVE); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_TLBRELO0); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_TLBRELO1); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_TLBREHI); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_TLBRPRMD); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_DMWIN0); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_DMWIN1); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_DMWIN2); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_DMWIN3); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_LLBCTL); + + /* Restore Root.GINTC from unused Guest.GINTC register */ + write_csr_gintc(csr->csrs[LOONGARCH_CSR_GINTC]); + + /* + * We should clear linked load bit to break interrupted atomics. This + * prevents a SC on the next vCPU from succeeding by matching a LL on + * the previous vCPU. + */ + if (vcpu->kvm->created_vcpus > 1) + set_gcsr_llbctl(CSR_LLBCTL_WCLLB); + + vcpu->arch.aux_inuse |= KVM_LARCH_HWCSR_USABLE; + + return 0; +} + +void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + unsigned long flags; + + local_irq_save(flags); + if (vcpu->arch.last_sched_cpu != cpu) { + kvm_debug("[%d->%d]KVM vCPU[%d] switch\n", + vcpu->arch.last_sched_cpu, cpu, vcpu->vcpu_id); + /* + * Migrate the timer interrupt to the current CPU so that it + * always interrupts the guest and synchronously triggers a + * guest timer interrupt. + */ + kvm_migrate_count(vcpu); + } + + /* Restore guest state to registers */ + _kvm_vcpu_load(vcpu, cpu); + local_irq_restore(flags); +} + +static int _kvm_vcpu_put(struct kvm_vcpu *vcpu, int cpu) +{ + struct loongarch_csrs *csr = vcpu->arch.csr; + + kvm_lose_fpu(vcpu); + + /* + * Update CSR state from hardware if software CSR state is stale, + * most CSR registers are kept unchanged during process context + * switch except CSR registers like remaining timer tick value and + * injected interrupt state. 
+ */ + if (vcpu->arch.aux_inuse & KVM_LARCH_SWCSR_LATEST) + goto out; + + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_CRMD); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_PRMD); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_EUEN); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_MISC); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_ECFG); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_ERA); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_BADV); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_BADI); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_EENTRY); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TLBIDX); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TLBEHI); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TLBELO0); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TLBELO1); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_ASID); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_PGDL); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_PGDH); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_PWCTL0); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_PWCTL1); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_STLBPGSIZE); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_RVACFG); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_CPUID); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_PRCFG1); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_PRCFG2); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_PRCFG3); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_KS0); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_KS1); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_KS2); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_KS3); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_KS4); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_KS5); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_KS6); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_KS7); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TMID); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_CNTC); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_LLBCTL); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TLBRENTRY); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TLBRBADV); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TLBRERA); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TLBRSAVE); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TLBRELO0); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TLBRELO1); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TLBREHI); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TLBRPRMD); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_DMWIN0); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_DMWIN1); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_DMWIN2); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_DMWIN3); + + vcpu->arch.aux_inuse |= KVM_LARCH_SWCSR_LATEST; + +out: + kvm_save_timer(vcpu); + /* Save Root.GINTC into unused Guest.GINTC register */ + csr->csrs[LOONGARCH_CSR_GINTC] = read_csr_gintc(); + + return 0; +} + +void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) +{ + int cpu; + unsigned long flags; + + local_irq_save(flags); + cpu = smp_processor_id(); + vcpu->arch.last_sched_cpu = cpu; + + /* Save guest state in registers */ + _kvm_vcpu_put(vcpu, cpu); + local_irq_restore(flags); +} + +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) +{ + int r = -EINTR; + struct kvm_run *run = vcpu->run; + + if (vcpu->mmio_needed) { + if (!vcpu->mmio_is_write) + kvm_complete_mmio_read(vcpu, run); + vcpu->mmio_needed = 0; + } + + if (run->exit_reason == KVM_EXIT_LOONGARCH_IOCSR) { + if (!run->iocsr_io.is_write) + kvm_complete_iocsr_read(vcpu, run); + } + + if (run->immediate_exit) + return r; + + /* Clear exit_reason */ + run->exit_reason = KVM_EXIT_UNKNOWN; + lose_fpu(1); + vcpu_load(vcpu); + kvm_sigset_activate(vcpu); + r = kvm_pre_enter_guest(vcpu); + if (r != RESUME_GUEST) + goto out; + + guest_timing_enter_irqoff(); + guest_state_enter_irqoff(); + trace_kvm_enter(vcpu); + r = kvm_loongarch_ops->enter_guest(run, vcpu); + + trace_kvm_out(vcpu); + /* + * Guest exit is already recorded at 
kvm_handle_exit(); the + * return value must not be RESUME_GUEST + */ + local_irq_enable(); +out: + kvm_sigset_deactivate(vcpu); + vcpu_put(vcpu); + + return r; +} diff --git a/arch/loongarch/kvm/vm.c b/arch/loongarch/kvm/vm.c new file mode 100644 index 000000000000..0a37f6fa8f2d --- /dev/null +++ b/arch/loongarch/kvm/vm.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +#include <linux/kvm_host.h> +#include <asm/kvm_mmu.h> + +const struct _kvm_stats_desc kvm_vm_stats_desc[] = { + KVM_GENERIC_VM_STATS(), + STATS_DESC_ICOUNTER(VM, pages), + STATS_DESC_ICOUNTER(VM, hugepages), +}; + +const struct kvm_stats_header kvm_vm_stats_header = { + .name_size = KVM_STATS_NAME_SIZE, + .num_desc = ARRAY_SIZE(kvm_vm_stats_desc), + .id_offset = sizeof(struct kvm_stats_header), + .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, + .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + + sizeof(kvm_vm_stats_desc), +}; + +int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) +{ + int i; + + /* Allocate page table to map GPA -> RPA */ + kvm->arch.pgd = kvm_pgd_alloc(); + if (!kvm->arch.pgd) + return -ENOMEM; + + kvm_init_vmcs(kvm); + kvm->arch.gpa_size = BIT(cpu_vabits - 1); + kvm->arch.root_level = CONFIG_PGTABLE_LEVELS - 1; + kvm->arch.invalid_ptes[0] = 0; + kvm->arch.invalid_ptes[1] = (unsigned long)invalid_pte_table; +#if CONFIG_PGTABLE_LEVELS > 2 + kvm->arch.invalid_ptes[2] = (unsigned long)invalid_pmd_table; +#endif +#if CONFIG_PGTABLE_LEVELS > 3 + kvm->arch.invalid_ptes[3] = (unsigned long)invalid_pud_table; +#endif + for (i = 0; i <= kvm->arch.root_level; i++) + kvm->arch.pte_shifts[i] = PAGE_SHIFT + i * (PAGE_SHIFT - 3); + + return 0; +} + +void kvm_arch_destroy_vm(struct kvm *kvm) +{ + kvm_destroy_vcpus(kvm); + free_page((unsigned long)kvm->arch.pgd); + kvm->arch.pgd = NULL; +} + +int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) +{ + int r; + + switch (ext) { + case KVM_CAP_ONE_REG: + case KVM_CAP_ENABLE_CAP: + case KVM_CAP_READONLY_MEM: + case KVM_CAP_SYNC_MMU: + case KVM_CAP_IMMEDIATE_EXIT: + case KVM_CAP_IOEVENTFD: + case KVM_CAP_MP_STATE: + r = 1; + break; + case KVM_CAP_NR_VCPUS: + r = num_online_cpus(); + break; + case KVM_CAP_MAX_VCPUS: + r = KVM_MAX_VCPUS; + break; + case KVM_CAP_MAX_VCPU_ID: + r = KVM_MAX_VCPU_IDS; + break; + case KVM_CAP_NR_MEMSLOTS: + r = KVM_USER_MEM_SLOTS; + break; + default: + r = 0; + break; + } + + return r; +} + +int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) +{ + return -ENOIOCTLCMD; +} diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c index f3fe8c06ba4d..4dd53427f657 100644 --- a/arch/loongarch/mm/init.c +++ b/arch/loongarch/mm/init.c @@ -43,11 +43,11 @@ void copy_user_highpage(struct page *to, struct page *from, { void *vfrom, *vto; - vto = kmap_atomic(to); - vfrom = kmap_atomic(from); + vfrom = kmap_local_page(from); + vto = kmap_local_page(to); copy_page(vto, vfrom); - kunmap_atomic(vfrom); - kunmap_atomic(vto); + kunmap_local(vfrom); + kunmap_local(vto); /* Make sure this page is cleared on other CPU's too before using it */ smp_wmb(); } @@ -240,6 +240,7 @@ pgd_t swapper_pg_dir[_PTRS_PER_PGD] __section(".bss..swapper_pg_dir"); pgd_t invalid_pg_dir[_PTRS_PER_PGD] __page_aligned_bss; #ifndef __PAGETABLE_PUD_FOLDED pud_t invalid_pud_table[PTRS_PER_PUD] __page_aligned_bss; +EXPORT_SYMBOL(invalid_pud_table); #endif #ifndef __PAGETABLE_PMD_FOLDED pmd_t invalid_pmd_table[PTRS_PER_PMD]
__page_aligned_bss; diff --git a/arch/loongarch/mm/tlbex.S b/arch/loongarch/mm/tlbex.S index ca17dd3a1915..d5d682f3d29f 100644 --- a/arch/loongarch/mm/tlbex.S +++ b/arch/loongarch/mm/tlbex.S @@ -17,7 +17,7 @@ #define PTRS_PER_PTE_BITS (PAGE_SHIFT - 3) .macro tlb_do_page_fault, write - SYM_FUNC_START(tlb_do_page_fault_\write) + SYM_CODE_START(tlb_do_page_fault_\write) SAVE_ALL csrrd a2, LOONGARCH_CSR_BADV move a0, sp @@ -25,13 +25,13 @@ li.w a1, \write bl do_page_fault RESTORE_ALL_AND_RET - SYM_FUNC_END(tlb_do_page_fault_\write) + SYM_CODE_END(tlb_do_page_fault_\write) .endm tlb_do_page_fault 0 tlb_do_page_fault 1 -SYM_FUNC_START(handle_tlb_protect) +SYM_CODE_START(handle_tlb_protect) BACKUP_T0T1 SAVE_ALL move a0, sp @@ -41,9 +41,9 @@ SYM_FUNC_START(handle_tlb_protect) la_abs t0, do_page_fault jirl ra, t0, 0 RESTORE_ALL_AND_RET -SYM_FUNC_END(handle_tlb_protect) +SYM_CODE_END(handle_tlb_protect) -SYM_FUNC_START(handle_tlb_load) +SYM_CODE_START(handle_tlb_load) csrwr t0, EXCEPTION_KS0 csrwr t1, EXCEPTION_KS1 csrwr ra, EXCEPTION_KS2 @@ -187,16 +187,16 @@ nopage_tlb_load: csrrd ra, EXCEPTION_KS2 la_abs t0, tlb_do_page_fault_0 jr t0 -SYM_FUNC_END(handle_tlb_load) +SYM_CODE_END(handle_tlb_load) -SYM_FUNC_START(handle_tlb_load_ptw) +SYM_CODE_START(handle_tlb_load_ptw) csrwr t0, LOONGARCH_CSR_KS0 csrwr t1, LOONGARCH_CSR_KS1 la_abs t0, tlb_do_page_fault_0 jr t0 -SYM_FUNC_END(handle_tlb_load_ptw) +SYM_CODE_END(handle_tlb_load_ptw) -SYM_FUNC_START(handle_tlb_store) +SYM_CODE_START(handle_tlb_store) csrwr t0, EXCEPTION_KS0 csrwr t1, EXCEPTION_KS1 csrwr ra, EXCEPTION_KS2 @@ -343,16 +343,16 @@ nopage_tlb_store: csrrd ra, EXCEPTION_KS2 la_abs t0, tlb_do_page_fault_1 jr t0 -SYM_FUNC_END(handle_tlb_store) +SYM_CODE_END(handle_tlb_store) -SYM_FUNC_START(handle_tlb_store_ptw) +SYM_CODE_START(handle_tlb_store_ptw) csrwr t0, LOONGARCH_CSR_KS0 csrwr t1, LOONGARCH_CSR_KS1 la_abs t0, tlb_do_page_fault_1 jr t0 -SYM_FUNC_END(handle_tlb_store_ptw) +SYM_CODE_END(handle_tlb_store_ptw) -SYM_FUNC_START(handle_tlb_modify) +SYM_CODE_START(handle_tlb_modify) csrwr t0, EXCEPTION_KS0 csrwr t1, EXCEPTION_KS1 csrwr ra, EXCEPTION_KS2 @@ -497,16 +497,16 @@ nopage_tlb_modify: csrrd ra, EXCEPTION_KS2 la_abs t0, tlb_do_page_fault_1 jr t0 -SYM_FUNC_END(handle_tlb_modify) +SYM_CODE_END(handle_tlb_modify) -SYM_FUNC_START(handle_tlb_modify_ptw) +SYM_CODE_START(handle_tlb_modify_ptw) csrwr t0, LOONGARCH_CSR_KS0 csrwr t1, LOONGARCH_CSR_KS1 la_abs t0, tlb_do_page_fault_1 jr t0 -SYM_FUNC_END(handle_tlb_modify_ptw) +SYM_CODE_END(handle_tlb_modify_ptw) -SYM_FUNC_START(handle_tlb_refill) +SYM_CODE_START(handle_tlb_refill) csrwr t0, LOONGARCH_CSR_TLBRSAVE csrrd t0, LOONGARCH_CSR_PGD lddir t0, t0, 3 @@ -521,4 +521,4 @@ SYM_FUNC_START(handle_tlb_refill) tlbfill csrrd t0, LOONGARCH_CSR_TLBRSAVE ertn -SYM_FUNC_END(handle_tlb_refill) +SYM_CODE_END(handle_tlb_refill) diff --git a/arch/mips/alchemy/devboards/db1000.c b/arch/mips/alchemy/devboards/db1000.c index 012da042d0a4..7b9f91db227f 100644 --- a/arch/mips/alchemy/devboards/db1000.c +++ b/arch/mips/alchemy/devboards/db1000.c @@ -164,6 +164,7 @@ static struct platform_device db1x00_audio_dev = { /******************************************************************************/ +#ifdef CONFIG_MMC_AU1X static irqreturn_t db1100_mmc_cd(int irq, void *ptr) { mmc_detect_change(ptr, msecs_to_jiffies(500)); @@ -369,6 +370,7 @@ static struct platform_device db1100_mmc1_dev = { .num_resources = ARRAY_SIZE(au1100_mmc1_res), .resource = au1100_mmc1_res, }; +#endif /* CONFIG_MMC_AU1X */ 
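The CONFIG_MMC_AU1X guards introduced across these Alchemy board files all follow one pattern: the MMC platform devices, their helpers, and every array entry referring to them are compiled out together, so no dangling reference survives when the driver is configured out. A minimal sketch of the pattern, using hypothetical example_mmc_dev/example_devs names rather than symbols from this patch:

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/platform_device.h>

#ifdef CONFIG_MMC_AU1X
/* The device exists only when the MMC driver can be built. */
static struct platform_device example_mmc_dev = {
	.name = "au1xxx-mmc",
	.id = 0,
};
#endif /* CONFIG_MMC_AU1X */

static struct platform_device *example_devs[] __initdata = {
#ifdef CONFIG_MMC_AU1X
	&example_mmc_dev,	/* guarded together with its definition */
#endif
};

static int __init example_dev_setup(void)
{
	/* With CONFIG_MMC_AU1X disabled the array is empty and this is a no-op. */
	return platform_add_devices(example_devs, ARRAY_SIZE(example_devs));
}

This mirrors the pb1200_devs[] hunk below, where the guard can leave the registration array entirely empty and platform_add_devices() then simply registers nothing.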
/******************************************************************************/ @@ -440,8 +442,10 @@ static struct platform_device *db1x00_devs[] = { static struct platform_device *db1100_devs[] = { &au1100_lcd_device, +#ifdef CONFIG_MMC_AU1X &db1100_mmc0_dev, &db1100_mmc1_dev, +#endif }; int __init db1000_dev_setup(void) diff --git a/arch/mips/alchemy/devboards/db1200.c b/arch/mips/alchemy/devboards/db1200.c index 76080c71a2a7..f521874ebb07 100644 --- a/arch/mips/alchemy/devboards/db1200.c +++ b/arch/mips/alchemy/devboards/db1200.c @@ -326,6 +326,7 @@ static struct platform_device db1200_ide_dev = { /**********************************************************************/ +#ifdef CONFIG_MMC_AU1X /* SD carddetects: they're supposed to be edge-triggered, but ack * doesn't seem to work (CPLD Rev 2). Instead, the screaming one * is disabled and its counterpart enabled. The 200ms timeout is @@ -584,6 +585,7 @@ static struct platform_device pb1200_mmc1_dev = { .num_resources = ARRAY_SIZE(au1200_mmc1_res), .resource = au1200_mmc1_res, }; +#endif /* CONFIG_MMC_AU1X */ /**********************************************************************/ @@ -751,7 +753,9 @@ static struct platform_device db1200_audiodma_dev = { static struct platform_device *db1200_devs[] __initdata = { NULL, /* PSC0, selected by S6.8 */ &db1200_ide_dev, +#ifdef CONFIG_MMC_AU1X &db1200_mmc0_dev, +#endif &au1200_lcd_dev, &db1200_eth_dev, &db1200_nand_dev, @@ -762,7 +766,9 @@ static struct platform_device *db1200_devs[] __initdata = { }; static struct platform_device *pb1200_devs[] __initdata = { +#ifdef CONFIG_MMC_AU1X &pb1200_mmc1_dev, +#endif }; /* Some peripheral base addresses differ on the PB1200 */ diff --git a/arch/mips/alchemy/devboards/db1300.c b/arch/mips/alchemy/devboards/db1300.c index ff61901329c6..d377e043b49f 100644 --- a/arch/mips/alchemy/devboards/db1300.c +++ b/arch/mips/alchemy/devboards/db1300.c @@ -450,6 +450,7 @@ static struct platform_device db1300_ide_dev = { /**********************************************************************/ +#ifdef CONFIG_MMC_AU1X static irqreturn_t db1300_mmc_cd(int irq, void *ptr) { disable_irq_nosync(irq); @@ -632,6 +633,7 @@ static struct platform_device db1300_sd0_dev = { .resource = au1300_sd0_res, .num_resources = ARRAY_SIZE(au1300_sd0_res), }; +#endif /* CONFIG_MMC_AU1X */ /**********************************************************************/ @@ -767,8 +769,10 @@ static struct platform_device *db1300_dev[] __initdata = { &db1300_5waysw_dev, &db1300_nand_dev, &db1300_ide_dev, +#ifdef CONFIG_MMC_AU1X &db1300_sd0_dev, &db1300_sd1_dev, +#endif &db1300_lcd_dev, &db1300_ac97_dev, &db1300_i2s_dev, diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 7b2ac1319d70..467ee6b95ae1 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -592,7 +592,7 @@ static int kvm_mips_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, gfn_t gfn = gpa >> PAGE_SHIFT; int srcu_idx, err; kvm_pfn_t pfn; - pte_t *ptep, entry, old_pte; + pte_t *ptep, entry; bool writeable; unsigned long prot_bits; unsigned long mmu_seq; @@ -664,7 +664,6 @@ retry: entry = pfn_pte(pfn, __pgprot(prot_bits)); /* Write the PTE */ - old_pte = *ptep; set_pte(ptep, entry); err = 0; diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h index f7f078c2872c..72daacc472a0 100644 --- a/arch/parisc/include/asm/hugetlb.h +++ b/arch/parisc/include/asm/hugetlb.h @@ -6,7 +6,7 @@ #define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, 
pte_t pte); + pte_t *ptep, pte_t pte, unsigned long sz); #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, diff --git a/arch/parisc/include/asm/ldcw.h b/arch/parisc/include/asm/ldcw.h index 6d28b5514699..ee9e071859b2 100644 --- a/arch/parisc/include/asm/ldcw.h +++ b/arch/parisc/include/asm/ldcw.h @@ -2,39 +2,42 @@ #ifndef __PARISC_LDCW_H #define __PARISC_LDCW_H -#ifndef CONFIG_PA20 /* Because kmalloc only guarantees 8-byte alignment for kmalloc'd data, and GCC only guarantees 8-byte alignment for stack locals, we can't be assured of 16-byte alignment for atomic lock data even if we specify "__attribute ((aligned(16)))" in the type declaration. So, we use a struct containing an array of four ints for the atomic lock type and dynamically select the 16-byte aligned int from the array - for the semaphore. */ + for the semaphore. */ + +/* From: "Jim Hull" <jim.hull of hp.com> + I've attached a summary of the change, but basically, for PA 2.0, as + long as the ",CO" (coherent operation) completer is implemented, then the + 16-byte alignment requirement for ldcw and ldcd is relaxed, and instead + they only require "natural" alignment (4-byte for ldcw, 8-byte for + ldcd). + + Although the cache control hint is accepted by all PA 2.0 processors, + it is only implemented on PA8800/PA8900 CPUs. Prior PA8X00 CPUs still + require 16-byte alignment. If the address is unaligned, the operation + of the instruction is undefined. The ldcw instruction does not generate + unaligned data reference traps so misaligned accesses are not detected. + This hid the problem for years. So, restore the 16-byte alignment dropped + by Kyle McMartin in "Remove __ldcw_align for PA-RISC 2.0 processors". */ #define __PA_LDCW_ALIGNMENT 16 -#define __PA_LDCW_ALIGN_ORDER 4 #define __ldcw_align(a) ({ \ unsigned long __ret = (unsigned long) &(a)->lock[0]; \ __ret = (__ret + __PA_LDCW_ALIGNMENT - 1) \ & ~(__PA_LDCW_ALIGNMENT - 1); \ (volatile unsigned int *) __ret; \ }) -#define __LDCW "ldcw" -#else /*CONFIG_PA20*/ -/* From: "Jim Hull" <jim.hull of hp.com> - I've attached a summary of the change, but basically, for PA 2.0, as - long as the ",CO" (coherent operation) completer is specified, then the - 16-byte alignment requirement for ldcw and ldcd is relaxed, and instead - they only require "natural" alignment (4-byte for ldcw, 8-byte for - ldcd). */ - -#define __PA_LDCW_ALIGNMENT 4 -#define __PA_LDCW_ALIGN_ORDER 2 -#define __ldcw_align(a) (&(a)->slock) +#ifdef CONFIG_PA20 #define __LDCW "ldcw,co" - -#endif /*!CONFIG_PA20*/ +#else +#define __LDCW "ldcw" +#endif /* LDCW, the only atomic read-write operation PA-RISC has. *sigh*. 
We don't explicitly expose that "*a" may be written as reload diff --git a/arch/parisc/include/asm/spinlock_types.h b/arch/parisc/include/asm/spinlock_types.h index efd06a897c6a..7b986b09dba8 100644 --- a/arch/parisc/include/asm/spinlock_types.h +++ b/arch/parisc/include/asm/spinlock_types.h @@ -9,15 +9,10 @@ #ifndef __ASSEMBLY__ typedef struct { -#ifdef CONFIG_PA20 - volatile unsigned int slock; -# define __ARCH_SPIN_LOCK_UNLOCKED { __ARCH_SPIN_LOCK_UNLOCKED_VAL } -#else volatile unsigned int lock[4]; # define __ARCH_SPIN_LOCK_UNLOCKED \ { { __ARCH_SPIN_LOCK_UNLOCKED_VAL, __ARCH_SPIN_LOCK_UNLOCKED_VAL, \ __ARCH_SPIN_LOCK_UNLOCKED_VAL, __ARCH_SPIN_LOCK_UNLOCKED_VAL } } -#endif } arch_spinlock_t; diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c index 4098f9a0964b..2019c1f04bd0 100644 --- a/arch/parisc/kernel/smp.c +++ b/arch/parisc/kernel/smp.c @@ -440,7 +440,9 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle) if (cpu_online(cpu)) return 0; - if (num_online_cpus() < setup_max_cpus && smp_boot_one_cpu(cpu, tidle)) + if (num_online_cpus() < nr_cpu_ids && + num_online_cpus() < setup_max_cpus && + smp_boot_one_cpu(cpu, tidle)) return -EIO; return cpu_online(cpu) ? 0 : -EIO; diff --git a/arch/parisc/mm/hugetlbpage.c b/arch/parisc/mm/hugetlbpage.c index a8a1a7c1e16e..a9f7e21f6656 100644 --- a/arch/parisc/mm/hugetlbpage.c +++ b/arch/parisc/mm/hugetlbpage.c @@ -140,7 +140,7 @@ static void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, } void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t entry) + pte_t *ptep, pte_t entry, unsigned long sz) { __set_huge_pte_at(mm, addr, ptep, entry); } diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 3aaadfd2c8eb..d5d5388973ac 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -910,7 +910,7 @@ config ARCH_FORCE_MAX_ORDER default "6" if PPC32 && PPC_64K_PAGES range 4 10 if PPC32 && PPC_256K_PAGES default "4" if PPC32 && PPC_256K_PAGES - range 10 10 + range 10 12 default "10" help The kernel page allocator limits the size of maximal physically diff --git a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h index de092b04ee1a..92df40c6cc6b 100644 --- a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h @@ -46,7 +46,8 @@ static inline int check_and_get_huge_psize(int shift) } #define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT -void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t pte, unsigned long sz); #define __HAVE_ARCH_HUGE_PTE_CLEAR static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h b/arch/powerpc/include/asm/nohash/32/pte-8xx.h index 21f681ee535a..e6fe1d5731f2 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h @@ -94,6 +94,13 @@ static inline pte_t pte_wrprotect(pte_t pte) #define pte_wrprotect pte_wrprotect +static inline int pte_read(pte_t pte) +{ + return (pte_val(pte) & _PAGE_RO) != _PAGE_NA; +} + +#define pte_read pte_read + static inline int pte_write(pte_t pte) { return !(pte_val(pte) & _PAGE_RO); diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 5cd9acf58a7d..eb6891e34cbd 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ 
b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -197,7 +197,7 @@ static inline int __ptep_test_and_clear_young(struct mm_struct *mm, { unsigned long old; - if (pte_young(*ptep)) + if (!pte_young(*ptep)) return 0; old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, 0); return (old & _PAGE_ACCESSED) != 0; diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index 56ea48276356..c721478c5934 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -25,7 +25,9 @@ static inline int pte_write(pte_t pte) return pte_val(pte) & _PAGE_RW; } #endif +#ifndef pte_read static inline int pte_read(pte_t pte) { return 1; } +#endif static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; } static inline int pte_special(pte_t pte) { return pte_val(pte) & _PAGE_SPECIAL; } static inline int pte_none(pte_t pte) { return (pte_val(pte) & ~_PTE_NONE_MASK) == 0; } diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 9692acb0361f..7eda33a24bb4 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -137,8 +137,9 @@ ret_from_syscall: lis r4,icache_44x_need_flush@ha lwz r5,icache_44x_need_flush@l(r4) cmplwi cr0,r5,0 - bne- 2f + bne- .L44x_icache_flush #endif /* CONFIG_PPC_47x */ +.L44x_icache_flush_return: kuep_unlock lwz r4,_LINK(r1) lwz r5,_CCR(r1) @@ -172,10 +173,11 @@ syscall_exit_finish: b 1b #ifdef CONFIG_44x -2: li r7,0 +.L44x_icache_flush: + li r7,0 iccci r0,r0 stw r7,icache_44x_need_flush@l(r4) - b 1b + b .L44x_icache_flush_return #endif /* CONFIG_44x */ .globl ret_from_fork diff --git a/arch/powerpc/kernel/head_85xx.S b/arch/powerpc/kernel/head_85xx.S index 97e9ea0c7297..0f1641a31250 100644 --- a/arch/powerpc/kernel/head_85xx.S +++ b/arch/powerpc/kernel/head_85xx.S @@ -395,7 +395,7 @@ interrupt_base: #ifdef CONFIG_PPC_FPU FP_UNAVAILABLE_EXCEPTION #else - EXCEPTION(0x0800, FP_UNAVAIL, FloatingPointUnavailable, unknown_exception) + EXCEPTION(0x0800, FP_UNAVAIL, FloatingPointUnavailable, emulation_assist_interrupt) #endif /* System Call Interrupt */ diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 2f1026fba00d..20f72cd1d813 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -948,6 +948,8 @@ void __init setup_arch(char **cmdline_p) /* Parse memory topology */ mem_topology_setup(); + /* Set max_mapnr before paging_init() */ + set_max_mapnr(max_pfn); /* * Release secondary cpus out of their spinloops at 0x60 now that diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c index b15f15dcacb5..e6a958a5da27 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -73,29 +73,12 @@ int __no_sanitize_address arch_stack_walk_reliable(stack_trace_consume_fn consum bool firstframe; stack_end = stack_page + THREAD_SIZE; - if (!is_idle_task(task)) { - /* - * For user tasks, this is the SP value loaded on - * kernel entry, see "PACAKSAVE(r13)" in _switch() and - * system_call_common(). - * - * Likewise for non-swapper kernel threads, - * this also happens to be the top of the stack - * as setup by copy_thread(). - * - * Note that stack backlinks are not properly setup by - * copy_thread() and thus, a forked task() will have - * an unreliable stack trace until it's been - * _switch()'ed to for the first time. 
- */ - stack_end -= STACK_USER_INT_FRAME_SIZE; - } else { - /* - * idle tasks have a custom stack layout, - * c.f. cpu_idle_thread_init(). - */ + + // See copy_thread() for details. + if (task->flags & PF_KTHREAD) stack_end -= STACK_FRAME_MIN_SIZE; - } + else + stack_end -= STACK_USER_INT_FRAME_SIZE; if (task == current) sp = current_stack_frame(); diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c index 253620979d0c..6dd2f46bd3ef 100644 --- a/arch/powerpc/lib/qspinlock.c +++ b/arch/powerpc/lib/qspinlock.c @@ -406,6 +406,9 @@ static __always_inline bool yield_to_prev(struct qspinlock *lock, struct qnode * if ((yield_count & 1) == 0) goto yield_prev; /* owner vcpu is running */ + if (get_owner_cpu(READ_ONCE(lock->val)) != yield_cpu) + goto yield_prev; /* re-sample lock owner */ + spin_end(); preempted = true; diff --git a/arch/powerpc/mm/book3s64/hugetlbpage.c b/arch/powerpc/mm/book3s64/hugetlbpage.c index 3bc0eb21b2a0..5a2e512e96db 100644 --- a/arch/powerpc/mm/book3s64/hugetlbpage.c +++ b/arch/powerpc/mm/book3s64/hugetlbpage.c @@ -143,11 +143,14 @@ pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { + unsigned long psize; if (radix_enabled()) return radix__huge_ptep_modify_prot_commit(vma, addr, ptep, old_pte, pte); - set_huge_pte_at(vma->vm_mm, addr, ptep, pte); + + psize = huge_page_size(hstate_vma(vma)); + set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize); } void __init hugetlbpage_init_defaultsize(void) diff --git a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c index 17075c78d4bc..35fd2a95be24 100644 --- a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c +++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c @@ -47,6 +47,7 @@ void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma, pte_t old_pte, pte_t pte) { struct mm_struct *mm = vma->vm_mm; + unsigned long psize = huge_page_size(hstate_vma(vma)); /* * POWER9 NMMU must flush the TLB after clearing the PTE before @@ -58,5 +59,5 @@ void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma, atomic_read(&mm->context.copros) > 0) radix__flush_hugetlb_page(vma, addr); - set_huge_pte_at(vma->vm_mm, addr, ptep, pte); + set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize); } diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 39acc2cbab4c..9e1f6558d026 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -1212,14 +1212,7 @@ void radix__tlb_flush(struct mmu_gather *tlb) smp_mb(); /* see radix__flush_tlb_mm */ exit_flush_lazy_tlbs(mm); - _tlbiel_pid(mm->context.id, RIC_FLUSH_ALL); - - /* - * It should not be possible to have coprocessors still - * attached here. 
- */ - if (WARN_ON_ONCE(atomic_read(&mm->context.copros) > 0)) - __flush_all_mm(mm, true); + __flush_all_mm(mm, true); preempt_enable(); } else { diff --git a/arch/powerpc/mm/mem.c index 8b121df7b08f..07e8f4f1e07f 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -288,7 +288,6 @@ void __init mem_init(void) #endif high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); - set_max_mapnr(max_pfn); kasan_late_init(); diff --git a/arch/powerpc/mm/nohash/8xx.c index dbbfe897455d..a642a7929892 100644 --- a/arch/powerpc/mm/nohash/8xx.c +++ b/arch/powerpc/mm/nohash/8xx.c @@ -91,7 +91,8 @@ static int __ref __early_map_kernel_hugepage(unsigned long va, phys_addr_t pa, if (new && WARN_ON(pte_present(*ptep) && pgprot_val(prot))) return -EINVAL; - set_huge_pte_at(&init_mm, va, ptep, pte_mkhuge(pfn_pte(pa >> PAGE_SHIFT, prot))); + set_huge_pte_at(&init_mm, va, ptep, + pte_mkhuge(pfn_pte(pa >> PAGE_SHIFT, prot)), psize); return 0; } diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 3f86fd217690..4d69bfb9bc11 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -104,6 +104,8 @@ static pte_t set_pte_filter_hash(pte_t pte) { return pte; } /* Embedded type MMU with HW exec support. This is a bit more complicated * as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC so * instead we "filter out" the exec permission for non clean pages. + * + * This is also called once for the folio. So only work with folio->flags here. */ static inline pte_t set_pte_filter(pte_t pte) { @@ -190,29 +192,39 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr) { - /* - * Make sure hardware valid bit is not set. We don't do - * tlb flush for this update. - */ - VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep)); /* Note: mm->context.id might not yet have been assigned as * this context might not have been activated yet when this - * is called. + * is called. Filter the pte value and use the filtered value + * to set up all the ptes in the range. */ pte = set_pte_filter(pte); - /* Perform the setting of the PTE */ - arch_enter_lazy_mmu_mode(); + /* + * We don't need to call arch_enter/leave_lazy_mmu_mode() + * because we expect set_ptes to only be used on not present + * and not hw_valid ptes. Hence there is no translation cache flush + * involved that needs to be batched. + */ for (;;) { + + /* + * Make sure hardware valid bit is not set. We don't do + * tlb flush for this update. + */ + VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep)); + + /* Perform the setting of the PTE */ + __set_pte_at(mm, addr, ptep, pte, 0); if (--nr == 0) break; ptep++; - pte = __pte(pte_val(pte) + (1UL << PTE_RPN_SHIFT)); addr += PAGE_SIZE; + /* + * increment the pfn.
+ */ + pte = pfn_pte(pte_pfn(pte) + 1, pte_pgprot((pte))); } - arch_leave_lazy_mmu_mode(); } void unmap_kernel_page(unsigned long va) @@ -288,7 +300,8 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, } #if defined(CONFIG_PPC_8xx) -void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t pte, unsigned long sz) { pmd_t *pmd = pmd_off(mm, addr); pte_basic_t val; diff --git a/arch/powerpc/platforms/pseries/hvCall.S b/arch/powerpc/platforms/pseries/hvCall.S index bae45b358a09..2b0cac6fb61f 100644 --- a/arch/powerpc/platforms/pseries/hvCall.S +++ b/arch/powerpc/platforms/pseries/hvCall.S @@ -184,9 +184,6 @@ _GLOBAL_TOC(plpar_hcall) plpar_hcall_trace: HCALL_INST_PRECALL(R5) - std r4,STK_PARAM(R4)(r1) - mr r0,r4 - mr r4,r5 mr r5,r6 mr r6,r7 @@ -196,7 +193,7 @@ plpar_hcall_trace: HVSC - ld r12,STK_PARAM(R4)(r1) + ld r12,STACK_FRAME_MIN_SIZE+STK_PARAM(R4)(r1) std r4,0(r12) std r5,8(r12) std r6,16(r12) @@ -296,9 +293,6 @@ _GLOBAL_TOC(plpar_hcall9) plpar_hcall9_trace: HCALL_INST_PRECALL(R5) - std r4,STK_PARAM(R4)(r1) - mr r0,r4 - mr r4,r5 mr r5,r6 mr r6,r7 diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index d607ab0f7c6d..9c48fecc6719 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -273,11 +273,9 @@ config RISCV_DMA_NONCOHERENT select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE select DMA_BOUNCE_UNALIGNED_KMALLOC if SWIOTLB - select DMA_DIRECT_REMAP if MMU config RISCV_NONSTANDARD_CACHE_OPS bool - depends on RISCV_DMA_NONCOHERENT help This enables function pointer support for non-standard noncoherent systems to handle cache management. @@ -550,6 +548,7 @@ config RISCV_ISA_ZICBOM depends on RISCV_ALTERNATIVE default y select RISCV_DMA_NONCOHERENT + select DMA_DIRECT_REMAP help Adds support to dynamically detect the presence of the ZICBOM extension (Cache Block Management Operations) and enable its diff --git a/arch/riscv/Kconfig.errata b/arch/riscv/Kconfig.errata index 566bcefeab50..e2c731cfed8c 100644 --- a/arch/riscv/Kconfig.errata +++ b/arch/riscv/Kconfig.errata @@ -77,6 +77,7 @@ config ERRATA_THEAD_PBMT config ERRATA_THEAD_CMO bool "Apply T-Head cache management errata" depends on ERRATA_THEAD && MMU + select DMA_DIRECT_REMAP select RISCV_DMA_NONCOHERENT default y help diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index 1329e060c548..b43a6bb7e4dc 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -6,7 +6,6 @@ # for more details. 
# -OBJCOPYFLAGS := -O binary LDFLAGS_vmlinux := -z norelro ifeq ($(CONFIG_RELOCATABLE),y) LDFLAGS_vmlinux += -shared -Bsymbolic -z notext --emit-relocs diff --git a/arch/riscv/boot/dts/starfive/jh7110-starfive-visionfive-2.dtsi b/arch/riscv/boot/dts/starfive/jh7110-starfive-visionfive-2.dtsi index d79f94432b27..2c02358abd71 100644 --- a/arch/riscv/boot/dts/starfive/jh7110-starfive-visionfive-2.dtsi +++ b/arch/riscv/boot/dts/starfive/jh7110-starfive-visionfive-2.dtsi @@ -262,7 +262,7 @@ reg = <0x100000 0x400000>; }; reserved-data@600000 { - reg = <0x600000 0x1000000>; + reg = <0x600000 0xa00000>; }; }; }; @@ -431,7 +431,7 @@ }; ss-pins { - pinmux = <GPIOMUX(48, GPOUT_SYS_SPI0_FSS, + pinmux = <GPIOMUX(49, GPOUT_SYS_SPI0_FSS, GPOEN_ENABLE, GPI_SYS_SPI0_FSS)>; bias-disable; @@ -440,30 +440,6 @@ }; }; - uart0_pins: uart0-0 { - tx-pins { - pinmux = <GPIOMUX(5, GPOUT_SYS_UART0_TX, - GPOEN_ENABLE, - GPI_NONE)>; - bias-disable; - drive-strength = <12>; - input-disable; - input-schmitt-disable; - slew-rate = <0>; - }; - - rx-pins { - pinmux = <GPIOMUX(6, GPOUT_LOW, - GPOEN_DISABLE, - GPI_SYS_UART0_RX)>; - bias-disable; /* external pull-up */ - drive-strength = <2>; - input-enable; - input-schmitt-enable; - slew-rate = <0>; - }; - }; - tdm_pins: tdm-0 { tx-pins { pinmux = <GPIOMUX(44, GPOUT_SYS_TDM_TXD, @@ -497,6 +473,30 @@ input-enable; }; }; + + uart0_pins: uart0-0 { + tx-pins { + pinmux = <GPIOMUX(5, GPOUT_SYS_UART0_TX, + GPOEN_ENABLE, + GPI_NONE)>; + bias-disable; + drive-strength = <12>; + input-disable; + input-schmitt-disable; + slew-rate = <0>; + }; + + rx-pins { + pinmux = <GPIOMUX(6, GPOUT_LOW, + GPOEN_DISABLE, + GPI_SYS_UART0_RX)>; + bias-disable; /* external pull-up */ + drive-strength = <2>; + input-enable; + input-schmitt-enable; + slew-rate = <0>; + }; + }; }; &tdm { @@ -513,6 +513,7 @@ &usb0 { dr_mode = "peripheral"; + status = "okay"; }; &U74_1 { diff --git a/arch/riscv/boot/dts/thead/th1520.dtsi b/arch/riscv/boot/dts/thead/th1520.dtsi index ce708183b6f6..ff364709a6df 100644 --- a/arch/riscv/boot/dts/thead/th1520.dtsi +++ b/arch/riscv/boot/dts/thead/th1520.dtsi @@ -139,6 +139,7 @@ interrupt-parent = <&plic>; #address-cells = <2>; #size-cells = <2>; + dma-noncoherent; ranges; plic: interrupt-controller@ffd8000000 { diff --git a/arch/riscv/errata/andes/Makefile b/arch/riscv/errata/andes/Makefile index 2d644e19caef..6278c389b801 100644 --- a/arch/riscv/errata/andes/Makefile +++ b/arch/riscv/errata/andes/Makefile @@ -1 +1,5 @@ +ifdef CONFIG_RISCV_ALTERNATIVE_EARLY +CFLAGS_errata.o := -mcmodel=medany +endif + obj-y += errata.o diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h index 777cb8299551..306a19a5509c 100644 --- a/arch/riscv/include/asm/csr.h +++ b/arch/riscv/include/asm/csr.h @@ -203,6 +203,18 @@ #define ENVCFG_CBIE_INV _AC(0x3, UL) #define ENVCFG_FIOM _AC(0x1, UL) +/* Smstateen bits */ +#define SMSTATEEN0_AIA_IMSIC_SHIFT 58 +#define SMSTATEEN0_AIA_IMSIC (_ULL(1) << SMSTATEEN0_AIA_IMSIC_SHIFT) +#define SMSTATEEN0_AIA_SHIFT 59 +#define SMSTATEEN0_AIA (_ULL(1) << SMSTATEEN0_AIA_SHIFT) +#define SMSTATEEN0_AIA_ISEL_SHIFT 60 +#define SMSTATEEN0_AIA_ISEL (_ULL(1) << SMSTATEEN0_AIA_ISEL_SHIFT) +#define SMSTATEEN0_HSENVCFG_SHIFT 62 +#define SMSTATEEN0_HSENVCFG (_ULL(1) << SMSTATEEN0_HSENVCFG_SHIFT) +#define SMSTATEEN0_SSTATEEN0_SHIFT 63 +#define SMSTATEEN0_SSTATEEN0 (_ULL(1) << SMSTATEEN0_SSTATEEN0_SHIFT) + /* symbolic CSR names: */ #define CSR_CYCLE 0xc00 #define CSR_TIME 0xc01 @@ -275,6 +287,8 @@ #define CSR_SIE 0x104 #define CSR_STVEC 0x105 #define CSR_SCOUNTEREN 
0x106 +#define CSR_SENVCFG 0x10a +#define CSR_SSTATEEN0 0x10c #define CSR_SSCRATCH 0x140 #define CSR_SEPC 0x141 #define CSR_SCAUSE 0x142 @@ -349,6 +363,10 @@ #define CSR_VSIEH 0x214 #define CSR_VSIPH 0x254 +/* Hypervisor stateen CSRs */ +#define CSR_HSTATEEN0 0x60c +#define CSR_HSTATEEN0H 0x61c + #define CSR_MSTATUS 0x300 #define CSR_MISA 0x301 #define CSR_MIDELEG 0x303 diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h index 740a979171e5..2b2f5df7ef2c 100644 --- a/arch/riscv/include/asm/ftrace.h +++ b/arch/riscv/include/asm/ftrace.h @@ -31,6 +31,27 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) return addr; } +/* + * Let's do like x86/arm64 and ignore the compat syscalls. + */ +#define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS +static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs) +{ + return is_compat_task(); +} + +#define ARCH_HAS_SYSCALL_MATCH_SYM_NAME +static inline bool arch_syscall_match_sym_name(const char *sym, + const char *name) +{ + /* + * Since all syscall functions have __riscv_ prefix, we must skip it. + * However, as we described above, we decided to ignore compat + * syscalls, so we don't care about __riscv_compat_ prefix here. + */ + return !strcmp(sym + 8, name); +} + struct dyn_arch_ftrace { }; #endif diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h index 34e24f078cc1..4c5b0e929890 100644 --- a/arch/riscv/include/asm/hugetlb.h +++ b/arch/riscv/include/asm/hugetlb.h @@ -18,7 +18,8 @@ void huge_pte_clear(struct mm_struct *mm, unsigned long addr, #define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT void set_huge_pte_at(struct mm_struct *mm, - unsigned long addr, pte_t *ptep, pte_t pte); + unsigned long addr, pte_t *ptep, pte_t pte, + unsigned long sz); #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR pte_t huge_ptep_get_and_clear(struct mm_struct *mm, diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h index b7b58258f6c7..6fc51c1b34cf 100644 --- a/arch/riscv/include/asm/hwcap.h +++ b/arch/riscv/include/asm/hwcap.h @@ -58,6 +58,8 @@ #define RISCV_ISA_EXT_ZICSR 40 #define RISCV_ISA_EXT_ZIFENCEI 41 #define RISCV_ISA_EXT_ZIHPM 42 +#define RISCV_ISA_EXT_SMSTATEEN 43 +#define RISCV_ISA_EXT_ZICOND 44 #define RISCV_ISA_EXT_MAX 64 diff --git a/arch/riscv/include/asm/kprobes.h b/arch/riscv/include/asm/kprobes.h index e7882ccb0fd4..78ea44f76718 100644 --- a/arch/riscv/include/asm/kprobes.h +++ b/arch/riscv/include/asm/kprobes.h @@ -40,6 +40,15 @@ void arch_remove_kprobe(struct kprobe *p); int kprobe_fault_handler(struct pt_regs *regs, unsigned int trapnr); bool kprobe_breakpoint_handler(struct pt_regs *regs); bool kprobe_single_step_handler(struct pt_regs *regs); - +#else +static inline bool kprobe_breakpoint_handler(struct pt_regs *regs) +{ + return false; +} + +static inline bool kprobe_single_step_handler(struct pt_regs *regs) +{ + return false; +} #endif /* CONFIG_KPROBES */ #endif /* _ASM_RISCV_KPROBES_H */ diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index 1ebf20dfbaa6..0eefd9c991ae 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -162,6 +162,16 @@ struct kvm_vcpu_csr { unsigned long hvip; unsigned long vsatp; unsigned long scounteren; + unsigned long senvcfg; +}; + +struct kvm_vcpu_config { + u64 henvcfg; + u64 hstateen0; +}; + +struct kvm_vcpu_smstateen_csr { + unsigned long sstateen0; }; struct kvm_vcpu_arch { @@ -183,6 +193,8 @@ struct kvm_vcpu_arch { unsigned long host_sscratch; unsigned long 
host_stvec; unsigned long host_scounteren; + unsigned long host_senvcfg; + unsigned long host_sstateen0; /* CPU context of Host */ struct kvm_cpu_context host_context; @@ -193,6 +205,9 @@ struct kvm_vcpu_arch { /* CPU CSR context of Guest VCPU */ struct kvm_vcpu_csr guest_csr; + /* CPU Smstateen CSR context of Guest VCPU */ + struct kvm_vcpu_smstateen_csr smstateen_csr; + /* CPU context upon Guest VCPU reset */ struct kvm_cpu_context guest_reset_context; @@ -244,6 +259,9 @@ struct kvm_vcpu_arch { /* Performance monitoring context */ struct kvm_pmu pmu_context; + + /* 'static' configurations which are set only once */ + struct kvm_vcpu_config cfg; }; static inline void kvm_arch_sync_events(struct kvm *kvm) {} diff --git a/arch/riscv/include/asm/kvm_vcpu_sbi.h b/arch/riscv/include/asm/kvm_vcpu_sbi.h index cdcf0ff07be7..6a453f7f8b56 100644 --- a/arch/riscv/include/asm/kvm_vcpu_sbi.h +++ b/arch/riscv/include/asm/kvm_vcpu_sbi.h @@ -11,7 +11,7 @@ #define KVM_SBI_IMPID 3 -#define KVM_SBI_VERSION_MAJOR 1 +#define KVM_SBI_VERSION_MAJOR 2 #define KVM_SBI_VERSION_MINOR 0 enum kvm_riscv_sbi_ext_status { @@ -35,6 +35,9 @@ struct kvm_vcpu_sbi_return { struct kvm_vcpu_sbi_extension { unsigned long extid_start; unsigned long extid_end; + + bool default_unavail; + /** * SBI extension handler. It can be defined for a given extension or group of * extension. But it should always return linux error codes rather than SBI @@ -59,6 +62,7 @@ int kvm_riscv_vcpu_get_reg_sbi_ext(struct kvm_vcpu *vcpu, const struct kvm_vcpu_sbi_extension *kvm_vcpu_sbi_find_ext( struct kvm_vcpu *vcpu, unsigned long extid); int kvm_riscv_vcpu_sbi_ecall(struct kvm_vcpu *vcpu, struct kvm_run *run); +void kvm_riscv_vcpu_sbi_init(struct kvm_vcpu *vcpu); #ifdef CONFIG_RISCV_SBI_V01 extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_v01; @@ -69,6 +73,7 @@ extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_ipi; extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_rfence; extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_srst; extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_hsm; +extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_dbcn; extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_experimental; extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_vendor; diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index 5b4a1bf5f439..12dfda6bb924 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -30,6 +30,7 @@ enum sbi_ext_id { SBI_EXT_HSM = 0x48534D, SBI_EXT_SRST = 0x53525354, SBI_EXT_PMU = 0x504D55, + SBI_EXT_DBCN = 0x4442434E, /* Experimentals extensions must lie within this range */ SBI_EXT_EXPERIMENTAL_START = 0x08000000, @@ -236,6 +237,12 @@ enum sbi_pmu_ctr_type { /* Flags defined for counter stop function */ #define SBI_PMU_STOP_FLAG_RESET (1 << 0) +enum sbi_ext_dbcn_fid { + SBI_EXT_DBCN_CONSOLE_WRITE = 0, + SBI_EXT_DBCN_CONSOLE_READ = 1, + SBI_EXT_DBCN_CONSOLE_WRITE_BYTE = 2, +}; + #define SBI_SPEC_VERSION_DEFAULT 0x1 #define SBI_SPEC_VERSION_MAJOR_SHIFT 24 #define SBI_SPEC_VERSION_MAJOR_MASK 0x7f diff --git a/arch/riscv/include/asm/uprobes.h b/arch/riscv/include/asm/uprobes.h index f2183e00fdd2..3fc7deda9190 100644 --- a/arch/riscv/include/asm/uprobes.h +++ b/arch/riscv/include/asm/uprobes.h @@ -34,7 +34,18 @@ struct arch_uprobe { bool simulate; }; +#ifdef CONFIG_UPROBES bool uprobe_breakpoint_handler(struct pt_regs *regs); bool uprobe_single_step_handler(struct pt_regs *regs); - +#else +static inline bool uprobe_breakpoint_handler(struct pt_regs 
*regs) +{ + return false; +} + +static inline bool uprobe_single_step_handler(struct pt_regs *regs) +{ + return false; +} +#endif /* CONFIG_UPROBES */ #endif /* _ASM_RISCV_UPROBES_H */ diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h index 992c5e407104..60d3b21dead7 100644 --- a/arch/riscv/include/uapi/asm/kvm.h +++ b/arch/riscv/include/uapi/asm/kvm.h @@ -80,6 +80,7 @@ struct kvm_riscv_csr { unsigned long sip; unsigned long satp; unsigned long scounteren; + unsigned long senvcfg; }; /* AIA CSR registers for KVM_GET_ONE_REG and KVM_SET_ONE_REG */ @@ -93,6 +94,11 @@ struct kvm_riscv_aia_csr { unsigned long iprio2h; }; +/* Smstateen CSR for KVM_GET_ONE_REG and KVM_SET_ONE_REG */ +struct kvm_riscv_smstateen_csr { + unsigned long sstateen0; +}; + /* TIMER registers for KVM_GET_ONE_REG and KVM_SET_ONE_REG */ struct kvm_riscv_timer { __u64 frequency; @@ -131,6 +137,8 @@ enum KVM_RISCV_ISA_EXT_ID { KVM_RISCV_ISA_EXT_ZICSR, KVM_RISCV_ISA_EXT_ZIFENCEI, KVM_RISCV_ISA_EXT_ZIHPM, + KVM_RISCV_ISA_EXT_SMSTATEEN, + KVM_RISCV_ISA_EXT_ZICOND, KVM_RISCV_ISA_EXT_MAX, }; @@ -148,6 +156,7 @@ enum KVM_RISCV_SBI_EXT_ID { KVM_RISCV_SBI_EXT_PMU, KVM_RISCV_SBI_EXT_EXPERIMENTAL, KVM_RISCV_SBI_EXT_VENDOR, + KVM_RISCV_SBI_EXT_DBCN, KVM_RISCV_SBI_EXT_MAX, }; @@ -178,10 +187,13 @@ enum KVM_RISCV_SBI_EXT_ID { #define KVM_REG_RISCV_CSR (0x03 << KVM_REG_RISCV_TYPE_SHIFT) #define KVM_REG_RISCV_CSR_GENERAL (0x0 << KVM_REG_RISCV_SUBTYPE_SHIFT) #define KVM_REG_RISCV_CSR_AIA (0x1 << KVM_REG_RISCV_SUBTYPE_SHIFT) +#define KVM_REG_RISCV_CSR_SMSTATEEN (0x2 << KVM_REG_RISCV_SUBTYPE_SHIFT) #define KVM_REG_RISCV_CSR_REG(name) \ (offsetof(struct kvm_riscv_csr, name) / sizeof(unsigned long)) #define KVM_REG_RISCV_CSR_AIA_REG(name) \ (offsetof(struct kvm_riscv_aia_csr, name) / sizeof(unsigned long)) +#define KVM_REG_RISCV_CSR_SMSTATEEN_REG(name) \ + (offsetof(struct kvm_riscv_smstateen_csr, name) / sizeof(unsigned long)) /* Timer registers are mapped as type 4 */ #define KVM_REG_RISCV_TIMER (0x04 << KVM_REG_RISCV_TYPE_SHIFT) diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 1cfbba65d11a..e3803822ab5a 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -167,6 +167,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = { __RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM), __RISCV_ISA_EXT_DATA(zicboz, RISCV_ISA_EXT_ZICBOZ), __RISCV_ISA_EXT_DATA(zicntr, RISCV_ISA_EXT_ZICNTR), + __RISCV_ISA_EXT_DATA(zicond, RISCV_ISA_EXT_ZICOND), __RISCV_ISA_EXT_DATA(zicsr, RISCV_ISA_EXT_ZICSR), __RISCV_ISA_EXT_DATA(zifencei, RISCV_ISA_EXT_ZIFENCEI), __RISCV_ISA_EXT_DATA(zihintpause, RISCV_ISA_EXT_ZIHINTPAUSE), @@ -175,6 +176,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = { __RISCV_ISA_EXT_DATA(zbb, RISCV_ISA_EXT_ZBB), __RISCV_ISA_EXT_DATA(zbs, RISCV_ISA_EXT_ZBS), __RISCV_ISA_EXT_DATA(smaia, RISCV_ISA_EXT_SMAIA), + __RISCV_ISA_EXT_DATA(smstateen, RISCV_ISA_EXT_SMSTATEEN), __RISCV_ISA_EXT_DATA(ssaia, RISCV_ISA_EXT_SSAIA), __RISCV_ISA_EXT_DATA(sscofpmf, RISCV_ISA_EXT_SSCOFPMF), __RISCV_ISA_EXT_DATA(sstc, RISCV_ISA_EXT_SSTC), diff --git a/arch/riscv/kernel/irq.c b/arch/riscv/kernel/irq.c index a8efa053c4a5..9cc0a7669271 100644 --- a/arch/riscv/kernel/irq.c +++ b/arch/riscv/kernel/irq.c @@ -60,7 +60,7 @@ static void init_irq_stacks(void) } #endif /* CONFIG_VMAP_STACK */ -#ifdef CONFIG_HAVE_SOFTIRQ_ON_OWN_STACK +#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK void do_softirq_own_stack(void) { #ifdef CONFIG_IRQ_STACKS @@ -92,7 +92,7 @@ void do_softirq_own_stack(void) #endif 
__do_softirq(); } -#endif /* CONFIG_HAVE_SOFTIRQ_ON_OWN_STACK */ +#endif /* CONFIG_SOFTIRQ_ON_OWN_STACK */ #else static void init_irq_stacks(void) {} diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index e600aab116a4..aac853ae4eb7 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -173,19 +173,6 @@ static void __init init_resources(void) if (ret < 0) goto error; -#ifdef CONFIG_KEXEC_CORE - if (crashk_res.start != crashk_res.end) { - ret = add_resource(&iomem_resource, &crashk_res); - if (ret < 0) - goto error; - } - if (crashk_low_res.start != crashk_low_res.end) { - ret = add_resource(&iomem_resource, &crashk_low_res); - if (ret < 0) - goto error; - } -#endif - #ifdef CONFIG_CRASH_DUMP if (elfcorehdr_size > 0) { elfcorehdr_res.start = elfcorehdr_addr; diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c index 180d951d3624..21a4d0e111bc 100644 --- a/arch/riscv/kernel/signal.c +++ b/arch/riscv/kernel/signal.c @@ -311,13 +311,6 @@ static inline void __user *get_sigframe(struct ksignal *ksig, /* Align the stack frame. */ sp &= ~0xfUL; - /* - * Fail if the size of the altstack is not large enough for the - * sigframe construction. - */ - if (current->sas_ss_size && sp < current->sas_ss_sp) - return (void __user __force *)-1UL; - return (void __user *)sp; } diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index 19807c4d3805..fae8f610d867 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -13,6 +13,8 @@ #include <linux/kdebug.h> #include <linux/uaccess.h> #include <linux/kprobes.h> +#include <linux/uprobes.h> +#include <asm/uprobes.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/irq.h> @@ -247,22 +249,28 @@ static inline unsigned long get_break_insn_length(unsigned long pc) return GET_INSN_LENGTH(insn); } +static bool probe_single_step_handler(struct pt_regs *regs) +{ + bool user = user_mode(regs); + + return user ? uprobe_single_step_handler(regs) : kprobe_single_step_handler(regs); +} + +static bool probe_breakpoint_handler(struct pt_regs *regs) +{ + bool user = user_mode(regs); + + return user ? uprobe_breakpoint_handler(regs) : kprobe_breakpoint_handler(regs); +} + void handle_break(struct pt_regs *regs) { -#ifdef CONFIG_KPROBES - if (kprobe_single_step_handler(regs)) + if (probe_single_step_handler(regs)) return; - if (kprobe_breakpoint_handler(regs)) - return; -#endif -#ifdef CONFIG_UPROBES - if (uprobe_single_step_handler(regs)) + if (probe_breakpoint_handler(regs)) return; - if (uprobe_breakpoint_handler(regs)) - return; -#endif current->thread.bad_cause = regs->cause; if (user_mode(regs)) diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 82229db1ce73..e087c809073c 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -141,6 +141,12 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) if (rc) return rc; + /* + * Setup SBI extensions + * NOTE: This must be the last thing to be initialized. 
+ */ + kvm_riscv_vcpu_sbi_init(vcpu); + /* Reset VCPU */ kvm_riscv_reset_vcpu(vcpu); @@ -471,31 +477,38 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, return -EINVAL; } -static void kvm_riscv_vcpu_update_config(const unsigned long *isa) +static void kvm_riscv_vcpu_setup_config(struct kvm_vcpu *vcpu) { - u64 henvcfg = 0; + const unsigned long *isa = vcpu->arch.isa; + struct kvm_vcpu_config *cfg = &vcpu->arch.cfg; if (riscv_isa_extension_available(isa, SVPBMT)) - henvcfg |= ENVCFG_PBMTE; + cfg->henvcfg |= ENVCFG_PBMTE; if (riscv_isa_extension_available(isa, SSTC)) - henvcfg |= ENVCFG_STCE; + cfg->henvcfg |= ENVCFG_STCE; if (riscv_isa_extension_available(isa, ZICBOM)) - henvcfg |= (ENVCFG_CBIE | ENVCFG_CBCFE); + cfg->henvcfg |= (ENVCFG_CBIE | ENVCFG_CBCFE); if (riscv_isa_extension_available(isa, ZICBOZ)) - henvcfg |= ENVCFG_CBZE; - - csr_write(CSR_HENVCFG, henvcfg); -#ifdef CONFIG_32BIT - csr_write(CSR_HENVCFGH, henvcfg >> 32); -#endif + cfg->henvcfg |= ENVCFG_CBZE; + + if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SMSTATEEN)) { + cfg->hstateen0 |= SMSTATEEN0_HSENVCFG; + if (riscv_isa_extension_available(isa, SSAIA)) + cfg->hstateen0 |= SMSTATEEN0_AIA_IMSIC | + SMSTATEEN0_AIA | + SMSTATEEN0_AIA_ISEL; + if (riscv_isa_extension_available(isa, SMSTATEEN)) + cfg->hstateen0 |= SMSTATEEN0_SSTATEEN0; + } } void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; + struct kvm_vcpu_config *cfg = &vcpu->arch.cfg; csr_write(CSR_VSSTATUS, csr->vsstatus); csr_write(CSR_VSIE, csr->vsie); @@ -506,8 +519,14 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) csr_write(CSR_VSTVAL, csr->vstval); csr_write(CSR_HVIP, csr->hvip); csr_write(CSR_VSATP, csr->vsatp); - - kvm_riscv_vcpu_update_config(vcpu->arch.isa); + csr_write(CSR_HENVCFG, cfg->henvcfg); + if (IS_ENABLED(CONFIG_32BIT)) + csr_write(CSR_HENVCFGH, cfg->henvcfg >> 32); + if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SMSTATEEN)) { + csr_write(CSR_HSTATEEN0, cfg->hstateen0); + if (IS_ENABLED(CONFIG_32BIT)) + csr_write(CSR_HSTATEEN0H, cfg->hstateen0 >> 32); + } kvm_riscv_gstage_update_hgatp(vcpu); @@ -606,6 +625,32 @@ static void kvm_riscv_update_hvip(struct kvm_vcpu *vcpu) kvm_riscv_vcpu_aia_update_hvip(vcpu); } +static __always_inline void kvm_riscv_vcpu_swap_in_guest_state(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_smstateen_csr *smcsr = &vcpu->arch.smstateen_csr; + struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; + struct kvm_vcpu_config *cfg = &vcpu->arch.cfg; + + vcpu->arch.host_senvcfg = csr_swap(CSR_SENVCFG, csr->senvcfg); + if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SMSTATEEN) && + (cfg->hstateen0 & SMSTATEEN0_SSTATEEN0)) + vcpu->arch.host_sstateen0 = csr_swap(CSR_SSTATEEN0, + smcsr->sstateen0); +} + +static __always_inline void kvm_riscv_vcpu_swap_in_host_state(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_smstateen_csr *smcsr = &vcpu->arch.smstateen_csr; + struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; + struct kvm_vcpu_config *cfg = &vcpu->arch.cfg; + + csr->senvcfg = csr_swap(CSR_SENVCFG, vcpu->arch.host_senvcfg); + if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SMSTATEEN) && + (cfg->hstateen0 & SMSTATEEN0_SSTATEEN0)) + smcsr->sstateen0 = csr_swap(CSR_SSTATEEN0, + vcpu->arch.host_sstateen0); +} + /* * Actually run the vCPU, entering an RCU extended quiescent state (EQS) while * the vCPU is running. 
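The kvm_riscv_vcpu_swap_in_guest_state()/kvm_riscv_vcpu_swap_in_host_state() helpers added above lean on csr_swap(), the kernel's csrrw wrapper: a single instruction installs the new CSR value and returns the previous one. A sketch reduced to SENVCFG alone, reusing the field names from this patch but not itself part of it:

/* Illustrative only: the SENVCFG half of the swap pair above. */
static __always_inline void example_swap_in_guest(struct kvm_vcpu *vcpu)
{
	/* Install the guest's SENVCFG while capturing the host's copy. */
	vcpu->arch.host_senvcfg =
		csr_swap(CSR_SENVCFG, vcpu->arch.guest_csr.senvcfg);
}

static __always_inline void example_swap_in_host(struct kvm_vcpu *vcpu)
{
	/* Save the possibly guest-written value and restore the host's. */
	vcpu->arch.guest_csr.senvcfg =
		csr_swap(CSR_SENVCFG, vcpu->arch.host_senvcfg);
}

Pairing the two swaps immediately around __kvm_riscv_switch_to() keeps the host value live only for the duration of the guest run, with no separate read-then-write window to get wrong.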
@@ -615,10 +660,12 @@ static void kvm_riscv_update_hvip(struct kvm_vcpu *vcpu) */ static void noinstr kvm_riscv_vcpu_enter_exit(struct kvm_vcpu *vcpu) { + kvm_riscv_vcpu_swap_in_guest_state(vcpu); guest_state_enter_irqoff(); __kvm_riscv_switch_to(&vcpu->arch); vcpu->arch.last_exit_cpu = vcpu->cpu; guest_state_exit_irqoff(); + kvm_riscv_vcpu_swap_in_host_state(vcpu); } int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) @@ -627,6 +674,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) struct kvm_cpu_trap trap; struct kvm_run *run = vcpu->run; + if (!vcpu->arch.ran_atleast_once) + kvm_riscv_vcpu_setup_config(vcpu); + /* Mark this VCPU ran at least once */ vcpu->arch.ran_atleast_once = true; diff --git a/arch/riscv/kvm/vcpu_onereg.c b/arch/riscv/kvm/vcpu_onereg.c index b7e0e03c69b1..c6ebce6126b5 100644 --- a/arch/riscv/kvm/vcpu_onereg.c +++ b/arch/riscv/kvm/vcpu_onereg.c @@ -34,6 +34,7 @@ static const unsigned long kvm_isa_ext_arr[] = { [KVM_RISCV_ISA_EXT_M] = RISCV_ISA_EXT_m, [KVM_RISCV_ISA_EXT_V] = RISCV_ISA_EXT_v, /* Multi letter extensions (alphabetically sorted) */ + KVM_ISA_EXT_ARR(SMSTATEEN), KVM_ISA_EXT_ARR(SSAIA), KVM_ISA_EXT_ARR(SSTC), KVM_ISA_EXT_ARR(SVINVAL), @@ -45,6 +46,7 @@ static const unsigned long kvm_isa_ext_arr[] = { KVM_ISA_EXT_ARR(ZICBOM), KVM_ISA_EXT_ARR(ZICBOZ), KVM_ISA_EXT_ARR(ZICNTR), + KVM_ISA_EXT_ARR(ZICOND), KVM_ISA_EXT_ARR(ZICSR), KVM_ISA_EXT_ARR(ZIFENCEI), KVM_ISA_EXT_ARR(ZIHINTPAUSE), @@ -80,11 +82,11 @@ static bool kvm_riscv_vcpu_isa_enable_allowed(unsigned long ext) static bool kvm_riscv_vcpu_isa_disable_allowed(unsigned long ext) { switch (ext) { + /* Extensions which don't have any mechanism to disable */ case KVM_RISCV_ISA_EXT_A: case KVM_RISCV_ISA_EXT_C: case KVM_RISCV_ISA_EXT_I: case KVM_RISCV_ISA_EXT_M: - case KVM_RISCV_ISA_EXT_SSAIA: case KVM_RISCV_ISA_EXT_SSTC: case KVM_RISCV_ISA_EXT_SVINVAL: case KVM_RISCV_ISA_EXT_SVNAPOT: @@ -92,11 +94,15 @@ static bool kvm_riscv_vcpu_isa_disable_allowed(unsigned long ext) case KVM_RISCV_ISA_EXT_ZBB: case KVM_RISCV_ISA_EXT_ZBS: case KVM_RISCV_ISA_EXT_ZICNTR: + case KVM_RISCV_ISA_EXT_ZICOND: case KVM_RISCV_ISA_EXT_ZICSR: case KVM_RISCV_ISA_EXT_ZIFENCEI: case KVM_RISCV_ISA_EXT_ZIHINTPAUSE: case KVM_RISCV_ISA_EXT_ZIHPM: return false; + /* Extensions which can be disabled using Smstateen */ + case KVM_RISCV_ISA_EXT_SSAIA: + return riscv_has_extension_unlikely(RISCV_ISA_EXT_SMSTATEEN); default: break; } @@ -378,6 +384,34 @@ static int kvm_riscv_vcpu_general_set_csr(struct kvm_vcpu *vcpu, return 0; } +static inline int kvm_riscv_vcpu_smstateen_set_csr(struct kvm_vcpu *vcpu, + unsigned long reg_num, + unsigned long reg_val) +{ + struct kvm_vcpu_smstateen_csr *csr = &vcpu->arch.smstateen_csr; + + if (reg_num >= sizeof(struct kvm_riscv_smstateen_csr) / + sizeof(unsigned long)) + return -EINVAL; + + ((unsigned long *)csr)[reg_num] = reg_val; + return 0; +} + +static int kvm_riscv_vcpu_smstateen_get_csr(struct kvm_vcpu *vcpu, + unsigned long reg_num, + unsigned long *out_val) +{ + struct kvm_vcpu_smstateen_csr *csr = &vcpu->arch.smstateen_csr; + + if (reg_num >= sizeof(struct kvm_riscv_smstateen_csr) / + sizeof(unsigned long)) + return -EINVAL; + + *out_val = ((unsigned long *)csr)[reg_num]; + return 0; +} + static int kvm_riscv_vcpu_get_reg_csr(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { @@ -401,6 +435,12 @@ static int kvm_riscv_vcpu_get_reg_csr(struct kvm_vcpu *vcpu, case KVM_REG_RISCV_CSR_AIA: rc = kvm_riscv_vcpu_aia_get_csr(vcpu, reg_num, ®_val); break; + case KVM_REG_RISCV_CSR_SMSTATEEN: + rc = -EINVAL; + 
if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SMSTATEEN)) + rc = kvm_riscv_vcpu_smstateen_get_csr(vcpu, reg_num, + ®_val); + break; default: rc = -ENOENT; break; @@ -440,6 +480,12 @@ static int kvm_riscv_vcpu_set_reg_csr(struct kvm_vcpu *vcpu, case KVM_REG_RISCV_CSR_AIA: rc = kvm_riscv_vcpu_aia_set_csr(vcpu, reg_num, reg_val); break; + case KVM_REG_RISCV_CSR_SMSTATEEN: + rc = -EINVAL; + if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SMSTATEEN)) + rc = kvm_riscv_vcpu_smstateen_set_csr(vcpu, reg_num, + reg_val); +break; default: rc = -ENOENT; break; @@ -696,6 +742,8 @@ static inline unsigned long num_csr_regs(const struct kvm_vcpu *vcpu) if (riscv_isa_extension_available(vcpu->arch.isa, SSAIA)) n += sizeof(struct kvm_riscv_aia_csr) / sizeof(unsigned long); + if (riscv_isa_extension_available(vcpu->arch.isa, SMSTATEEN)) + n += sizeof(struct kvm_riscv_smstateen_csr) / sizeof(unsigned long); return n; } @@ -704,7 +752,7 @@ static int copy_csr_reg_indices(const struct kvm_vcpu *vcpu, u64 __user *uindices) { int n1 = sizeof(struct kvm_riscv_csr) / sizeof(unsigned long); - int n2 = 0; + int n2 = 0, n3 = 0; /* copy general csr regs */ for (int i = 0; i < n1; i++) { @@ -738,7 +786,25 @@ static int copy_csr_reg_indices(const struct kvm_vcpu *vcpu, } } - return n1 + n2; + /* copy Smstateen csr regs */ + if (riscv_isa_extension_available(vcpu->arch.isa, SMSTATEEN)) { + n3 = sizeof(struct kvm_riscv_smstateen_csr) / sizeof(unsigned long); + + for (int i = 0; i < n3; i++) { + u64 size = IS_ENABLED(CONFIG_32BIT) ? + KVM_REG_SIZE_U32 : KVM_REG_SIZE_U64; + u64 reg = KVM_REG_RISCV | size | KVM_REG_RISCV_CSR | + KVM_REG_RISCV_CSR_SMSTATEEN | i; + + if (uindices) { + if (put_user(reg, uindices)) + return -EFAULT; + uindices++; + } + } + } + + return n1 + n2 + n3; } static inline unsigned long num_timer_regs(void) diff --git a/arch/riscv/kvm/vcpu_sbi.c b/arch/riscv/kvm/vcpu_sbi.c index 9cd97091c723..a04ff98085d9 100644 --- a/arch/riscv/kvm/vcpu_sbi.c +++ b/arch/riscv/kvm/vcpu_sbi.c @@ -67,6 +67,10 @@ static const struct kvm_riscv_sbi_extension_entry sbi_ext[] = { .ext_ptr = &vcpu_sbi_ext_pmu, }, { + .ext_idx = KVM_RISCV_SBI_EXT_DBCN, + .ext_ptr = &vcpu_sbi_ext_dbcn, + }, + { .ext_idx = KVM_RISCV_SBI_EXT_EXPERIMENTAL, .ext_ptr = &vcpu_sbi_ext_experimental, }, @@ -155,14 +159,8 @@ static int riscv_vcpu_set_sbi_ext_single(struct kvm_vcpu *vcpu, if (!sext) return -ENOENT; - /* - * We can't set the extension status to available here, since it may - * have a probe() function which needs to confirm availability first, - * but it may be too early to call that here. We can set the status to - * unavailable, though. - */ - if (!reg_val) - scontext->ext_status[sext->ext_idx] = + scontext->ext_status[sext->ext_idx] = (reg_val) ? + KVM_RISCV_SBI_EXT_AVAILABLE : KVM_RISCV_SBI_EXT_UNAVAILABLE; return 0; @@ -188,16 +186,8 @@ static int riscv_vcpu_get_sbi_ext_single(struct kvm_vcpu *vcpu, if (!sext) return -ENOENT; - /* - * If the extension status is still uninitialized, then we should probe - * to determine if it's available, but it may be too early to do that - * here. The best we can do is report that the extension has not been - * disabled, i.e. we return 1 when the extension is available and also - * when it only may be available. 
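
The smstateen get/set helpers above expose a CSR block by treating the struct as an array of unsigned long and bounds-checking reg_num against the struct size, the same indexing idiom the other ONE_REG CSR groups use. A self-contained sketch of that idiom (the struct layout here is a hypothetical stand-in for kvm_riscv_smstateen_csr):

#include <errno.h>
#include <stdio.h>

/* Hypothetical one-member CSR block; any flat struct of longs works. */
struct smstateen_csr {
	unsigned long sstateen0;
};

static int set_csr(struct smstateen_csr *csr, unsigned long reg_num,
		   unsigned long reg_val)
{
	/* Reject indices past the end of the block. */
	if (reg_num >= sizeof(*csr) / sizeof(unsigned long))
		return -EINVAL;

	((unsigned long *)csr)[reg_num] = reg_val;
	return 0;
}

int main(void)
{
	struct smstateen_csr csr = { 0 };

	printf("%d\n", set_csr(&csr, 0, 0x1));	/* 0: in range */
	printf("%d\n", set_csr(&csr, 1, 0x1));	/* -EINVAL: out of range */
	return 0;
}
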
- */ - *reg_val = scontext->ext_status[sext->ext_idx] != - KVM_RISCV_SBI_EXT_UNAVAILABLE; - + *reg_val = scontext->ext_status[sext->ext_idx] == + KVM_RISCV_SBI_EXT_AVAILABLE; return 0; } @@ -337,18 +327,8 @@ const struct kvm_vcpu_sbi_extension *kvm_vcpu_sbi_find_ext( scontext->ext_status[entry->ext_idx] == KVM_RISCV_SBI_EXT_AVAILABLE) return ext; - if (scontext->ext_status[entry->ext_idx] == - KVM_RISCV_SBI_EXT_UNAVAILABLE) - return NULL; - if (ext->probe && !ext->probe(vcpu)) { - scontext->ext_status[entry->ext_idx] = - KVM_RISCV_SBI_EXT_UNAVAILABLE; - return NULL; - } - scontext->ext_status[entry->ext_idx] = - KVM_RISCV_SBI_EXT_AVAILABLE; - return ext; + return NULL; } } @@ -419,3 +399,26 @@ ecall_done: return ret; } + +void kvm_riscv_vcpu_sbi_init(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_sbi_context *scontext = &vcpu->arch.sbi_context; + const struct kvm_riscv_sbi_extension_entry *entry; + const struct kvm_vcpu_sbi_extension *ext; + int i; + + for (i = 0; i < ARRAY_SIZE(sbi_ext); i++) { + entry = &sbi_ext[i]; + ext = entry->ext_ptr; + + if (ext->probe && !ext->probe(vcpu)) { + scontext->ext_status[entry->ext_idx] = + KVM_RISCV_SBI_EXT_UNAVAILABLE; + continue; + } + + scontext->ext_status[entry->ext_idx] = ext->default_unavail ? + KVM_RISCV_SBI_EXT_UNAVAILABLE : + KVM_RISCV_SBI_EXT_AVAILABLE; + } +} diff --git a/arch/riscv/kvm/vcpu_sbi_replace.c b/arch/riscv/kvm/vcpu_sbi_replace.c index 7c4d5d38a339..23b57c931b15 100644 --- a/arch/riscv/kvm/vcpu_sbi_replace.c +++ b/arch/riscv/kvm/vcpu_sbi_replace.c @@ -175,3 +175,35 @@ const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_srst = { .extid_end = SBI_EXT_SRST, .handler = kvm_sbi_ext_srst_handler, }; + +static int kvm_sbi_ext_dbcn_handler(struct kvm_vcpu *vcpu, + struct kvm_run *run, + struct kvm_vcpu_sbi_return *retdata) +{ + struct kvm_cpu_context *cp = &vcpu->arch.guest_context; + unsigned long funcid = cp->a6; + + switch (funcid) { + case SBI_EXT_DBCN_CONSOLE_WRITE: + case SBI_EXT_DBCN_CONSOLE_READ: + case SBI_EXT_DBCN_CONSOLE_WRITE_BYTE: + /* + * The SBI debug console functions are unconditionally + * forwarded to the userspace. + */ + kvm_riscv_vcpu_sbi_forward(vcpu, run); + retdata->uexit = true; + break; + default: + retdata->err_val = SBI_ERR_NOT_SUPPORTED; + } + + return 0; +} + +const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_dbcn = { + .extid_start = SBI_EXT_DBCN, + .extid_end = SBI_EXT_DBCN, + .default_unavail = true, + .handler = kvm_sbi_ext_dbcn_handler, +}; diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 6115d7514972..90d4ba36d1d0 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -72,7 +72,7 @@ static inline void mm_fault_error(struct pt_regs *regs, unsigned long addr, vm_f } pagefault_out_of_memory(); return; - } else if (fault & VM_FAULT_SIGBUS) { + } else if (fault & (VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) { /* Kernel mode? 
Handle exceptions or die */ if (!user_mode(regs)) { no_context(regs, addr); diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c index 96225a8533ad..b52f0210481f 100644 --- a/arch/riscv/mm/hugetlbpage.c +++ b/arch/riscv/mm/hugetlbpage.c @@ -180,17 +180,25 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags) void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, - pte_t pte) + pte_t pte, + unsigned long sz) { + unsigned long hugepage_shift; int i, pte_num; - if (!pte_napot(pte)) { - set_pte_at(mm, addr, ptep, pte); - return; - } + if (sz >= PGDIR_SIZE) + hugepage_shift = PGDIR_SHIFT; + else if (sz >= P4D_SIZE) + hugepage_shift = P4D_SHIFT; + else if (sz >= PUD_SIZE) + hugepage_shift = PUD_SHIFT; + else if (sz >= PMD_SIZE) + hugepage_shift = PMD_SHIFT; + else + hugepage_shift = PAGE_SHIFT; - pte_num = napot_pte_num(napot_cont_order(pte)); - for (i = 0; i < pte_num; i++, ptep++, addr += PAGE_SIZE) + pte_num = sz >> hugepage_shift; + for (i = 0; i < pte_num; i++, ptep++, addr += (1 << hugepage_shift)) set_pte_at(mm, addr, ptep, pte); } diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index ecd3ae6f4116..8581693e62d3 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -245,7 +245,7 @@ static void __build_epilogue(bool is_tail_call, struct rv_jit_context *ctx) emit_addi(RV_REG_SP, RV_REG_SP, stack_adjust, ctx); /* Set return value. */ if (!is_tail_call) - emit_mv(RV_REG_A0, RV_REG_A5, ctx); + emit_addiw(RV_REG_A0, RV_REG_A5, 0, ctx); emit_jalr(RV_REG_ZERO, is_tail_call ? RV_REG_T3 : RV_REG_RA, is_tail_call ? (RV_FENTRY_NINSNS + 1) * 4 : 0, /* skip reserved nops and TCC init */ ctx); @@ -759,8 +759,10 @@ static int invoke_bpf_prog(struct bpf_tramp_link *l, int args_off, int retval_of if (ret) return ret; - if (save_ret) - emit_sd(RV_REG_FP, -retval_off, regmap[BPF_REG_0], ctx); + if (save_ret) { + emit_sd(RV_REG_FP, -retval_off, RV_REG_A0, ctx); + emit_sd(RV_REG_FP, -(retval_off - 8), regmap[BPF_REG_0], ctx); + } /* update branch with beqz */ if (ctx->insns) { @@ -853,7 +855,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, save_ret = flags & (BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_RET_FENTRY_RET); if (save_ret) { - stack_size += 8; + stack_size += 16; /* Save both A5 (BPF R0) and A0 */ retval_off = stack_size; } @@ -957,6 +959,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, if (ret) goto out; emit_sd(RV_REG_FP, -retval_off, RV_REG_A0, ctx); + emit_sd(RV_REG_FP, -(retval_off - 8), regmap[BPF_REG_0], ctx); im->ip_after_call = ctx->insns + ctx->ninsns; /* 2 nops reserved for auipc+jalr pair */ emit(rv_nop(), ctx); @@ -988,8 +991,10 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, if (flags & BPF_TRAMP_F_RESTORE_REGS) restore_args(nregs, args_off, ctx); - if (save_ret) + if (save_ret) { emit_ld(RV_REG_A0, -retval_off, RV_REG_FP, ctx); + emit_ld(regmap[BPF_REG_0], -(retval_off - 8), RV_REG_FP, ctx); + } emit_ld(RV_REG_S1, -sreg_off, RV_REG_FP, ctx); @@ -1515,7 +1520,8 @@ out_be: if (ret) return ret; - emit_mv(bpf_to_rv_reg(BPF_REG_0, ctx), RV_REG_A0, ctx); + if (insn->src_reg != BPF_PSEUDO_CALL) + emit_mv(bpf_to_rv_reg(BPF_REG_0, ctx), RV_REG_A0, ctx); break; } /* tail call */ diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c index 01257ce3b89c..442a74f113cb 100644 --- a/arch/s390/boot/vmem.c +++ b/arch/s390/boot/vmem.c @@ -57,6 +57,7 @@ static void kasan_populate_shadow(void) pmd_t pmd_z = 
__pmd(__pa(kasan_early_shadow_pte) | _SEGMENT_ENTRY); pud_t pud_z = __pud(__pa(kasan_early_shadow_pmd) | _REGION3_ENTRY); p4d_t p4d_z = __p4d(__pa(kasan_early_shadow_pud) | _REGION2_ENTRY); + unsigned long memgap_start = 0; unsigned long untracked_end; unsigned long start, end; int i; @@ -101,8 +102,12 @@ static void kasan_populate_shadow(void) * +- shadow end ----+---------+- shadow end ---+ */ - for_each_physmem_usable_range(i, &start, &end) + for_each_physmem_usable_range(i, &start, &end) { kasan_populate(start, end, POPULATE_KASAN_MAP_SHADOW); + if (memgap_start && physmem_info.info_source == MEM_DETECT_DIAG260) + kasan_populate(memgap_start, start, POPULATE_KASAN_ZERO_SHADOW); + memgap_start = end; + } if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) { untracked_end = VMALLOC_START; /* shallowly populate kasan shadow for vmalloc and modules */ diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index f07267875a19..deb198a61039 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -16,6 +16,8 @@ #define hugepages_supported() (MACHINE_HAS_EDAT1) void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned long sz); +void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); pte_t huge_ptep_get(pte_t *ptep); pte_t huge_ptep_get_and_clear(struct mm_struct *mm, @@ -65,7 +67,7 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, int changed = !pte_same(huge_ptep_get(ptep), pte); if (changed) { huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); - set_huge_pte_at(vma->vm_mm, addr, ptep, pte); + __set_huge_pte_at(vma->vm_mm, addr, ptep, pte); } return changed; } @@ -74,7 +76,7 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t pte = huge_ptep_get_and_clear(mm, addr, ptep); - set_huge_pte_at(mm, addr, ptep, pte_wrprotect(pte)); + __set_huge_pte_at(mm, addr, ptep, pte_wrprotect(pte)); } static inline pte_t mk_huge_pte(struct page *page, pgprot_t pgprot) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 427f9528a7b6..67a298b6cf6e 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -777,6 +777,13 @@ struct kvm_vm_stat { u64 inject_service_signal; u64 inject_virtio; u64 aen_forward; + u64 gmap_shadow_create; + u64 gmap_shadow_reuse; + u64 gmap_shadow_r1_entry; + u64 gmap_shadow_r2_entry; + u64 gmap_shadow_r3_entry; + u64 gmap_shadow_sg_entry; + u64 gmap_shadow_pg_entry; }; struct kvm_arch_memory_slot { diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 6d6bc19b37dc..ff8349d17b33 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -1382,6 +1382,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, unsigned long *pgt, int *dat_protection, int *fake) { + struct kvm *kvm; struct gmap *parent; union asce asce; union vaddress vaddr; @@ -1390,6 +1391,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, *fake = 0; *dat_protection = 0; + kvm = sg->private; parent = sg->parent; vaddr.addr = saddr; asce.val = sg->orig_asce; @@ -1450,6 +1452,7 @@ shadow_r2t: rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake); if (rc) return rc; + kvm->stat.gmap_shadow_r1_entry++; } fallthrough; case ASCE_TYPE_REGION2: { @@ -1478,6 +1481,7 @@ shadow_r3t: rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake); if (rc) return rc; + kvm->stat.gmap_shadow_r2_entry++; } fallthrough; case 
ASCE_TYPE_REGION3: { @@ -1515,6 +1519,7 @@ shadow_sgt: rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake); if (rc) return rc; + kvm->stat.gmap_shadow_r3_entry++; } fallthrough; case ASCE_TYPE_SEGMENT: { @@ -1548,6 +1553,7 @@ shadow_pgt: rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake); if (rc) return rc; + kvm->stat.gmap_shadow_sg_entry++; } } /* Return the parent address of the page table */ @@ -1618,6 +1624,7 @@ shadow_page: pte.p |= dat_protection; if (!rc) rc = gmap_shadow_page(sg, saddr, __pte(pte.val)); + vcpu->kvm->stat.gmap_shadow_pg_entry++; ipte_unlock(vcpu->kvm); mmap_read_unlock(sg->mm); return rc; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index c1b47d608a2b..efaebba5ee19 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -303,11 +303,6 @@ static inline u8 gisa_get_ipm_or_restore_iam(struct kvm_s390_gisa_interrupt *gi) return 0; } -static inline int gisa_in_alert_list(struct kvm_s390_gisa *gisa) -{ - return READ_ONCE(gisa->next_alert) != (u32)virt_to_phys(gisa); -} - static inline void gisa_set_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) { set_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); @@ -3216,11 +3211,12 @@ void kvm_s390_gisa_destroy(struct kvm *kvm) if (!gi->origin) return; - if (gi->alert.mask) - KVM_EVENT(3, "vm 0x%pK has unexpected iam 0x%02x", - kvm, gi->alert.mask); - while (gisa_in_alert_list(gi->origin)) - cpu_relax(); + WARN(gi->alert.mask != 0x00, + "unexpected non zero alert.mask 0x%02x", + gi->alert.mask); + gi->alert.mask = 0x00; + if (gisa_set_iam(gi->origin, gi->alert.mask)) + process_gib_alert_list(); hrtimer_cancel(&gi->timer); gi->origin = NULL; VM_EVENT(kvm, 3, "gisa 0x%pK destroyed", gisa); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index b3f17e014cab..11676b81e6bf 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -66,7 +66,14 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { STATS_DESC_COUNTER(VM, inject_pfault_done), STATS_DESC_COUNTER(VM, inject_service_signal), STATS_DESC_COUNTER(VM, inject_virtio), - STATS_DESC_COUNTER(VM, aen_forward) + STATS_DESC_COUNTER(VM, aen_forward), + STATS_DESC_COUNTER(VM, gmap_shadow_reuse), + STATS_DESC_COUNTER(VM, gmap_shadow_create), + STATS_DESC_COUNTER(VM, gmap_shadow_r1_entry), + STATS_DESC_COUNTER(VM, gmap_shadow_r2_entry), + STATS_DESC_COUNTER(VM, gmap_shadow_r3_entry), + STATS_DESC_COUNTER(VM, gmap_shadow_sg_entry), + STATS_DESC_COUNTER(VM, gmap_shadow_pg_entry), }; const struct kvm_stats_header kvm_vm_stats_header = { @@ -4053,6 +4060,8 @@ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, unsigned long prefix; unsigned long i; + trace_kvm_s390_gmap_notifier(start, end, gmap_is_shadow(gmap)); + if (gmap_is_shadow(gmap)) return; if (start >= 1UL << 31) diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h index 6f0209d45164..9ac92dbf680d 100644 --- a/arch/s390/kvm/trace-s390.h +++ b/arch/s390/kvm/trace-s390.h @@ -333,6 +333,29 @@ TRACE_EVENT(kvm_s390_airq_suppressed, __entry->id, __entry->isc) ); +/* + * Trace point for gmap notifier calls. 
+ */ +TRACE_EVENT(kvm_s390_gmap_notifier, + TP_PROTO(unsigned long start, unsigned long end, unsigned int shadow), + TP_ARGS(start, end, shadow), + + TP_STRUCT__entry( + __field(unsigned long, start) + __field(unsigned long, end) + __field(unsigned int, shadow) + ), + + TP_fast_assign( + __entry->start = start; + __entry->end = end; + __entry->shadow = shadow; + ), + + TP_printk("gmap notified (start:0x%lx end:0x%lx shadow:%d)", + __entry->start, __entry->end, __entry->shadow) + ); + #endif /* _TRACE_KVMS390_H */ diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 61499293c2ac..02dcbe82a8e5 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -1214,8 +1214,10 @@ static int acquire_gmap_shadow(struct kvm_vcpu *vcpu, * we're holding has been unshadowed. If the gmap is still valid, * we can safely reuse it. */ - if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat)) + if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat)) { + vcpu->kvm->stat.gmap_shadow_reuse++; return 0; + } /* release the old shadow - if any, and mark the prefix as unmapped */ release_gmap_shadow(vsie_page); @@ -1223,6 +1225,7 @@ static int acquire_gmap_shadow(struct kvm_vcpu *vcpu, if (IS_ERR(gmap)) return PTR_ERR(gmap); gmap->private = vcpu->kvm; + vcpu->kvm->stat.gmap_shadow_create++; WRITE_ONCE(vsie_page->gmap, gmap); return 0; } diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index c718f2a0de94..297a6d897d5a 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -142,7 +142,7 @@ static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste) __storage_key_init_range(paddr, paddr + size - 1); } -void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, +void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { unsigned long rste; @@ -163,6 +163,12 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, set_pte(ptep, __pte(rste)); } +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned long sz) +{ + __set_huge_pte_at(mm, addr, ptep, pte); +} + pte_t huge_ptep_get(pte_t *ptep) { return __rste_to_pte(pte_val(*ptep)); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index de2fb12120d2..e507692e51e7 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -2066,6 +2066,7 @@ struct bpf_tramp_jit { * func_addr's original caller */ int stack_size; /* Trampoline stack size */ + int backchain_off; /* Offset of backchain */ int stack_args_off; /* Offset of stack arguments for calling * func_addr, has to be at the top */ @@ -2086,9 +2087,10 @@ struct bpf_tramp_jit { * for __bpf_prog_enter() return value and * func_addr respectively */ - int r14_off; /* Offset of saved %r14 */ int run_ctx_off; /* Offset of struct bpf_tramp_run_ctx */ int tccnt_off; /* Offset of saved tailcall counter */ + int r14_off; /* Offset of saved %r14, has to be at the + * bottom */ int do_fexit; /* do_fexit: label */ }; @@ -2247,8 +2249,12 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, * Calculate the stack layout. */ - /* Reserve STACK_FRAME_OVERHEAD bytes for the callees. */ + /* + * Allocate STACK_FRAME_OVERHEAD bytes for the callees. As the s390x + * ABI requires, put our backchain at the end of the allocated memory. 
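
The trampoline layout code above reserves the callee register save area first, keeps the backchain slot in the last eight bytes of that area as the s390x ABI expects, and afterwards reclaims the caller-provided overhead minus the caller's own backchain slot. A simplified model of that offset accounting (alloc_stack() here is a stand-in for the JIT helper, not the real implementation):

#include <stdint.h>
#include <stdio.h>

#define STACK_FRAME_OVERHEAD 160	/* s390x ABI register save area */

struct tramp_layout {
	int stack_size;
	int backchain_off;
	int retval_off;
};

/* Grow the frame and return the offset of the new slot. */
static int alloc_stack(struct tramp_layout *t, int size)
{
	t->stack_size += size;
	return t->stack_size;
}

int main(void)
{
	struct tramp_layout t = { .stack_size = STACK_FRAME_OVERHEAD };

	/* Backchain lives in the last 8 bytes of the callee area. */
	t.backchain_off = t.stack_size - sizeof(uint64_t);
	t.retval_off = alloc_stack(&t, sizeof(uint64_t));

	/* The caller already provided STACK_FRAME_OVERHEAD bytes, of
	 * which 8 hold the caller's own backchain. */
	t.stack_size -= STACK_FRAME_OVERHEAD - sizeof(uint64_t);

	printf("frame=%d backchain@%d retval@%d\n",
	       t.stack_size, t.backchain_off, t.retval_off);
	return 0;
}
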
+ */ tjit->stack_size = STACK_FRAME_OVERHEAD; + tjit->backchain_off = tjit->stack_size - sizeof(u64); tjit->stack_args_off = alloc_stack(tjit, nr_stack_args * sizeof(u64)); tjit->reg_args_off = alloc_stack(tjit, nr_reg_args * sizeof(u64)); tjit->ip_off = alloc_stack(tjit, sizeof(u64)); @@ -2256,16 +2262,25 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, tjit->bpf_args_off = alloc_stack(tjit, nr_bpf_args * sizeof(u64)); tjit->retval_off = alloc_stack(tjit, sizeof(u64)); tjit->r7_r8_off = alloc_stack(tjit, 2 * sizeof(u64)); - tjit->r14_off = alloc_stack(tjit, sizeof(u64)); tjit->run_ctx_off = alloc_stack(tjit, sizeof(struct bpf_tramp_run_ctx)); tjit->tccnt_off = alloc_stack(tjit, sizeof(u64)); - /* The caller has already reserved STACK_FRAME_OVERHEAD bytes. */ - tjit->stack_size -= STACK_FRAME_OVERHEAD; + tjit->r14_off = alloc_stack(tjit, sizeof(u64) * 2); + /* + * In accordance with the s390x ABI, the caller has allocated + * STACK_FRAME_OVERHEAD bytes for us. 8 of them contain the caller's + * backchain, and the rest we can use. + */ + tjit->stack_size -= STACK_FRAME_OVERHEAD - sizeof(u64); tjit->orig_stack_args_off = tjit->stack_size + STACK_FRAME_OVERHEAD; + /* lgr %r1,%r15 */ + EMIT4(0xb9040000, REG_1, REG_15); /* aghi %r15,-stack_size */ EMIT4_IMM(0xa70b0000, REG_15, -tjit->stack_size); + /* stg %r1,backchain_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_1, REG_0, REG_15, + tjit->backchain_off); /* mvc tccnt_off(4,%r15),stack_size+STK_OFF_TCCNT(%r15) */ _EMIT6(0xd203f000 | tjit->tccnt_off, 0xf000 | (tjit->stack_size + STK_OFF_TCCNT)); @@ -2513,7 +2528,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, return -E2BIG; } - return ret; + return tjit.common.prg; } bool bpf_jit_supports_subprog_tailcalls(void) diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c index 2d9b01d7ca4c..99209085c75b 100644 --- a/arch/s390/pci/pci_dma.c +++ b/arch/s390/pci/pci_dma.c @@ -564,6 +564,17 @@ static void s390_dma_unmap_sg(struct device *dev, struct scatterlist *sg, s->dma_length = 0; } } + +static unsigned long *bitmap_vzalloc(size_t bits, gfp_t flags) +{ + size_t n = BITS_TO_LONGS(bits); + size_t bytes; + + if (unlikely(check_mul_overflow(n, sizeof(unsigned long), &bytes))) + return NULL; + + return vzalloc(bytes); +} int zpci_dma_init_device(struct zpci_dev *zdev) { @@ -604,13 +615,13 @@ int zpci_dma_init_device(struct zpci_dev *zdev) zdev->end_dma - zdev->start_dma + 1); zdev->end_dma = zdev->start_dma + zdev->iommu_size - 1; zdev->iommu_pages = zdev->iommu_size >> PAGE_SHIFT; - zdev->iommu_bitmap = vzalloc(zdev->iommu_pages / 8); + zdev->iommu_bitmap = bitmap_vzalloc(zdev->iommu_pages, GFP_KERNEL); if (!zdev->iommu_bitmap) { rc = -ENOMEM; goto free_dma_table; } if (!s390_iommu_strict) { - zdev->lazy_bitmap = vzalloc(zdev->iommu_pages / 8); + zdev->lazy_bitmap = bitmap_vzalloc(zdev->iommu_pages, GFP_KERNEL); if (!zdev->lazy_bitmap) { rc = -ENOMEM; goto free_bitmap; diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index 0a26cca24232..c714ca6a05aa 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h @@ -14,6 +14,8 @@ extern struct pud_huge_patch_entry __pud_huge_patch, __pud_huge_patch_end; #define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned long sz); +void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR @@ 
-32,7 +34,7 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t old_pte = *ptep; - set_huge_pte_at(mm, addr, ptep, pte_wrprotect(old_pte)); + __set_huge_pte_at(mm, addr, ptep, pte_wrprotect(old_pte)); } #define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS @@ -42,7 +44,7 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, { int changed = !pte_same(*ptep, pte); if (changed) { - set_huge_pte_at(vma->vm_mm, addr, ptep, pte); + __set_huge_pte_at(vma->vm_mm, addr, ptep, pte); flush_tlb_page(vma, addr); } return changed; diff --git a/arch/sparc/lib/checksum_32.S b/arch/sparc/lib/checksum_32.S index 84ad709cbecb..66eda40fce36 100644 --- a/arch/sparc/lib/checksum_32.S +++ b/arch/sparc/lib/checksum_32.S @@ -453,5 +453,5 @@ ccslow: cmp %g1, 0 * we only bother with faults on loads... */ cc_fault: - ret + retl clr %o0 diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index d7018823206c..b432500c13a5 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -328,7 +328,7 @@ pte_t *huge_pte_offset(struct mm_struct *mm, return pte_offset_huge(pmd, addr); } -void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, +void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t entry) { unsigned int nptes, orig_shift, shift; @@ -364,6 +364,12 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, orig_shift); } +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t entry, unsigned long sz) +{ + __set_huge_pte_at(mm, addr, ptep, entry); +} + pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index dc8c876fbd8f..80d76aea1f7b 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -103,6 +103,16 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, return ES_OK; } +static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size) +{ + return ES_OK; +} + +static bool fault_in_kernel_space(unsigned long address) +{ + return false; +} + #undef __init #define __init diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index abadd5f23425..e24976593a29 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -534,8 +534,12 @@ static void amd_pmu_cpu_reset(int cpu) /* Clear enable bits i.e. PerfCntrGlobalCtl.PerfCntrEn */ wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0); - /* Clear overflow bits i.e. PerfCntrGLobalStatus.PerfCntrOvfl */ - wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, amd_pmu_global_cntr_mask); + /* + * Clear freeze and overflow bits i.e. 
PerfCntrGLobalStatus.LbrFreeze + * and PerfCntrGLobalStatus.PerfCntrOvfl + */ + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, + GLOBAL_STATUS_LBRS_FROZEN | amd_pmu_global_cntr_mask); } static int amd_pmu_cpu_prepare(int cpu) @@ -570,6 +574,7 @@ static void amd_pmu_cpu_starting(int cpu) int i, nb_id; cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; + amd_pmu_cpu_reset(cpu); if (!x86_pmu.amd_nb_constraints) return; @@ -591,8 +596,6 @@ static void amd_pmu_cpu_starting(int cpu) cpuc->amd_nb->nb_id = nb_id; cpuc->amd_nb->refcnt++; - - amd_pmu_cpu_reset(cpu); } static void amd_pmu_cpu_dead(int cpu) @@ -601,6 +604,7 @@ static void amd_pmu_cpu_dead(int cpu) kfree(cpuhw->lbr_sel); cpuhw->lbr_sel = NULL; + amd_pmu_cpu_reset(cpu); if (!x86_pmu.amd_nb_constraints) return; @@ -613,8 +617,6 @@ static void amd_pmu_cpu_dead(int cpu) cpuhw->amd_nb = NULL; } - - amd_pmu_cpu_reset(cpu); } static inline void amd_pmu_set_global_ctl(u64 ctl) @@ -884,7 +886,7 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) struct hw_perf_event *hwc; struct perf_event *event; int handled = 0, idx; - u64 status, mask; + u64 reserved, status, mask; bool pmu_enabled; /* @@ -909,6 +911,14 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) status &= ~GLOBAL_STATUS_LBRS_FROZEN; } + reserved = status & ~amd_pmu_global_cntr_mask; + if (reserved) + pr_warn_once("Reserved PerfCntrGlobalStatus bits are set (0x%llx), please consider updating microcode\n", + reserved); + + /* Clear any reserved bits set by buggy microcode */ + status &= amd_pmu_global_cntr_mask; + for (idx = 0; idx < x86_pmu.num_counters; idx++) { if (!test_bit(idx, cpuc->active_mask)) continue; diff --git a/arch/x86/events/utils.c b/arch/x86/events/utils.c index 76b1f8bb0fd5..dab4ed199227 100644 --- a/arch/x86/events/utils.c +++ b/arch/x86/events/utils.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <asm/insn.h> +#include <linux/mm.h> #include "perf_event.h" @@ -132,9 +133,9 @@ static int get_branch_type(unsigned long from, unsigned long to, int abort, * The LBR logs any address in the IP, even if the IP just * faulted. This means userspace can control the from address. * Ensure we don't blindly read any address by validating it is - * a known text address. + * a known text address and not a vsyscall address. */ - if (kernel_text_address(from)) { + if (kernel_text_address(from) && !in_gate_area_no_mm(from)) { addr = (void *)from; /* * Assume we can get the maximum possible size diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 783ed339f341..21556ad87f4b 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -7,6 +7,8 @@ * Author : K. Y. 
Srinivasan <kys@microsoft.com> */ +#define pr_fmt(fmt) "Hyper-V: " fmt + #include <linux/efi.h> #include <linux/types.h> #include <linux/bitfield.h> @@ -191,7 +193,7 @@ void set_hv_tscchange_cb(void (*cb)(void)) struct hv_tsc_emulation_control emu_ctrl = {.enabled = 1}; if (!hv_reenlightenment_available()) { - pr_warn("Hyper-V: reenlightenment support is unavailable\n"); + pr_warn("reenlightenment support is unavailable\n"); return; } @@ -394,6 +396,7 @@ static void __init hv_get_partition_id(void) local_irq_restore(flags); } +#if IS_ENABLED(CONFIG_HYPERV_VTL_MODE) static u8 __init get_vtl(void) { u64 control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_REGISTERS; @@ -416,13 +419,16 @@ static u8 __init get_vtl(void) if (hv_result_success(ret)) { ret = output->as64.low & HV_X64_VTL_MASK; } else { - pr_err("Failed to get VTL(%lld) and set VTL to zero by default.\n", ret); - ret = 0; + pr_err("Failed to get VTL(error: %lld) exiting...\n", ret); + BUG(); } local_irq_restore(flags); return ret; } +#else +static inline u8 get_vtl(void) { return 0; } +#endif /* * This function is to be invoked early in the boot sequence after the @@ -564,7 +570,7 @@ skip_hypercall_pg_init: if (cpu_feature_enabled(X86_FEATURE_IBT) && *(u32 *)hv_hypercall_pg != gen_endbr()) { setup_clear_cpu_cap(X86_FEATURE_IBT); - pr_warn("Hyper-V: Disabling IBT because of Hyper-V bug\n"); + pr_warn("Disabling IBT because of Hyper-V bug\n"); } #endif @@ -604,8 +610,10 @@ skip_hypercall_pg_init: hv_query_ext_cap(0); /* Find the VTL */ - if (!ms_hyperv.paravisor_present && hv_isolation_type_snp()) - ms_hyperv.vtl = get_vtl(); + ms_hyperv.vtl = get_vtl(); + + if (ms_hyperv.vtl > 0) /* non default VTL */ + hv_vtl_early_init(); return; diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c index 36a562218010..999f5ac82fe9 100644 --- a/arch/x86/hyperv/hv_vtl.c +++ b/arch/x86/hyperv/hv_vtl.c @@ -215,7 +215,7 @@ static int hv_vtl_wakeup_secondary_cpu(int apicid, unsigned long start_eip) return hv_vtl_bringup_vcpu(vp_id, start_eip); } -static int __init hv_vtl_early_init(void) +int __init hv_vtl_early_init(void) { /* * `boot_cpu_has` returns the runtime feature support, @@ -230,4 +230,3 @@ static int __init hv_vtl_early_init(void) return 0; } -early_initcall(hv_vtl_early_init); diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index 3a233ebff712..25050d953eee 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -28,8 +28,6 @@ struct x86_cpu { }; #ifdef CONFIG_HOTPLUG_CPU -extern int arch_register_cpu(int num); -extern void arch_unregister_cpu(int); extern void soft_restart_cpu(void); #endif diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 58cb9495e40f..4af140cf5719 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -443,6 +443,7 @@ /* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */ #define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* "" No Nested Data Breakpoints */ +#define X86_FEATURE_WRMSR_XX_BASE_NS (20*32+ 1) /* "" WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing */ #define X86_FEATURE_LFENCE_RDTSC (20*32+ 2) /* "" LFENCE always serializing / synchronizes RDTSC */ #define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* "" Null Selector Clears Base */ #define X86_FEATURE_AUTOIBRS (20*32+ 8) /* "" Automatic IBRS */ diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 31089b851c4f..a2be3aefff9f 100644 --- a/arch/x86/include/asm/fpu/api.h +++ 
b/arch/x86/include/asm/fpu/api.h @@ -157,7 +157,8 @@ static inline void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) { static inline void fpu_sync_guest_vmexit_xfd_state(void) { } #endif -extern void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, unsigned int size, u32 pkru); +extern void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, + unsigned int size, u64 xfeatures, u32 pkru); extern int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, u64 xcr0, u32 *vpkru); static inline void fpstate_set_confidential(struct fpu_guest *gfpu) diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index 637fa1df3512..c715097e92fd 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -69,6 +69,8 @@ struct legacy_pic { void (*make_irq)(unsigned int irq); }; +void legacy_pic_pcat_compat(void); + extern struct legacy_pic *legacy_pic; extern struct legacy_pic null_legacy_pic; diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 5fcd85fd64fd..197316121f04 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -27,6 +27,7 @@ * _X - regular server parts * _D - micro server parts * _N,_P - other mobile parts + * _H - premium mobile parts * _S - other client parts * * Historical OPTDIFFs: @@ -124,6 +125,7 @@ #define INTEL_FAM6_METEORLAKE 0xAC #define INTEL_FAM6_METEORLAKE_L 0xAA +#define INTEL_FAM6_ARROWLAKE_H 0xC5 #define INTEL_FAM6_ARROWLAKE 0xC6 #define INTEL_FAM6_LUNARLAKE_M 0xBD diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index e3054e3e46d5..26b628d84594 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -108,6 +108,7 @@ KVM_X86_OP_OPTIONAL(vcpu_blocking) KVM_X86_OP_OPTIONAL(vcpu_unblocking) KVM_X86_OP_OPTIONAL(pi_update_irte) KVM_X86_OP_OPTIONAL(pi_start_assignment) +KVM_X86_OP_OPTIONAL(apicv_pre_state_restore) KVM_X86_OP_OPTIONAL(apicv_post_state_restore) KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt) KVM_X86_OP_OPTIONAL(set_hv_timer) @@ -126,7 +127,7 @@ KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from) KVM_X86_OP_OPTIONAL(vm_move_enc_context_from) KVM_X86_OP_OPTIONAL(guest_memory_reclaimed) KVM_X86_OP(get_msr_feature) -KVM_X86_OP(can_emulate_instruction) +KVM_X86_OP(check_emulate_instruction) KVM_X86_OP(apic_init_signal_blocked) KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush) KVM_X86_OP_OPTIONAL(migrate_timers) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 17715cb8731d..d7036982332e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -39,7 +39,15 @@ #define __KVM_HAVE_ARCH_VCPU_DEBUGFS +/* + * CONFIG_KVM_MAX_NR_VCPUS is defined iff CONFIG_KVM!=n, provide a dummy max if + * KVM is disabled (arbitrarily use the default from CONFIG_KVM_MAX_NR_VCPUS). 
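
KVM_MAX_VCPUS above now follows CONFIG_KVM_MAX_NR_VCPUS when KVM is configured and falls back to a built-in default otherwise; this is the standard Kconfig-default preprocessor pattern, shown standalone:

#include <stdio.h>

/* Fall back to a compiled-in default when the Kconfig-provided macro
 * is absent (e.g. KVM disabled), same shape as the hunk above. */
#ifdef CONFIG_KVM_MAX_NR_VCPUS
#define KVM_MAX_VCPUS CONFIG_KVM_MAX_NR_VCPUS
#else
#define KVM_MAX_VCPUS 1024
#endif

int main(void)
{
	printf("max vcpus: %d\n", KVM_MAX_VCPUS);
	return 0;
}
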
+ */ +#ifdef CONFIG_KVM_MAX_NR_VCPUS +#define KVM_MAX_VCPUS CONFIG_KVM_MAX_NR_VCPUS +#else #define KVM_MAX_VCPUS 1024 +#endif /* * In x86, the VCPU ID corresponds to the APIC ID, and APIC IDs @@ -528,7 +536,6 @@ struct kvm_pmu { u64 raw_event_mask; struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC]; struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED]; - struct irq_work irq_work; /* * Overlay the bitmap with a 64-bit atomic so that all bits can be @@ -680,6 +687,7 @@ struct kvm_hypervisor_cpuid { u32 limit; }; +#ifdef CONFIG_KVM_XEN /* Xen HVM per vcpu emulation context */ struct kvm_vcpu_xen { u64 hypercall_rip; @@ -702,6 +710,7 @@ struct kvm_vcpu_xen { struct timer_list poll_timer; struct kvm_hypervisor_cpuid cpuid; }; +#endif struct kvm_queued_exception { bool pending; @@ -930,8 +939,9 @@ struct kvm_vcpu_arch { bool hyperv_enabled; struct kvm_vcpu_hv *hyperv; +#ifdef CONFIG_KVM_XEN struct kvm_vcpu_xen xen; - +#endif cpumask_var_t wbinvd_dirty_mask; unsigned long last_retry_eip; @@ -1276,7 +1286,6 @@ struct kvm_arch { */ spinlock_t mmu_unsync_pages_lock; - struct list_head assigned_dev_head; struct iommu_domain *iommu_domain; bool iommu_noncoherent; #define __KVM_HAVE_ARCH_NONCOHERENT_DMA @@ -1324,6 +1333,7 @@ struct kvm_arch { int nr_vcpus_matched_tsc; u32 default_tsc_khz; + bool user_set_tsc; seqcount_raw_spinlock_t pvclock_sc; bool use_master_clock; @@ -1692,7 +1702,7 @@ struct kvm_x86_ops { void (*request_immediate_exit)(struct kvm_vcpu *vcpu); - void (*sched_in)(struct kvm_vcpu *kvm, int cpu); + void (*sched_in)(struct kvm_vcpu *vcpu, int cpu); /* * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A zero @@ -1709,6 +1719,7 @@ struct kvm_x86_ops { int (*pi_update_irte)(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); void (*pi_start_assignment)(struct kvm *kvm); + void (*apicv_pre_state_restore)(struct kvm_vcpu *vcpu); void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu); @@ -1734,8 +1745,8 @@ struct kvm_x86_ops { int (*get_msr_feature)(struct kvm_msr_entry *entry); - bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type, - void *insn, int insn_len); + int (*check_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len); bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu); int (*enable_l2_tlb_flush)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 033b53f993c6..896445edc6a8 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -340,8 +340,10 @@ static inline u64 hv_get_non_nested_register(unsigned int reg) { return 0; } #ifdef CONFIG_HYPERV_VTL_MODE void __init hv_vtl_init_platform(void); +int __init hv_vtl_early_init(void); #else static inline void __init hv_vtl_init_platform(void) {} +static inline int __init hv_vtl_early_init(void) { return 0; } #endif #include <asm-generic/mshyperv.h> diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 1d111350197f..389f9594746e 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -553,6 +553,7 @@ #define MSR_AMD64_CPUID_FN_1 0xc0011004 #define MSR_AMD64_LS_CFG 0xc0011020 #define MSR_AMD64_DC_CFG 0xc0011022 +#define MSR_AMD64_TW_CFG 0xc0011023 #define MSR_AMD64_DE_CFG 0xc0011029 #define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT 1 @@ -637,12 +638,17 @@ /* AMD Last Branch Record MSRs */ #define MSR_AMD64_LBR_SELECT 0xc000010e -/* Fam 17h MSRs */ 
-#define MSR_F17H_IRPERF 0xc00000e9 +/* Zen4 */ +#define MSR_ZEN4_BP_CFG 0xc001102e +#define MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT 5 +/* Zen 2 */ #define MSR_ZEN2_SPECTRAL_CHICKEN 0xc00110e3 #define MSR_ZEN2_SPECTRAL_CHICKEN_BIT BIT_ULL(1) +/* Fam 17h MSRs */ +#define MSR_F17H_IRPERF 0xc00000e9 + /* Fam 16h MSRs */ #define MSR_F16H_L2I_PERF_CTL 0xc0010230 #define MSR_F16H_L2I_PERF_CTR 0xc0010231 diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index d6ad98ca1288..e02b179ec659 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -955,6 +955,14 @@ static inline int pte_same(pte_t a, pte_t b) return a.pte == b.pte; } +static inline pte_t pte_next_pfn(pte_t pte) +{ + if (__pte_needs_invert(pte_val(pte))) + return __pte(pte_val(pte) - (1UL << PFN_PTE_SHIFT)); + return __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT)); +} +#define pte_next_pfn pte_next_pfn + static inline int pte_present(pte_t a) { return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index ad98dd1d9cfb..c31c633419fe 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -129,7 +129,6 @@ void native_smp_send_reschedule(int cpu); void native_send_call_func_ipi(const struct cpumask *mask); void native_send_call_func_single_ipi(int cpu); -bool smp_park_other_cpus_in_init(void); void smp_store_cpu_info(int id); asmlinkage __visible void smp_reboot_interrupt(void); diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 19bf955b67e0..3ac0ffc4f3e2 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -268,6 +268,7 @@ enum avic_ipi_failure_cause { AVIC_IPI_FAILURE_TARGET_NOT_RUNNING, AVIC_IPI_FAILURE_INVALID_TARGET, AVIC_IPI_FAILURE_INVALID_BACKING_PAGE, + AVIC_IPI_FAILURE_INVALID_IPI_VECTOR, }; #define AVIC_PHYSICAL_MAX_INDEX_MASK GENMASK_ULL(8, 0) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 2a0ea38955df..c55c0ef47a18 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -148,6 +148,9 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) pr_debug("Local APIC address 0x%08x\n", madt->address); } + if (madt->flags & ACPI_MADT_PCAT_COMPAT) + legacy_pic_pcat_compat(); + /* ACPI 6.3 and newer support the online capable bit. */ if (acpi_gbl_FADT.header.revision > 6 || (acpi_gbl_FADT.header.revision == 6 && diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 517ee01503be..73be3931e4f0 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -403,6 +403,17 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, u8 insn_buff[MAX_PATCH_LEN]; DPRINTK(ALT, "alt table %px, -> %px", start, end); + + /* + * In the case CONFIG_X86_5LEVEL=y, KASAN_SHADOW_START is defined using + * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here. + * During the process, KASAN becomes confused seeing partial LA57 + * conversion and triggers a false-positive out-of-bound report. + * + * Disable KASAN until the patching is complete. + */ + kasan_disable_current(); + /* * The scan order should be from start to end. A later scanned * alternative code can overwrite previously scanned alternative code. 
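
The apply_alternatives() change further down suppresses KASAN for the duration of patching because the X86_FEATURE_LA57 sites it rewrites feed KASAN_SHADOW_START. kasan_disable_current()/kasan_enable_current() only bump and drop a per-task depth counter; a userspace model of the bracket (the counter is a hypothetical stand-in for current->kasan_depth):

#include <stdio.h>

static int kasan_depth;	/* stand-in for the per-task counter */

static void kasan_disable_current(void) { kasan_depth++; }
static void kasan_enable_current(void)  { kasan_depth--; }

static void patch_alternatives(void)
{
	kasan_disable_current();
	/* ... rewrite instructions; shadow checks stay quiet here ... */
	kasan_enable_current();
}

int main(void)
{
	patch_alternatives();
	printf("depth back to %d\n", kasan_depth);
	return 0;
}
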
@@ -452,6 +463,8 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, text_poke_early(instr, insn_buff, insn_buff_sz); } + + kasan_enable_current(); } static inline bool is_jcc32(struct insn *insn) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 03ef962a6992..ece2b5b7b0fe 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -80,6 +80,10 @@ static const int amd_div0[] = AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x00, 0x0, 0x2f, 0xf), AMD_MODEL_RANGE(0x17, 0x50, 0x0, 0x5f, 0xf)); +static const int amd_erratum_1485[] = + AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x19, 0x10, 0x0, 0x1f, 0xf), + AMD_MODEL_RANGE(0x19, 0x60, 0x0, 0xaf, 0xf)); + static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) { int osvw_id = *erratum++; @@ -1149,6 +1153,10 @@ static void init_amd(struct cpuinfo_x86 *c) pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n"); setup_force_cpu_bug(X86_BUG_DIV0); } + + if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && + cpu_has_amd_erratum(c, amd_erratum_1485)) + msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 382d4e6b848d..4e5ffc8b0e46 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1303,7 +1303,7 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_AMD(0x15, RETBLEED), VULNBL_AMD(0x16, RETBLEED), VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO), - VULNBL_HYGON(0x18, RETBLEED | SMT_RSB), + VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO), VULNBL_AMD(0x19, SRSO), {} }; diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index ded1fc7cb7cb..f136ac046851 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -30,15 +30,15 @@ struct rmid_entry { struct list_head list; }; -/** - * @rmid_free_lru A least recently used list of free RMIDs +/* + * @rmid_free_lru - A least recently used list of free RMIDs * These RMIDs are guaranteed to have an occupancy less than the * threshold occupancy */ static LIST_HEAD(rmid_free_lru); -/** - * @rmid_limbo_count count of currently unused but (potentially) +/* + * @rmid_limbo_count - count of currently unused but (potentially) * dirty RMIDs. * This counts RMIDs that no one is currently using but that * may have a occupancy value > resctrl_rmid_realloc_threshold. User can @@ -46,7 +46,7 @@ static LIST_HEAD(rmid_free_lru); */ static unsigned int rmid_limbo_count; -/** +/* * @rmid_entry - The entry in the limbo and free lists. */ static struct rmid_entry *rmid_ptrs; diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 91fa70e51004..279148e72459 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -235,6 +235,21 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page, return epc_page; } +/* + * Ensure the SECS page is not swapped out. Must be called with encl->lock + * to protect the enclave states including SECS and ensure the SECS page is + * not swapped out again while being used. 
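
The sgx_encl_load_secs() helper introduced below centralizes the reuse-or-reload decision for the SECS page so both the page-fault path and the EAUG path make it before touching encl->secs.epc_page. A standalone sketch of that shape (types and the always-successful reload are hypothetical simplifications of the ELDU path):

#include <stdio.h>

/* Hypothetical stand-ins for the enclave structures. */
struct epc_page { int id; };
struct encl_model { struct epc_page *secs_page; };

/* Pretend ELDU always succeeds and makes the SECS page resident. */
static struct epc_page *eldu_secs(struct encl_model *e)
{
	static struct epc_page page = { .id = 1 };

	e->secs_page = &page;
	return &page;
}

/* Reuse the resident SECS page, or reload it if it was swapped out;
 * callers hold the enclave lock so it cannot vanish underneath them. */
static struct epc_page *load_secs(struct encl_model *e)
{
	struct epc_page *p = e->secs_page;

	if (!p)
		p = eldu_secs(e);
	return p;
}

int main(void)
{
	struct encl_model e = { .secs_page = NULL };

	printf("secs id=%d\n", load_secs(&e)->id);
	return 0;
}
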
+ */ +static struct sgx_epc_page *sgx_encl_load_secs(struct sgx_encl *encl) +{ + struct sgx_epc_page *epc_page = encl->secs.epc_page; + + if (!epc_page) + epc_page = sgx_encl_eldu(&encl->secs, NULL); + + return epc_page; +} + static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, struct sgx_encl_page *entry) { @@ -248,11 +263,9 @@ static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, return entry; } - if (!(encl->secs.epc_page)) { - epc_page = sgx_encl_eldu(&encl->secs, NULL); - if (IS_ERR(epc_page)) - return ERR_CAST(epc_page); - } + epc_page = sgx_encl_load_secs(encl); + if (IS_ERR(epc_page)) + return ERR_CAST(epc_page); epc_page = sgx_encl_eldu(entry, encl->secs.epc_page); if (IS_ERR(epc_page)) @@ -339,6 +352,13 @@ static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma, mutex_lock(&encl->lock); + epc_page = sgx_encl_load_secs(encl); + if (IS_ERR(epc_page)) { + if (PTR_ERR(epc_page) == -EBUSY) + vmret = VM_FAULT_NOPAGE; + goto err_out_unlock; + } + epc_page = sgx_alloc_epc_page(encl_page, false); if (IS_ERR(epc_page)) { if (PTR_ERR(epc_page) == -EBUSY) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index a86d37052a64..a21a4d0ecc34 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -369,14 +369,15 @@ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate); void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, - unsigned int size, u32 pkru) + unsigned int size, u64 xfeatures, u32 pkru) { struct fpstate *kstate = gfpu->fpstate; union fpregs_state *ustate = buf; struct membuf mb = { .p = buf, .left = size }; if (cpu_feature_enabled(X86_FEATURE_XSAVE)) { - __copy_xstate_to_uabi_buf(mb, kstate, pkru, XSTATE_COPY_XSAVE); + __copy_xstate_to_uabi_buf(mb, kstate, xfeatures, pkru, + XSTATE_COPY_XSAVE); } else { memcpy(&ustate->fxsave, &kstate->regs.fxsave, sizeof(ustate->fxsave)); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index cadf68737e6b..ef6906107c54 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1049,6 +1049,7 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer * @to: membuf descriptor * @fpstate: The fpstate buffer from which to copy + * @xfeatures: The mask of xfeatures to save (XSAVE mode only) * @pkru_val: The PKRU value to store in the PKRU component * @copy_mode: The requested copy mode * @@ -1059,7 +1060,8 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, * It supports partial copy but @to.pos always starts from zero. 
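
With the extra @xfeatures argument above, the XSAVE copy path intersects the saved component set with both the fpstate's user features and the caller's mask, which is what lets KVM hide components a guest does not support. The masking step in isolation (bit assignments here are illustrative, not the real XSTATE component numbering):

#include <stdint.h>
#include <stdio.h>

#define XFEATURE_MASK_FP	(1ull << 0)	/* illustrative only */
#define XFEATURE_MASK_SSE	(1ull << 1)
#define XFEATURE_MASK_YMM	(1ull << 2)

int main(void)
{
	uint64_t saved          = XFEATURE_MASK_FP | XFEATURE_MASK_SSE |
				  XFEATURE_MASK_YMM;	/* header.xfeatures */
	uint64_t user_xfeatures = XFEATURE_MASK_FP | XFEATURE_MASK_SSE |
				  XFEATURE_MASK_YMM;
	uint64_t xfeatures      = XFEATURE_MASK_FP | XFEATURE_MASK_SSE;

	/* XSTATE_COPY_XSAVE case from the hunk above: both masks apply. */
	saved &= user_xfeatures & xfeatures;

	printf("uabi xfeatures: %#llx\n", (unsigned long long)saved);
	return 0;
}
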
*/ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, - u32 pkru_val, enum xstate_copy_mode copy_mode) + u64 xfeatures, u32 pkru_val, + enum xstate_copy_mode copy_mode) { const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); struct xregs_state *xinit = &init_fpstate.regs.xsave; @@ -1083,7 +1085,7 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, break; case XSTATE_COPY_XSAVE: - header.xfeatures &= fpstate->user_xfeatures; + header.xfeatures &= fpstate->user_xfeatures & xfeatures; break; } @@ -1185,6 +1187,7 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, enum xstate_copy_mode copy_mode) { __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate, + tsk->thread.fpu.fpstate->user_xfeatures, tsk->thread.pkru, copy_mode); } @@ -1536,10 +1539,7 @@ static int fpstate_realloc(u64 xfeatures, unsigned int ksize, fpregs_restore_userregs(); newfps->xfeatures = curfps->xfeatures | xfeatures; - - if (!guest_fpu) - newfps->user_xfeatures = curfps->user_xfeatures | xfeatures; - + newfps->user_xfeatures = curfps->user_xfeatures | xfeatures; newfps->xfd = curfps->xfd & ~xfeatures; /* Do the final updates within the locked region */ diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index a4ecb04d8d64..3518fb26d06b 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -43,7 +43,8 @@ enum xstate_copy_mode { struct membuf; extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, - u32 pkru_val, enum xstate_copy_mode copy_mode); + u64 xfeatures, u32 pkru_val, + enum xstate_copy_mode copy_mode); extern void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, enum xstate_copy_mode mode); extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru); diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 30a55207c000..c20d1832c481 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -32,6 +32,7 @@ */ static void init_8259A(int auto_eoi); +static bool pcat_compat __ro_after_init; static int i8259A_auto_eoi; DEFINE_RAW_SPINLOCK(i8259A_lock); @@ -299,15 +300,32 @@ static void unmask_8259A(void) static int probe_8259A(void) { + unsigned char new_val, probe_val = ~(1 << PIC_CASCADE_IR); unsigned long flags; - unsigned char probe_val = ~(1 << PIC_CASCADE_IR); - unsigned char new_val; + + /* + * If MADT has the PCAT_COMPAT flag set, then do not bother probing + * for the PIC. Some BIOSes leave the PIC uninitialized and probing + * fails. + * + * Right now this causes problems as quite some code depends on + * nr_legacy_irqs() > 0 or has_legacy_pic() == true. This is silly + * when the system has an IO/APIC because then PIC is not required + * at all, except for really old machines where the timer interrupt + * must be routed through the PIC. So just pretend that the PIC is + * there and let legacy_pic->init() initialize it for nothing. + * + * Alternatively this could just try to initialize the PIC and + * repeat the probe, but for cases where there is no PIC that's + * just pointless. + */ + if (pcat_compat) + return nr_legacy_irqs(); + /* - * Check to see if we have a PIC. - * Mask all except the cascade and read - * back the value we just wrote. If we don't - * have a PIC, we will read 0xff as opposed to the - * value we wrote. + * Check to see if we have a PIC. Mask all except the cascade and + * read back the value we just wrote. 
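
probe_8259A() detects the PIC by writing a mask that leaves only the cascade line enabled and reading it back: with no device decoding the port, the data lines float and the read returns 0xff. A userspace model of that write/read-back check (port accesses are replaced by a variable, and pic_present is a hypothetical toggle):

#include <stdio.h>

#define PIC_CASCADE_IR 2

/* Stand-in for the 8259A interrupt-mask register at port 0x21; with no
 * device present, reads return 0xff. */
static int pic_present = 1;
static unsigned char imr;

static void outb_imr(unsigned char v) { if (pic_present) imr = v; }
static unsigned char inb_imr(void)    { return pic_present ? imr : 0xff; }

int main(void)
{
	unsigned char probe_val = ~(1 << PIC_CASCADE_IR);

	outb_imr(probe_val);
	printf("PIC %s\n", inb_imr() == probe_val ? "found" : "absent");
	return 0;
}
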
If we don't have a PIC, we + * will read 0xff as opposed to the value we wrote. */ raw_spin_lock_irqsave(&i8259A_lock, flags); @@ -429,5 +447,9 @@ static int __init i8259A_init_ops(void) return 0; } - device_initcall(i8259A_init_ops); + +void __init legacy_pic_pcat_compat(void) +{ + pcat_compat = true; +} diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 3a43a2dee658..9c9faa1634fb 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -695,7 +695,6 @@ void kgdb_arch_exit(void) } /** - * * kgdb_skipexception - Bail out of KGDB when we've been triggered. * @exception: Exception vector number * @regs: Current &struct pt_regs. diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 2eabccde94fb..ccb0915e84e1 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -256,7 +256,7 @@ static int __sev_cpuid_hv(u32 fn, int reg_idx, u32 *reg) return 0; } -static int sev_cpuid_hv(struct cpuid_leaf *leaf) +static int __sev_cpuid_hv_msr(struct cpuid_leaf *leaf) { int ret; @@ -279,6 +279,45 @@ static int sev_cpuid_hv(struct cpuid_leaf *leaf) return ret; } +static int __sev_cpuid_hv_ghcb(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) +{ + u32 cr4 = native_read_cr4(); + int ret; + + ghcb_set_rax(ghcb, leaf->fn); + ghcb_set_rcx(ghcb, leaf->subfn); + + if (cr4 & X86_CR4_OSXSAVE) + /* Safe to read xcr0 */ + ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK)); + else + /* xgetbv will cause #UD - use reset value for xcr0 */ + ghcb_set_xcr0(ghcb, 1); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && + ghcb_rbx_is_valid(ghcb) && + ghcb_rcx_is_valid(ghcb) && + ghcb_rdx_is_valid(ghcb))) + return ES_VMM_ERROR; + + leaf->eax = ghcb->save.rax; + leaf->ebx = ghcb->save.rbx; + leaf->ecx = ghcb->save.rcx; + leaf->edx = ghcb->save.rdx; + + return ES_OK; +} + +static int sev_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) +{ + return ghcb ? __sev_cpuid_hv_ghcb(ghcb, ctxt, leaf) + : __sev_cpuid_hv_msr(leaf); +} + /* * This may be called early while still running on the initial identity * mapping. 
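
sev_cpuid_hv() above routes a CPUID request through the GHCB page once one has been established and otherwise falls back to the slower MSR protocol, so the early identity-mapped path can simply pass NULL. The dispatch shape, sketched with hypothetical channel functions in place of the real protocol code:

#include <stdio.h>

struct ghcb;	/* opaque; established later in boot */
struct cpuid_leaf { unsigned int fn, subfn, eax, ebx, ecx, edx; };

/* Hypothetical stand-ins for the two hypervisor CPUID channels. */
static int cpuid_via_msr(struct cpuid_leaf *leaf)
{
	leaf->eax = 0x1;	/* pretend the HV answered */
	return 0;
}

static int cpuid_via_ghcb(struct ghcb *ghcb, struct cpuid_leaf *leaf)
{
	(void)ghcb;
	leaf->eax = 0x2;
	return 0;
}

/* One entry point; early (no GHCB yet) and late callers both use it. */
static int sev_cpuid(struct ghcb *ghcb, struct cpuid_leaf *leaf)
{
	return ghcb ? cpuid_via_ghcb(ghcb, leaf) : cpuid_via_msr(leaf);
}

int main(void)
{
	struct cpuid_leaf leaf = { .fn = 1 };

	sev_cpuid(NULL, &leaf);	/* early-boot path */
	printf("eax=%#x\n", leaf.eax);
	return 0;
}
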
@@ -388,19 +427,20 @@ snp_cpuid_get_validated_func(struct cpuid_leaf *leaf)
 	return false;
 }
 
-static void snp_cpuid_hv(struct cpuid_leaf *leaf)
+static void snp_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
 {
-	if (sev_cpuid_hv(leaf))
+	if (sev_cpuid_hv(ghcb, ctxt, leaf))
 		sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV);
 }
 
-static int snp_cpuid_postprocess(struct cpuid_leaf *leaf)
+static int snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
+				 struct cpuid_leaf *leaf)
 {
 	struct cpuid_leaf leaf_hv = *leaf;
 
 	switch (leaf->fn) {
 	case 0x1:
-		snp_cpuid_hv(&leaf_hv);
+		snp_cpuid_hv(ghcb, ctxt, &leaf_hv);
 
 		/* initial APIC ID */
 		leaf->ebx = (leaf_hv.ebx & GENMASK(31, 24)) | (leaf->ebx & GENMASK(23, 0));
@@ -419,7 +459,7 @@ static int snp_cpuid_postprocess(struct cpuid_leaf *leaf)
 		break;
 	case 0xB:
 		leaf_hv.subfn = 0;
-		snp_cpuid_hv(&leaf_hv);
+		snp_cpuid_hv(ghcb, ctxt, &leaf_hv);
 
 		/* extended APIC ID */
 		leaf->edx = leaf_hv.edx;
@@ -467,7 +507,7 @@ static int snp_cpuid_postprocess(struct cpuid_leaf *leaf)
 		}
 		break;
 	case 0x8000001E:
-		snp_cpuid_hv(&leaf_hv);
+		snp_cpuid_hv(ghcb, ctxt, &leaf_hv);
 
 		/* extended APIC ID */
 		leaf->eax = leaf_hv.eax;
@@ -488,7 +528,7 @@ static int snp_cpuid_postprocess(struct cpuid_leaf *leaf)
 * Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value
 * should be treated as fatal by caller.
 */
-static int snp_cpuid(struct cpuid_leaf *leaf)
+static int snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
 {
 	const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
 
@@ -522,7 +562,7 @@ static int snp_cpuid(struct cpuid_leaf *leaf)
 		return 0;
 	}
 
-	return snp_cpuid_postprocess(leaf);
+	return snp_cpuid_postprocess(ghcb, ctxt, leaf);
 }
 
 /*
@@ -544,14 +584,14 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
 	leaf.fn = fn;
 	leaf.subfn = subfn;
 
-	ret = snp_cpuid(&leaf);
+	ret = snp_cpuid(NULL, NULL, &leaf);
 	if (!ret)
 		goto cpuid_done;
 
 	if (ret != -EOPNOTSUPP)
 		goto fail;
 
-	if (sev_cpuid_hv(&leaf))
+	if (__sev_cpuid_hv_msr(&leaf))
 		goto fail;
 
 cpuid_done:
@@ -592,6 +632,23 @@ fail:
 	sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
 }
 
+static enum es_result vc_insn_string_check(struct es_em_ctxt *ctxt,
+					   unsigned long address,
+					   bool write)
+{
+	if (user_mode(ctxt->regs) && fault_in_kernel_space(address)) {
+		ctxt->fi.vector     = X86_TRAP_PF;
+		ctxt->fi.error_code = X86_PF_USER;
+		ctxt->fi.cr2        = address;
+		if (write)
+			ctxt->fi.error_code |= X86_PF_WRITE;
+
+		return ES_EXCEPTION;
+	}
+
+	return ES_OK;
+}
+
 static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt,
 					  void *src, char *buf,
 					  unsigned int data_size,
@@ -599,7 +656,12 @@ static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt,
 					  bool backwards)
 {
 	int i, b = backwards ? -1 : 1;
-	enum es_result ret = ES_OK;
+	unsigned long address = (unsigned long)src;
+	enum es_result ret;
+
+	ret = vc_insn_string_check(ctxt, address, false);
+	if (ret != ES_OK)
+		return ret;
 
 	for (i = 0; i < count; i++) {
 		void *s = src + (i * data_size * b);
@@ -620,7 +682,12 @@ static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt,
 					   bool backwards)
 {
 	int i, s = backwards ? -1 : 1;
-	enum es_result ret = ES_OK;
+	unsigned long address = (unsigned long)dst;
+	enum es_result ret;
+
+	ret = vc_insn_string_check(ctxt, address, true);
+	if (ret != ES_OK)
+		return ret;
 
 	for (i = 0; i < count; i++) {
 		void *d = dst + (i * data_size * s);
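The 'backwards' flag threaded through both string helpers mirrors the hardware: string instructions step their pointers downward when EFLAGS.DF is set, so the emulator must do the same. The caller is not shown in this hunk, but it would presumably derive the flag as:

	/* Sketch: honor the direction flag exactly as the hardware would. */
	bool backwards = !!(ctxt->regs->flags & X86_EFLAGS_DF);
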
@@ -656,6 +723,9 @@ static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt,
 static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo)
 {
 	struct insn *insn = &ctxt->insn;
+	size_t size;
+	u64 port;
+
 	*exitinfo = 0;
 
 	switch (insn->opcode.bytes[0]) {
@@ -664,7 +734,7 @@ static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo)
 	case 0x6d:
 		*exitinfo |= IOIO_TYPE_INS;
 		*exitinfo |= IOIO_SEG_ES;
-		*exitinfo |= (ctxt->regs->dx & 0xffff) << 16;
+		port	   = ctxt->regs->dx & 0xffff;
 		break;
 
 	/* OUTS opcodes */
@@ -672,41 +742,43 @@ static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo)
 	case 0x6f:
 		*exitinfo |= IOIO_TYPE_OUTS;
 		*exitinfo |= IOIO_SEG_DS;
-		*exitinfo |= (ctxt->regs->dx & 0xffff) << 16;
+		port	   = ctxt->regs->dx & 0xffff;
 		break;
 
 	/* IN immediate opcodes */
 	case 0xe4:
 	case 0xe5:
 		*exitinfo |= IOIO_TYPE_IN;
-		*exitinfo |= (u8)insn->immediate.value << 16;
+		port	   = (u8)insn->immediate.value & 0xffff;
 		break;
 
 	/* OUT immediate opcodes */
 	case 0xe6:
 	case 0xe7:
 		*exitinfo |= IOIO_TYPE_OUT;
-		*exitinfo |= (u8)insn->immediate.value << 16;
+		port	   = (u8)insn->immediate.value & 0xffff;
 		break;
 
 	/* IN register opcodes */
 	case 0xec:
 	case 0xed:
 		*exitinfo |= IOIO_TYPE_IN;
-		*exitinfo |= (ctxt->regs->dx & 0xffff) << 16;
+		port	   = ctxt->regs->dx & 0xffff;
 		break;
 
 	/* OUT register opcodes */
 	case 0xee:
 	case 0xef:
 		*exitinfo |= IOIO_TYPE_OUT;
-		*exitinfo |= (ctxt->regs->dx & 0xffff) << 16;
+		port	   = ctxt->regs->dx & 0xffff;
 		break;
 
 	default:
 		return ES_DECODE_FAILED;
 	}
 
+	*exitinfo |= port << 16;
+
 	switch (insn->opcode.bytes[0]) {
 	case 0x6c:
 	case 0x6e:
@@ -716,12 +788,15 @@ static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo)
 	case 0xee:
 		/* Single byte opcodes */
 		*exitinfo |= IOIO_DATA_8;
+		size       = 1;
 		break;
 	default:
 		/* Length determined by instruction parsing */
 		*exitinfo |= (insn->opnd_bytes == 2) ? IOIO_DATA_16
 						     : IOIO_DATA_32;
+		size       = (insn->opnd_bytes == 2) ? 2 : 4;
 	}
+
 	switch (insn->addr_bytes) {
 	case 2:
 		*exitinfo |= IOIO_ADDR_16;
@@ -737,7 +812,7 @@ static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo)
 	if (insn_has_rep_prefix(insn))
 		*exitinfo |= IOIO_REP;
 
-	return ES_OK;
+	return vc_ioio_check(ctxt, (u16)port, size);
 }
 
 static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
@@ -848,14 +923,15 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
 	return ret;
 }
 
-static int vc_handle_cpuid_snp(struct pt_regs *regs)
+static int vc_handle_cpuid_snp(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
 {
+	struct pt_regs *regs = ctxt->regs;
 	struct cpuid_leaf leaf;
 	int ret;
 
 	leaf.fn = regs->ax;
 	leaf.subfn = regs->cx;
-	ret = snp_cpuid(&leaf);
+	ret = snp_cpuid(ghcb, ctxt, &leaf);
 	if (!ret) {
 		regs->ax = leaf.eax;
 		regs->bx = leaf.ebx;
@@ -874,7 +950,7 @@ static enum es_result vc_handle_cpuid(struct ghcb *ghcb,
 	enum es_result ret;
 	int snp_cpuid_ret;
 
-	snp_cpuid_ret = vc_handle_cpuid_snp(regs);
+	snp_cpuid_ret = vc_handle_cpuid_snp(ghcb, ctxt);
 	if (!snp_cpuid_ret)
 		return ES_OK;
 	if (snp_cpuid_ret != -EOPNOTSUPP)
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 2787826d9f60..6395bfd87b68 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -524,6 +524,33 @@ static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt
 	return ES_OK;
 }
 
+static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size)
+{
+	BUG_ON(size > 4);
+
+	if (user_mode(ctxt->regs)) {
+		struct thread_struct *t = &current->thread;
+		struct io_bitmap *iobm = t->io_bitmap;
+		size_t idx;
+
+		if (!iobm)
+			goto fault;
+
+		for (idx = port; idx < port + size; ++idx) {
+			if (test_bit(idx, iobm->bitmap))
+				goto fault;
+		}
+	}
+
+	return ES_OK;
+
+fault:
+	ctxt->fi.vector = X86_TRAP_GP;
+	ctxt->fi.error_code = 0;
+
+	return ES_EXCEPTION;
+}
+
 /* Include code shared with pre-decompression boot stage */
 #include "sev-shared.c"
 
@@ -868,8 +895,7 @@ void snp_set_memory_private(unsigned long vaddr, unsigned long npages)
 
 void snp_accept_memory(phys_addr_t start, phys_addr_t end)
 {
-	unsigned long vaddr;
-	unsigned int npages;
+	unsigned long vaddr, npages;
 
 	if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
 		return;
@@ -1509,6 +1535,9 @@ static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
 			return ES_DECODE_FAILED;
 	}
 
+	if (user_mode(ctxt->regs))
+		return ES_UNSUPPORTED;
+
 	switch (mmio) {
 	case INSN_MMIO_WRITE:
 		memcpy(ghcb->shared_buffer, reg_data, bytes);
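The io_bitmap consulted by vc_ioio_check() above is the per-task bitmap maintained for ioperm(2)/iopl(2); a set bit denies the port. For illustration, a user-space program that would pass the check is sketched below (any port range behaves the same way):

	/* Sketch (user space): grant two ports, then access them directly. */
	#include <sys/io.h>

	int main(void)
	{
		if (ioperm(0x70, 2, 1))	/* clear the deny bits for ports 0x70-0x71 */
			return 1;

		outb(0x0a, 0x70);	/* now permitted - and emulable under SEV-ES */
		return (int)inb(0x71);
	}
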
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 6eb06d001bcc..96a771f9f930 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -131,7 +131,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
 }
 
 /*
- * Disable virtualization, APIC etc. and park the CPU in a HLT loop
+ * this function calls the 'stop' function on all other CPUs in the system.
 */
 DEFINE_IDTENTRY_SYSVEC(sysvec_reboot)
 {
@@ -172,17 +172,13 @@ static void native_stop_other_cpus(int wait)
 	 * 2) Wait for all other CPUs to report that they reached the
 	 *    HLT loop in stop_this_cpu()
 	 *
-	 * 3) If the system uses INIT/STARTUP for CPU bringup, then
-	 *    send all present CPUs an INIT vector, which brings them
-	 *    completely out of the way.
+	 * 3) If #2 timed out send an NMI to the CPUs which did not
+	 *    yet report
 	 *
-	 * 4) If #3 is not possible and #2 timed out send an NMI to the
-	 *    CPUs which did not yet report
-	 *
-	 * 5) Wait for all other CPUs to report that they reached the
+	 * 4) Wait for all other CPUs to report that they reached the
 	 *    HLT loop in stop_this_cpu()
 	 *
-	 * #4 can obviously race against a CPU reaching the HLT loop late.
+	 * #3 can obviously race against a CPU reaching the HLT loop late.
 	 * That CPU will have reported already and the "have all CPUs
 	 * reached HLT" condition will be true despite the fact that the
 	 * other CPU is still handling the NMI. Again, there is no
@@ -198,7 +194,7 @@ static void native_stop_other_cpus(int wait)
 		/*
 		 * Don't wait longer than a second for IPI completion. The
 		 * wait request is not checked here because that would
-		 * prevent an NMI/INIT shutdown in case that not all
+		 * prevent an NMI shutdown attempt in case that not all
 		 * CPUs reach shutdown state.
 		 */
 		timeout = USEC_PER_SEC;
@@ -206,27 +202,7 @@ static void native_stop_other_cpus(int wait)
 			udelay(1);
 	}
 
-	/*
-	 * Park all other CPUs in INIT including "offline" CPUs, if
-	 * possible. That's a safe place where they can't resume execution
-	 * of HLT and then execute the HLT loop from overwritten text or
-	 * page tables.
-	 *
-	 * The only downside is a broadcast MCE, but up to the point where
-	 * the kexec() kernel brought all APs online again an MCE will just
-	 * make HLT resume and handle the MCE. The machine crashes and burns
-	 * due to overwritten text, page tables and data. So there is a
-	 * choice between fire and frying pan. The result is pretty much
-	 * the same. Chose frying pan until x86 provides a sane mechanism
-	 * to park a CPU.
-	 */
-	if (smp_park_other_cpus_in_init())
-		goto done;
-
-	/*
-	 * If park with INIT was not possible and the REBOOT_VECTOR didn't
-	 * take all secondary CPUs offline, try with the NMI.
-	 */
+	/* if the REBOOT_VECTOR didn't work, try with the NMI */
 	if (!cpumask_empty(&cpus_stop_mask)) {
 		/*
 		 * If NMI IPI is enabled, try to register the stop handler
@@ -249,7 +225,6 @@ static void native_stop_other_cpus(int wait)
 			udelay(1);
 	}
 
-done:
 	local_irq_save(flags);
 	disable_local_APIC();
 	mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 48e040618731..2a187c0cbd5b 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1240,33 +1240,6 @@ void arch_thaw_secondary_cpus_end(void)
 	cache_aps_init();
 }
 
-bool smp_park_other_cpus_in_init(void)
-{
-	unsigned int cpu, this_cpu = smp_processor_id();
-	unsigned int apicid;
-
-	if (apic->wakeup_secondary_cpu_64 || apic->wakeup_secondary_cpu)
-		return false;
-
-	/*
-	 * If this is a crash stop which does not execute on the boot CPU,
-	 * then this cannot use the INIT mechanism because INIT to the boot
-	 * CPU will reset the machine.
-	 */
-	if (this_cpu)
-		return false;
-
-	for_each_cpu_and(cpu, &cpus_booted_once_mask, cpu_present_mask) {
-		if (cpu == this_cpu)
-			continue;
-		apicid = apic->cpu_present_to_apicid(cpu);
-		if (apicid == BAD_APICID)
-			continue;
-		send_init_sequence(apicid);
-	}
-	return true;
-}
-
 /*
  * Early setup to make printk work.
  */
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index ca004e2e4469..0bab03130033 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -54,7 +54,7 @@ void arch_unregister_cpu(int num)
 EXPORT_SYMBOL(arch_unregister_cpu);
 #else /* CONFIG_HOTPLUG_CPU */
 
-static int __init arch_register_cpu(int num)
+int __init arch_register_cpu(int num)
 {
 	return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
 }
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index bbc440c93e08..1123ef3ccf90 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -15,6 +15,7 @@
  * ( The serial nature of the boot logic and the CPU hotplug lock
  *   protects against more than 2 CPUs entering this code. )
  */
+#include <linux/workqueue.h>
 #include <linux/topology.h>
 #include <linux/spinlock.h>
 #include <linux/kernel.h>
@@ -342,6 +343,13 @@ static inline unsigned int loop_timeout(int cpu)
 	return (cpumask_weight(topology_core_cpumask(cpu)) > 1) ? 2 : 20;
 }
 
+static void tsc_sync_mark_tsc_unstable(struct work_struct *work)
+{
+	mark_tsc_unstable("check_tsc_sync_source failed");
+}
+
+static DECLARE_WORK(tsc_sync_work, tsc_sync_mark_tsc_unstable);
+
 /*
  * The freshly booted CPU initiates this via an async SMP function call.
  */
@@ -395,7 +403,7 @@ retry:
 			"turning off TSC clock.\n", max_warp);
 		if (random_warps)
 			pr_warn("TSC warped randomly between CPUs\n");
-		mark_tsc_unstable("check_tsc_sync_source failed");
+		schedule_work(&tsc_sync_work);
 	}
 
 	/*
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index ed90f148140d..950c12868d30 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -154,4 +154,15 @@ config KVM_PROVE_MMU
 config KVM_EXTERNAL_WRITE_TRACKING
 	bool
 
+config KVM_MAX_NR_VCPUS
+	int "Maximum number of vCPUs per KVM guest"
+	depends on KVM
+	range 1024 4096
+	default 4096 if MAXSMP
+	default 1024
+	help
+	  Set the maximum number of vCPUs per KVM guest. Larger values will increase
+	  the memory footprint of each KVM guest, regardless of how many vCPUs are
+	  created for a given VM.
+
 endif # VIRTUALIZATION
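The new Kconfig knob only sets the limit; the consumer is elsewhere. Presumably (this is the companion change, not shown here) the arch header turns it into the compile-time vCPU cap, along the lines of:

	/* Sketch (assumed consumer, e.g. arch/x86/include/asm/kvm_host.h): */
	#define KVM_MAX_VCPUS CONFIG_KVM_MAX_NR_VCPUS
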
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 0544e30b4946..dda6fc4cfae8 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -360,14 +360,6 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
 	vcpu->arch.guest_supported_xcr0 =
 		cpuid_get_supported_xcr0(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent);
 
-	/*
-	 * FP+SSE can always be saved/restored via KVM_{G,S}ET_XSAVE, even if
-	 * XSAVE/XCR0 are not exposed to the guest, and even if XSAVE isn't
-	 * supported by the host.
-	 */
-	vcpu->arch.guest_fpu.fpstate->user_xfeatures = vcpu->arch.guest_supported_xcr0 |
-						       XFEATURE_MASK_FPSSE;
-
 	kvm_update_pv_runtime(vcpu);
 
 	vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
@@ -456,7 +448,9 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
 	vcpu->arch.cpuid_nent = nent;
 
 	vcpu->arch.kvm_cpuid = kvm_get_hypervisor_cpuid(vcpu, KVM_SIGNATURE);
+#ifdef CONFIG_KVM_XEN
 	vcpu->arch.xen.cpuid = kvm_get_hypervisor_cpuid(vcpu, XEN_SIGNATURE);
+#endif
 	kvm_vcpu_after_set_cpuid(vcpu);
 
 	return 0;
@@ -761,11 +755,13 @@ void kvm_set_cpu_caps(void)
 	kvm_cpu_cap_mask(CPUID_8000_0021_EAX,
 		F(NO_NESTED_DATA_BP) | F(LFENCE_RDTSC) | 0 /* SmmPgCfgLock */ |
-		F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */
+		F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */ |
+		F(WRMSR_XX_BASE_NS)
 	);
 
-	if (cpu_feature_enabled(X86_FEATURE_SRSO_NO))
-		kvm_cpu_cap_set(X86_FEATURE_SRSO_NO);
+	kvm_cpu_cap_check_and_set(X86_FEATURE_SBPB);
+	kvm_cpu_cap_check_and_set(X86_FEATURE_IBPB_BRTYPE);
+	kvm_cpu_cap_check_and_set(X86_FEATURE_SRSO_NO);
 
 	kvm_cpu_cap_init_kvm_defined(CPUID_8000_0022_EAX,
 		F(PERFMON_V2)
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 284fa4704553..0b90532b6e26 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -174,7 +174,8 @@ static inline bool guest_has_spec_ctrl_msr(struct kvm_vcpu *vcpu)
 static inline bool guest_has_pred_cmd_msr(struct kvm_vcpu *vcpu)
 {
 	return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
-		guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB));
+		guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB) ||
+		guest_cpuid_has(vcpu, X86_FEATURE_SBPB));
 }
 
 static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 7c2dac6824e2..238afd7335e4 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -727,10 +727,12 @@ static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count,
 	stimer_cleanup(stimer);
 	stimer->count = count;
-	if (stimer->count == 0)
-		stimer->config.enable = 0;
-	else if (stimer->config.auto_enable)
-		stimer->config.enable = 1;
+	if (!host) {
+		if (stimer->count == 0)
+			stimer->config.enable = 0;
+		else if (stimer->config.auto_enable)
+			stimer->config.enable = 1;
+	}
 
 	if (stimer->config.enable)
 		stimer_mark_pending(stimer, false);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index dcd60b39e794..245b20973cae 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2444,22 +2444,22 @@ EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
 void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
-	u64 val;
 
 	/*
-	 * ICR is a single 64-bit register when x2APIC is enabled. For legacy
-	 * xAPIC, ICR writes need to go down the common (slightly slower) path
-	 * to get the upper half from ICR2.
+	 * ICR is a single 64-bit register when x2APIC is enabled, all other
+	 * registers hold 32-bit values. For legacy xAPIC, ICR writes need to
+	 * go down the common path to get the upper half from ICR2.
+	 *
+	 * Note, using the write helpers may incur an unnecessary write to the
+	 * virtual APIC state, but KVM needs to conditionally modify the value
+	 * in certain cases, e.g. to clear the ICR busy bit. The cost of extra
+	 * conditional branches is likely a wash relative to the cost of the
+	 * maybe-unnecessary write, and both are in the noise anyways.
+	 */
-	if (apic_x2apic_mode(apic) && offset == APIC_ICR) {
-		val = kvm_lapic_get_reg64(apic, APIC_ICR);
-		kvm_apic_send_ipi(apic, (u32)val, (u32)(val >> 32));
-		trace_kvm_apic_write(APIC_ICR, val);
-	} else {
-		/* TODO: optimize to just emulate side effect w/o one more write */
-		val = kvm_lapic_get_reg(apic, offset);
-		kvm_lapic_reg_write(apic, offset, (u32)val);
-	}
+	if (apic_x2apic_mode(apic) && offset == APIC_ICR)
+		kvm_x2apic_icr_write(apic, kvm_lapic_get_reg64(apic, APIC_ICR));
+	else
+		kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
 }
 EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode);
 
@@ -2670,6 +2670,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
 	u64 msr_val;
 	int i;
 
+	static_call_cond(kvm_x86_apicv_pre_state_restore)(vcpu);
+
 	if (!init_event) {
 		msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
 		if (kvm_vcpu_is_reset_bsp(vcpu))
@@ -2759,13 +2761,17 @@ int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
 {
 	u32 reg = kvm_lapic_get_reg(apic, lvt_type);
 	int vector, mode, trig_mode;
+	int r;
 
 	if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
 		vector = reg & APIC_VECTOR_MASK;
 		mode = reg & APIC_MODE_MASK;
 		trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
-		return __apic_accept_irq(apic, mode, vector, 1, trig_mode,
-					NULL);
+
+		r = __apic_accept_irq(apic, mode, vector, 1, trig_mode, NULL);
+		if (r && lvt_type == APIC_LVTPC)
+			kvm_lapic_set_reg(apic, APIC_LVTPC, reg | APIC_LVT_MASKED);
+		return r;
 	}
 	return 0;
 }
@@ -2977,6 +2983,8 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
 	struct kvm_lapic *apic = vcpu->arch.apic;
 	int r;
 
+	static_call_cond(kvm_x86_apicv_pre_state_restore)(vcpu);
+
 	kvm_lapic_set_base(vcpu, vcpu->arch.apic_base);
 	/* set SPIV separately to get count of SW disabled APICs right */
 	apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV)));
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 253fb2093d5d..bb8c86eefac0 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -237,6 +237,13 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 	return -(u32)fault & errcode;
 }
 
+bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma);
+
+static inline bool kvm_mmu_honors_guest_mtrrs(struct kvm *kvm)
+{
+	return __kvm_mmu_honors_guest_mtrrs(kvm_arch_has_noncoherent_dma(kvm));
+}
+
 void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
 
 int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index f7901cb4d2fa..b0f01d605617 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3425,8 +3425,8 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 {
 	struct kvm_mmu_page *sp;
 	int ret = RET_PF_INVALID;
-	u64 spte = 0ull;
-	u64 *sptep = NULL;
+	u64 spte;
+	u64 *sptep;
 	uint retry_count = 0;
 
 	if (!page_fault_can_be_fast(fault))
@@ -3442,6 +3442,14 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 		else
 			sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
 
+		/*
+		 * It's entirely possible for the mapping to have been zapped
+		 * by a different task, but the root page should always be
+		 * available as the vCPU holds a reference to its root(s).
+		 */
+		if (WARN_ON_ONCE(!sptep))
+			spte = REMOVED_SPTE;
+
 		if (!is_shadow_present_pte(spte))
 			break;
 
@@ -4479,21 +4487,28 @@ out_unlock:
 }
 #endif
 
-int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma)
 {
 	/*
-	 * If the guest's MTRRs may be used to compute the "real" memtype,
-	 * restrict the mapping level to ensure KVM uses a consistent memtype
-	 * across the entire mapping. If the host MTRRs are ignored by TDP
-	 * (shadow_memtype_mask is non-zero), and the VM has non-coherent DMA
-	 * (DMA doesn't snoop CPU caches), KVM's ABI is to honor the memtype
-	 * from the guest's MTRRs so that guest accesses to memory that is
-	 * DMA'd aren't cached against the guest's wishes.
+	 * If host MTRRs are ignored (shadow_memtype_mask is non-zero), and the
+	 * VM has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is
+	 * to honor the memtype from the guest's MTRRs so that guest accesses
+	 * to memory that is DMA'd aren't cached against the guest's wishes.
 	 *
 	 * Note, KVM may still ultimately ignore guest MTRRs for certain PFNs,
 	 * e.g. KVM will force UC memtype for host MMIO.
 	 */
-	if (shadow_memtype_mask && kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
+	return vm_has_noncoherent_dma && shadow_memtype_mask;
+}
+
+int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+{
+	/*
+	 * If the guest's MTRRs may be used to compute the "real" memtype,
+	 * restrict the mapping level to ensure KVM uses a consistent memtype
+	 * across the entire mapping.
+	 */
+	if (kvm_mmu_honors_guest_mtrrs(vcpu->kvm)) {
 		for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) {
 			int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
 			gfn_t base = gfn_round_for_level(fault->gfn,
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index 3eb6e7f47e96..a67c28a56417 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -320,7 +320,7 @@ static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr)
 	struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
 	gfn_t start, end;
 
-	if (!tdp_enabled || !kvm_arch_has_noncoherent_dma(vcpu->kvm))
+	if (!kvm_mmu_honors_guest_mtrrs(vcpu->kvm))
 		return;
 
 	if (!mtrr_is_enabled(mtrr_state) && msr != MSR_MTRRdefType)
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index edb89b51b383..9ae07db6f0f6 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -93,14 +93,6 @@ void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
 #undef __KVM_X86_PMU_OP
 }
 
-static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
-{
-	struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work);
-	struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
-
-	kvm_pmu_deliver_pmi(vcpu);
-}
-
 static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
 {
 	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
@@ -124,20 +116,7 @@ static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
 		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
 	}
 
-	if (!pmc->intr || skip_pmi)
-		return;
-
-	/*
-	 * Inject PMI. If vcpu was in a guest mode during NMI PMI
-	 * can be ejected on a guest mode re-entry. Otherwise we can't
-	 * be sure that vcpu wasn't executing hlt instruction at the
-	 * time of vmexit and is not going to re-enter guest mode until
-	 * woken up. So we should wake it, but this is impossible from
-	 * NMI context. Do it from irq work instead.
-	 */
-	if (in_pmi && !kvm_handling_nmi_from_guest(pmc->vcpu))
-		irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
-	else
+	if (pmc->intr && !skip_pmi)
 		kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
 }
 
@@ -675,9 +654,6 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
 
 void kvm_pmu_reset(struct kvm_vcpu *vcpu)
 {
-	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
-
-	irq_work_sync(&pmu->irq_work);
 	static_call(kvm_x86_pmu_reset)(vcpu);
 }
 
@@ -687,7 +663,6 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu)
 
 	memset(pmu, 0, sizeof(*pmu));
 	static_call(kvm_x86_pmu_init)(vcpu);
-	init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn);
 	pmu->event_count = 0;
 	pmu->need_cleanup = false;
 	kvm_pmu_refresh(vcpu);
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 7d9ba301c090..1d64113de488 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -74,6 +74,12 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc)
 	return counter & pmc_bitmask(pmc);
 }
 
+static inline void pmc_write_counter(struct kvm_pmc *pmc, u64 val)
+{
+	pmc->counter += val - pmc_read_counter(pmc);
+	pmc->counter &= pmc_bitmask(pmc);
+}
+
 static inline void pmc_release_perf_event(struct kvm_pmc *pmc)
 {
 	if (pmc->perf_event) {
diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c
index b42111a24cc2..dc3d95fdca7d 100644
--- a/arch/x86/kvm/smm.c
+++ b/arch/x86/kvm/smm.c
@@ -324,7 +324,6 @@ void enter_smm(struct kvm_vcpu *vcpu)
 
 	cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
 	static_call(kvm_x86_set_cr0)(vcpu, cr0);
-	vcpu->arch.cr0 = cr0;
 
 	static_call(kvm_x86_set_cr4)(vcpu, 0);
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 2092db892d7d..4b74ea91f4e6 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -529,8 +529,11 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
 	case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
 		WARN_ONCE(1, "Invalid backing page\n");
 		break;
+	case AVIC_IPI_FAILURE_INVALID_IPI_VECTOR:
+		/* Invalid IPI with vector < 16 */
+		break;
 	default:
-		pr_err("Unknown IPI interception\n");
+		vcpu_unimpl(vcpu, "Unknown avic incomplete IPI interception\n");
 	}
 
 	return 1;
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index dd496c9e5f91..3fea8c47679e 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -1253,6 +1253,9 @@ void svm_leave_nested(struct kvm_vcpu *vcpu)
 
 		nested_svm_uninit_mmu_context(vcpu);
 		vmcb_mark_all_dirty(svm->vmcb);
+
+		if (kvm_apicv_activated(vcpu->kvm))
+			kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
 	}
 
 	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index cef5a3d0abd0..373ff6a6687b 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -160,7 +160,7 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	/* MSR_PERFCTRn */
 	pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
 	if (pmc) {
-		pmc->counter += data - pmc_read_counter(pmc);
+		pmc_write_counter(pmc, data);
 		pmc_update_sample_period(pmc);
 		return 0;
 	}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 9507df93f410..1855a6d7c976 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -199,7 +199,7 @@ module_param_named(npt, npt_enabled, bool, 0444);
 
 /* allow nested virtualization in KVM/SVM */
 static int nested = true;
-module_param(nested, int, S_IRUGO);
+module_param(nested, int, 0444);
 
 /* enable/disable Next RIP Save */
 int nrips = true;
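A side note on the module_param churn in this and the following hunks: the octal literals are byte-for-byte equivalent to the symbolic macros, just the spelling checkpatch prefers. For instance:

	/* 0444 == S_IRUGO (r--r--r--); 0644 == S_IRUGO | S_IWUSR (rw-r--r--). */
	module_param(nested, int, 0444);
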
@@ -364,8 +364,6 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
 		svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
 }
 
-static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
-					void *insn, int insn_len);
-
 static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
 					   bool commit_side_effects)
@@ -386,14 +384,6 @@ static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
 	}
 
 	if (!svm->next_rip) {
-		/*
-		 * FIXME: Drop this when kvm_emulate_instruction() does the
-		 * right thing and treats "can't emulate" as outright failure
-		 * for EMULTYPE_SKIP.
-		 */
-		if (!svm_can_emulate_instruction(vcpu, EMULTYPE_SKIP, NULL, 0))
-			return 0;
-
 		if (unlikely(!commit_side_effects))
 			old_rflags = svm->vmcb->save.rflags;
 
@@ -691,7 +681,7 @@ static int svm_hardware_enable(void)
 	 */
 	if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) {
 		struct sev_es_save_area *hostsa;
-		u32 msr_hi;
+		u32 __maybe_unused msr_hi;
 
 		hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400);
 
@@ -913,8 +903,7 @@ void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept)
 	if (intercept == svm->x2avic_msrs_intercepted)
 		return;
 
-	if (!x2avic_enabled ||
-	    !apic_x2apic_mode(svm->vcpu.arch.apic))
+	if (!x2avic_enabled)
 		return;
 
 	for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) {
@@ -2203,12 +2192,6 @@ static int shutdown_interception(struct kvm_vcpu *vcpu)
 	struct kvm_run *kvm_run = vcpu->run;
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	/*
-	 * The VM save area has already been encrypted so it
-	 * cannot be reinitialized - just terminate.
-	 */
-	if (sev_es_guest(vcpu->kvm))
-		return -EINVAL;
 
 	/*
 	 * VMCB is undefined after a SHUTDOWN intercept. INIT the vCPU to put
@@ -2217,9 +2200,14 @@ static int shutdown_interception(struct kvm_vcpu *vcpu)
 	 * userspace. At a platform view, INIT is acceptable behavior as
 	 * there exist bare metal platforms that automatically INIT the CPU
 	 * in response to shutdown.
+	 *
+	 * The VM save area for SEV-ES guests has already been encrypted so it
+	 * cannot be reinitialized, i.e. synthesizing INIT is futile.
 	 */
-	clear_page(svm->vmcb);
-	kvm_vcpu_reset(vcpu, true);
+	if (!sev_es_guest(vcpu->kvm)) {
+		clear_page(svm->vmcb);
+		kvm_vcpu_reset(vcpu, true);
+	}
 
 	kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
 	return 0;
@@ -4728,15 +4716,15 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
 }
 #endif
 
-static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
-					void *insn, int insn_len)
+static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+					 void *insn, int insn_len)
 {
 	bool smep, smap, is_user;
 	u64 error_code;
 
 	/* Emulation is always possible when KVM has access to all guest state. */
 	if (!sev_guest(vcpu->kvm))
-		return true;
+		return X86EMUL_CONTINUE;
 
 	/* #UD and #GP should never be intercepted for SEV guests. */
 	WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
@@ -4748,14 +4736,14 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
 	 * to guest register state.
 	 */
 	if (sev_es_guest(vcpu->kvm))
-		return false;
+		return X86EMUL_RETRY_INSTR;
 
 	/*
 	 * Emulation is possible if the instruction is already decoded, e.g.
 	 * when completing I/O after returning from userspace.
 	 */
 	if (emul_type & EMULTYPE_NO_DECODE)
-		return true;
+		return X86EMUL_CONTINUE;
 
 	/*
 	 * Emulation is possible for SEV guests if and only if a prefilled
@@ -4781,9 +4769,11 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
 	 * success (and in practice it will work the vast majority of the time).
 	 */
 	if (unlikely(!insn)) {
-		if (!(emul_type & EMULTYPE_SKIP))
-			kvm_queue_exception(vcpu, UD_VECTOR);
-		return false;
+		if (emul_type & EMULTYPE_SKIP)
+			return X86EMUL_UNHANDLEABLE;
+
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return X86EMUL_PROPAGATE_FAULT;
 	}
 
 	/*
@@ -4794,7 +4784,7 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
 	 * table used to translate CS:RIP resides in emulated MMIO.
 	 */
 	if (likely(insn_len))
-		return true;
+		return X86EMUL_CONTINUE;
 
 	/*
 	 * Detect and workaround Errata 1096 Fam_17h_00_0Fh.
@@ -4852,6 +4842,7 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
 		kvm_inject_gp(vcpu, 0);
 	else
 		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+	return X86EMUL_PROPAGATE_FAULT;
 
 resume_guest:
@@ -4869,7 +4860,7 @@ resume_guest:
 	 * doesn't explicitly define "ignored", i.e. doing nothing and letting
 	 * the guest spin is technically "ignoring" the access.
 	 */
-	return false;
+	return X86EMUL_RETRY_INSTR;
 }
 
 static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
@@ -5029,7 +5020,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
 	.vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
 	.vm_move_enc_context_from = sev_vm_move_enc_context_from,
 
-	.can_emulate_instruction = svm_can_emulate_instruction,
+	.check_emulate_instruction = svm_check_emulate_instruction,
 
 	.apic_init_signal_blocked = svm_apic_init_signal_blocked,
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index f2efa0bf7ae8..820d3e1f6b4f 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -436,11 +436,11 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			if (!msr_info->host_initiated &&
 			    !(msr & MSR_PMC_FULL_WIDTH_BIT))
 				data = (s64)(s32)data;
-			pmc->counter += data - pmc_read_counter(pmc);
+			pmc_write_counter(pmc, data);
 			pmc_update_sample_period(pmc);
 			break;
 		} else if ((pmc = get_fixed_pmc(pmu, msr))) {
-			pmc->counter += data - pmc_read_counter(pmc);
+			pmc_write_counter(pmc, data);
 			pmc_update_sample_period(pmc);
 			break;
 		} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 72e3943f3693..be20a60047b1 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -82,28 +82,28 @@ bool __read_mostly enable_vpid = 1;
 module_param_named(vpid, enable_vpid, bool, 0444);
 
 static bool __read_mostly enable_vnmi = 1;
-module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
+module_param_named(vnmi, enable_vnmi, bool, 0444);
 
 bool __read_mostly flexpriority_enabled = 1;
-module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
+module_param_named(flexpriority, flexpriority_enabled, bool, 0444);
 
 bool __read_mostly enable_ept = 1;
-module_param_named(ept, enable_ept, bool, S_IRUGO);
+module_param_named(ept, enable_ept, bool, 0444);
 
 bool __read_mostly enable_unrestricted_guest = 1;
 module_param_named(unrestricted_guest,
-			enable_unrestricted_guest, bool, S_IRUGO);
+			enable_unrestricted_guest, bool, 0444);
 
 bool __read_mostly enable_ept_ad_bits = 1;
-module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
+module_param_named(eptad, enable_ept_ad_bits, bool, 0444);
 
 static bool __read_mostly emulate_invalid_guest_state = true;
-module_param(emulate_invalid_guest_state, bool, S_IRUGO);
+module_param(emulate_invalid_guest_state, bool, 0444);
 
 static bool __read_mostly fasteoi = 1;
-module_param(fasteoi, bool, S_IRUGO);
+module_param(fasteoi, bool, 0444);
 
-module_param(enable_apicv, bool, S_IRUGO);
+module_param(enable_apicv, bool, 0444);
 
 bool __read_mostly enable_ipiv = true;
 module_param(enable_ipiv, bool, 0444);
@@ -114,10 +114,10 @@ module_param(enable_ipiv, bool, 0444);
 * use VMX instructions.
 */
 static bool __read_mostly nested = 1;
-module_param(nested, bool, S_IRUGO);
+module_param(nested, bool, 0444);
 
 bool __read_mostly enable_pml = 1;
-module_param_named(pml, enable_pml, bool, S_IRUGO);
+module_param_named(pml, enable_pml, bool, 0444);
 
 static bool __read_mostly error_on_inconsistent_vmcs_config = true;
 module_param(error_on_inconsistent_vmcs_config, bool, 0444);
@@ -1657,8 +1657,8 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
 	return 0;
 }
 
-static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
-					void *insn, int insn_len)
+static int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+					 void *insn, int insn_len)
 {
 	/*
 	 * Emulation of instructions in SGX enclaves is impossible as RIP does
@@ -1669,9 +1669,9 @@ static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
 	 */
 	if (to_vmx(vcpu)->exit_reason.enclave_mode) {
 		kvm_queue_exception(vcpu, UD_VECTOR);
-		return false;
+		return X86EMUL_PROPAGATE_FAULT;
 	}
-	return true;
+	return X86EMUL_CONTINUE;
 }
 
 static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
@@ -5792,7 +5792,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
 {
 	gpa_t gpa;
 
-	if (!vmx_can_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0))
+	if (vmx_check_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0))
 		return 1;
 
 	/*
@@ -6912,7 +6912,7 @@ static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
 	vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
 }
 
-static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
+static void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
@@ -7579,8 +7579,6 @@ static int vmx_vm_init(struct kvm *kvm)
 
 static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
 {
-	u8 cache;
-
 	/* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
 	 * memory aliases with conflicting memory types and sometimes MCEs.
 	 * We have to be careful as to what are honored and when.
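For orientation, the value vmx_get_mt_mask() returns is merged directly into the leaf EPT entry: bits 5:3 hold the memtype and bit 6 ("ignore PAT") makes it authoritative over the guest's PAT. A minimal illustration using the same macros as the hunk below:

	/* Sketch: force writeback memtype regardless of guest PAT. */
	u64 ept_bits = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
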
@@ -7607,11 +7605,10 @@ static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
 
 	if (kvm_read_cr0_bits(vcpu, X86_CR0_CD)) {
 		if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
-			cache = MTRR_TYPE_WRBACK;
+			return MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT;
 		else
-			cache = MTRR_TYPE_UNCACHABLE;
-
-		return (cache << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
+			return (MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT) |
+			       VMX_EPT_IPAT_BIT;
 	}
 
 	return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT;
@@ -8286,7 +8283,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 	.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
 	.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
 	.load_eoi_exitmap = vmx_load_eoi_exitmap,
-	.apicv_post_state_restore = vmx_apicv_post_state_restore,
+	.apicv_pre_state_restore = vmx_apicv_pre_state_restore,
 	.required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
 	.hwapic_irr_update = vmx_hwapic_irr_update,
 	.hwapic_isr_update = vmx_hwapic_isr_update,
@@ -8341,7 +8338,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 	.enable_smi_window = vmx_enable_smi_window,
 #endif
 
-	.can_emulate_instruction = vmx_can_emulate_instruction,
+	.check_emulate_instruction = vmx_check_emulate_instruction,
 	.apic_init_signal_blocked = vmx_apic_init_signal_blocked,
 	.migrate_timers = vmx_migrate_timers,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9f18b06bbda6..2c924075f6f1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -145,21 +145,21 @@ EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits);
 EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg);
 
 static bool __read_mostly ignore_msrs = 0;
-module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
+module_param(ignore_msrs, bool, 0644);
 
 bool __read_mostly report_ignored_msrs = true;
-module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
+module_param(report_ignored_msrs, bool, 0644);
 EXPORT_SYMBOL_GPL(report_ignored_msrs);
 
 unsigned int min_timer_period_us = 200;
-module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
+module_param(min_timer_period_us, uint, 0644);
 
 static bool __read_mostly kvmclock_periodic_sync = true;
-module_param(kvmclock_periodic_sync, bool, S_IRUGO);
+module_param(kvmclock_periodic_sync, bool, 0444);
 
 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
 static u32 __read_mostly tsc_tolerance_ppm = 250;
-module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
+module_param(tsc_tolerance_ppm, uint, 0644);
 
 /*
  * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables
@@ -168,13 +168,13 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
 * tuning, i.e. allows privileged userspace to set an exact advancement time.
 */
 static int __read_mostly lapic_timer_advance_ns = -1;
-module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
+module_param(lapic_timer_advance_ns, int, 0644);
 
 static bool __read_mostly vector_hashing = true;
-module_param(vector_hashing, bool, S_IRUGO);
+module_param(vector_hashing, bool, 0444);
 
 bool __read_mostly enable_vmware_backdoor = false;
-module_param(enable_vmware_backdoor, bool, S_IRUGO);
+module_param(enable_vmware_backdoor, bool, 0444);
 EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
 
 /*
@@ -186,7 +186,7 @@ static int __read_mostly force_emulation_prefix;
 module_param(force_emulation_prefix, int, 0644);
 
 int __read_mostly pi_inject_timer = -1;
-module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
+module_param(pi_inject_timer, bint, 0644);
 
 /* Enable/disable PMU virtualization */
 bool __read_mostly enable_pmu = true;
@@ -962,7 +962,7 @@ void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
 		kvm_mmu_reset_context(vcpu);
 
 	if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
-	    kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
+	    kvm_mmu_honors_guest_mtrrs(vcpu->kvm) &&
 	    !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
 		kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
 }
@@ -2331,14 +2331,9 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs)
 	if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
 		return;
 
-	/*
-	 * The guest calculates current wall clock time by adding
-	 * system time (updated by kvm_guest_time_update below) to the
-	 * wall clock specified here.  We do the reverse here.
-	 */
-	wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
+	wall_nsec = kvm_get_wall_clock_epoch(kvm);
 
-	wc.nsec = do_div(wall_nsec, 1000000000);
+	wc.nsec = do_div(wall_nsec, NSEC_PER_SEC);
 	wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
 	wc.version = version;
 
@@ -2714,8 +2709,9 @@ static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc,
 	kvm_track_tsc_matching(vcpu);
 }
 
-static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
+static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value)
 {
+	u64 data = user_value ? *user_value : 0;
 	struct kvm *kvm = vcpu->kvm;
 	u64 offset, ns, elapsed;
 	unsigned long flags;
@@ -2730,25 +2726,37 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
 	if (vcpu->arch.virtual_tsc_khz) {
 		if (data == 0) {
 			/*
-			 * detection of vcpu initialization -- need to sync
-			 * with other vCPUs. This particularly helps to keep
-			 * kvm_clock stable after CPU hotplug
+			 * Force synchronization when creating a vCPU, or when
+			 * userspace explicitly writes a zero value.
 			 */
 			synchronizing = true;
-		} else {
+		} else if (kvm->arch.user_set_tsc) {
 			u64 tsc_exp = kvm->arch.last_tsc_write +
 						nsec_to_cycles(vcpu, elapsed);
 			u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
 			/*
-			 * Special case: TSC write with a small delta (1 second)
-			 * of virtual cycle time against real time is
-			 * interpreted as an attempt to synchronize the CPU.
+			 * Here lies UAPI baggage: when a user-initiated TSC write has
+			 * a small delta (1 second) of virtual cycle time against the
+			 * previously set vCPU, we assume that they were intended to be
+			 * in sync and the delta was only due to the racy nature of the
+			 * legacy API.
+			 *
+			 * This trick falls down when restoring a guest which genuinely
+			 * has been running for less time than the 1 second of imprecision
+			 * which we allow for in the legacy API. In this case, the first
+			 * value written by userspace (on any vCPU) should not be subject
+			 * to this 'correction' to make it sync up with values that only
+			 * come from the kernel's default vCPU creation. Make the 1-second
+			 * slop hack only trigger if the user_set_tsc flag is already set.
 			 */
 			synchronizing = data < tsc_exp + tsc_hz &&
 					data + tsc_hz > tsc_exp;
 		}
 	}
 
+	if (user_value)
+		kvm->arch.user_set_tsc = true;
+
 	/*
 	 * For a reliable TSC, we can match TSC offsets, and for an unstable
 	 * TSC, we add elapsed time in this computation.  We could let the
@@ -3232,16 +3240,94 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 
 	if (vcpu->pv_time.active)
 		kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0);
+#ifdef CONFIG_KVM_XEN
 	if (vcpu->xen.vcpu_info_cache.active)
 		kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache,
 					offsetof(struct compat_vcpu_info, time));
 	if (vcpu->xen.vcpu_time_info_cache.active)
 		kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0);
+#endif
 	kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
 	return 0;
 }
 
 /*
+ * The pvclock_wall_clock ABI tells the guest the wall clock time at
+ * which it started (i.e. its epoch, when its kvmclock was zero).
+ *
+ * In fact those clocks are subtly different; wall clock frequency is
+ * adjusted by NTP and has leap seconds, while the kvmclock is a
+ * simple function of the TSC without any such adjustment.
+ *
+ * Perhaps the ABI should have exposed CLOCK_TAI and a ratio between
+ * that and kvmclock, but even that would be subject to change over
+ * time.
+ *
+ * Attempt to calculate the epoch at a given moment using the *same*
+ * TSC reading via kvm_get_walltime_and_clockread() to obtain both
+ * wallclock and kvmclock times, and subtracting one from the other.
+ *
+ * Fall back to using their values at slightly different moments by
+ * calling ktime_get_real_ns() and get_kvmclock_ns() separately.
+ */
+uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+	struct pvclock_vcpu_time_info hv_clock;
+	struct kvm_arch *ka = &kvm->arch;
+	unsigned long seq, local_tsc_khz;
+	struct timespec64 ts;
+	uint64_t host_tsc;
+
+	do {
+		seq = read_seqcount_begin(&ka->pvclock_sc);
+
+		local_tsc_khz = 0;
+		if (!ka->use_master_clock)
+			break;
+
+		/*
+		 * The TSC read and the call to get_cpu_tsc_khz() must happen
+		 * on the same CPU.
+		 */
+		get_cpu();
+
+		local_tsc_khz = get_cpu_tsc_khz();
+
+		if (local_tsc_khz &&
+		    !kvm_get_walltime_and_clockread(&ts, &host_tsc))
+			local_tsc_khz = 0; /* Fall back to old method */
+
+		put_cpu();
+
+		/*
+		 * These values must be snapshotted within the seqcount loop.
+		 * After that, it's just mathematics which can happen on any
+		 * CPU at any time.
+		 */
+		hv_clock.tsc_timestamp = ka->master_cycle_now;
+		hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
+
+	} while (read_seqcount_retry(&ka->pvclock_sc, seq));
+
+	/*
+	 * If the conditions were right, and obtaining the wallclock+TSC was
+	 * successful, calculate the KVM clock at the corresponding time and
+	 * subtract one from the other to get the guest's epoch in nanoseconds
+	 * since 1970-01-01.
+	 */
+	if (local_tsc_khz) {
+		kvm_get_time_scale(NSEC_PER_SEC, local_tsc_khz * NSEC_PER_USEC,
+				   &hv_clock.tsc_shift,
+				   &hv_clock.tsc_to_system_mul);
+		return ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec -
+			__pvclock_read_cycles(&hv_clock, host_tsc);
+	}
+#endif
+	return ktime_get_real_ns() - get_kvmclock_ns(kvm);
+}
+
+/*
 * kvmclock updates which are isolated to a given vcpu, such as
 * vcpu->cpu migration, should not allow system_timestamp from
 * the rest of the vcpus to remain static. Otherwise ntp frequency
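The arithmetic relied on above deserves a line of its own: both readings are derived from one TSC sample T, so the subtraction cancels T and leaves the boot epoch. A sketch of the invariant (not code from the patch):

	/*
	 * wallclock(T) - kvmclock(T) == epoch, for any single sample T, so a
	 * guest computing wallclock = epoch + kvmclock gets a consistent pair.
	 */
	u64 epoch = wall_ns - kvm_ns;	/* both computed from the same host_tsc */
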
@@ -3290,9 +3376,6 @@ static void kvmclock_sync_fn(struct work_struct *work)
 					   kvmclock_sync_work);
 	struct kvm *kvm = container_of(ka, struct kvm, arch);
 
-	if (!kvmclock_periodic_sync)
-		return;
-
 	schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
 	schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
 					KVMCLOCK_SYNC_PERIOD);
@@ -3641,6 +3724,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_AMD64_PATCH_LOADER:
 	case MSR_AMD64_BU_CFG2:
 	case MSR_AMD64_DC_CFG:
+	case MSR_AMD64_TW_CFG:
 	case MSR_F15H_EX_CFG:
 		break;
 
@@ -3670,17 +3754,36 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		vcpu->arch.perf_capabilities = data;
 		kvm_pmu_refresh(vcpu);
 		break;
-	case MSR_IA32_PRED_CMD:
-		if (!msr_info->host_initiated && !guest_has_pred_cmd_msr(vcpu))
-			return 1;
+	case MSR_IA32_PRED_CMD: {
+		u64 reserved_bits = ~(PRED_CMD_IBPB | PRED_CMD_SBPB);
+
+		if (!msr_info->host_initiated) {
+			if (!guest_has_pred_cmd_msr(vcpu))
+				return 1;
+
+			if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
+			    !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB))
+				reserved_bits |= PRED_CMD_IBPB;
 
-		if (!boot_cpu_has(X86_FEATURE_IBPB) || (data & ~PRED_CMD_IBPB))
+			if (!guest_cpuid_has(vcpu, X86_FEATURE_SBPB))
+				reserved_bits |= PRED_CMD_SBPB;
+		}
+
+		if (!boot_cpu_has(X86_FEATURE_IBPB))
+			reserved_bits |= PRED_CMD_IBPB;
+
+		if (!boot_cpu_has(X86_FEATURE_SBPB))
+			reserved_bits |= PRED_CMD_SBPB;
+
+		if (data & reserved_bits)
 			return 1;
+
 		if (!data)
 			break;
 
-		wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
+		wrmsrl(MSR_IA32_PRED_CMD, data);
 		break;
+	}
 	case MSR_IA32_FLUSH_CMD:
 		if (!msr_info->host_initiated &&
 		    !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D))
@@ -3700,13 +3803,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		data &= ~(u64)0x100;	/* ignore ignne emulation enable */
 		data &= ~(u64)0x8;	/* ignore TLB cache disable */
 
-		/* Handle McStatusWrEn */
-		if (data == BIT_ULL(18)) {
-			vcpu->arch.msr_hwcr = data;
-		} else if (data != 0) {
+		/*
+		 * Allow McStatusWrEn and TscFreqSel. (Linux guests from v3.2
+		 * through at least v6.6 whine if TscFreqSel is clear,
+		 * depending on F/M/S.)
+		 */
+		if (data & ~(BIT_ULL(18) | BIT_ULL(24))) {
 			kvm_pr_unimpl_wrmsr(vcpu, msr, data);
 			return 1;
 		}
+		vcpu->arch.msr_hwcr = data;
 		break;
 	case MSR_FAM10H_MMIO_CONF_BASE:
 		if (data != 0) {
@@ -3777,7 +3883,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		break;
 	case MSR_IA32_TSC:
 		if (msr_info->host_initiated) {
-			kvm_synchronize_tsc(vcpu, data);
+			kvm_synchronize_tsc(vcpu, &data);
 		} else {
 			u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
 			adjust_tsc_offset_guest(vcpu, adj);
@@ -4065,6 +4171,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_AMD64_BU_CFG2:
 	case MSR_IA32_PERF_CTL:
 	case MSR_AMD64_DC_CFG:
+	case MSR_AMD64_TW_CFG:
 	case MSR_F15H_EX_CFG:
 	/*
 	 * Intel Sandy Bridge CPUs must support the RAPL (running average power
@@ -5382,26 +5489,37 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
-static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
-					 struct kvm_xsave *guest_xsave)
-{
-	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
-		return;
-
-	fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu,
-				       guest_xsave->region,
-				       sizeof(guest_xsave->region),
-				       vcpu->arch.pkru);
-}
-
 static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
 					  u8 *state, unsigned int size)
 {
+	/*
+	 * Only copy state for features that are enabled for the guest.  The
+	 * state itself isn't problematic, but setting bits in the header for
+	 * features that are supported in *this* host but not exposed to the
+	 * guest can result in KVM_SET_XSAVE failing when live migrating to a
+	 * compatible host without the features that are NOT exposed to the
+	 * guest.
+	 *
+	 * FP+SSE can always be saved/restored via KVM_{G,S}ET_XSAVE, even if
+	 * XSAVE/XCR0 are not exposed to the guest, and even if XSAVE isn't
+	 * supported by the host.
+	 */
+	u64 supported_xcr0 = vcpu->arch.guest_supported_xcr0 |
+			     XFEATURE_MASK_FPSSE;
+
 	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
 		return;
 
-	fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu,
-				       state, size, vcpu->arch.pkru);
+	fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, state, size,
+				       supported_xcr0, vcpu->arch.pkru);
+}
+
+static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
+					 struct kvm_xsave *guest_xsave)
+{
+	return kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
+					     sizeof(guest_xsave->region));
 }
 
 static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
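To see the failure mode this avoids, consider a source host with an xfeature (say AMX) that the VM does not use: without the masking, the saved header could carry that host-only bit and the destination would reject the buffer. Roughly, on the destination side (hypothetical names, for illustration only):

	/* Sketch: the destination-side rejection that the masking prevents. */
	u64 unknown = hdr_xfeatures & ~(dest_supported_xcr0 | XFEATURE_MASK_FPSSE);
	if (unknown)
		return -EINVAL;	/* KVM_SET_XSAVE fails, migration aborts */
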
@@ -5536,6 +5654,7 @@ static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
 		tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
 		ns = get_kvmclock_base_ns();
 
+		kvm->arch.user_set_tsc = true;
 		__kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched);
 		raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 
@@ -6248,6 +6367,9 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
 	struct kvm_vcpu *vcpu;
 	unsigned long i;
 
+	if (!kvm_x86_ops.cpu_dirty_log_size)
+		return;
+
 	kvm_for_each_vcpu(i, vcpu, kvm)
 		kvm_vcpu_kick(vcpu);
 }
@@ -7474,11 +7596,11 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
 }
 EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
 
-static int kvm_can_emulate_insn(struct kvm_vcpu *vcpu, int emul_type,
-				void *insn, int insn_len)
+static int kvm_check_emulate_insn(struct kvm_vcpu *vcpu, int emul_type,
+				  void *insn, int insn_len)
 {
-	return static_call(kvm_x86_can_emulate_instruction)(vcpu, emul_type,
-							    insn, insn_len);
+	return static_call(kvm_x86_check_emulate_instruction)(vcpu, emul_type,
+							      insn, insn_len);
 }
 
 int handle_ud(struct kvm_vcpu *vcpu)
@@ -7488,8 +7610,10 @@ int handle_ud(struct kvm_vcpu *vcpu)
 	int emul_type = EMULTYPE_TRAP_UD;
 	char sig[5]; /* ud2; .ascii "kvm" */
 	struct x86_exception e;
+	int r;
 
-	if (unlikely(!kvm_can_emulate_insn(vcpu, emul_type, NULL, 0)))
+	r = kvm_check_emulate_insn(vcpu, emul_type, NULL, 0);
+	if (r != X86EMUL_CONTINUE)
 		return 1;
 
 	if (fep_flags &&
@@ -8871,8 +8995,14 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
 	bool writeback = true;
 
-	if (unlikely(!kvm_can_emulate_insn(vcpu, emulation_type, insn, insn_len)))
-		return 1;
+	r = kvm_check_emulate_insn(vcpu, emulation_type, insn, insn_len);
+	if (r != X86EMUL_CONTINUE) {
+		if (r == X86EMUL_RETRY_INSTR || r == X86EMUL_PROPAGATE_FAULT)
+			return 1;
+
+		WARN_ON_ONCE(r != X86EMUL_UNHANDLEABLE);
+		return handle_emulation_failure(vcpu, emulation_type);
+	}
 
 	vcpu->arch.l1tf_flush_l1d = true;
 
@@ -10576,16 +10706,16 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		}
 		if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
 			record_steal_time(vcpu);
+		if (kvm_check_request(KVM_REQ_PMU, vcpu))
+			kvm_pmu_handle_event(vcpu);
+		if (kvm_check_request(KVM_REQ_PMI, vcpu))
+			kvm_pmu_deliver_pmi(vcpu);
 #ifdef CONFIG_KVM_SMM
 		if (kvm_check_request(KVM_REQ_SMI, vcpu))
 			process_smi(vcpu);
 #endif
 		if (kvm_check_request(KVM_REQ_NMI, vcpu))
 			process_nmi(vcpu);
-		if (kvm_check_request(KVM_REQ_PMU, vcpu))
-			kvm_pmu_handle_event(vcpu);
-		if (kvm_check_request(KVM_REQ_PMI, vcpu))
-			kvm_pmu_deliver_pmi(vcpu);
 		if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
 			BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
 			if (test_bit(vcpu->arch.pending_ioapic_eoi,
@@ -11521,7 +11651,6 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
 
 	*mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
 	static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0);
-	vcpu->arch.cr0 = sregs->cr0;
 
 	*mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
 	static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4);
@@ -11565,8 +11694,10 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 	if (ret)
 		return ret;
 
-	if (mmu_reset_needed)
+	if (mmu_reset_needed) {
 		kvm_mmu_reset_context(vcpu);
+		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+	}
 
 	max_bits = KVM_NR_INTERRUPTS;
 	pending_vec = find_first_bit(
@@ -11607,8 +11738,10 @@ static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
 		mmu_reset_needed = 1;
 		vcpu->arch.pdptrs_from_userspace = true;
 	}
-	if (mmu_reset_needed)
+	if (mmu_reset_needed) {
 		kvm_mmu_reset_context(vcpu);
+		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+	}
 	return 0;
 }
 
@@ -11959,7 +12092,7 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 	if (mutex_lock_killable(&vcpu->mutex))
 		return;
 	vcpu_load(vcpu);
-	kvm_synchronize_tsc(vcpu, 0);
+	kvm_synchronize_tsc(vcpu, NULL);
 	vcpu_put(vcpu);
 
 	/* poll control enabled by default */
@@ -12315,7 +12448,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 		goto out_uninit_mmu;
 
 	INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
-	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 	atomic_set(&kvm->arch.noncoherent_dma_count, 0);
 
 	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
@@ -12843,6 +12975,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
 		return true;
 #endif
 
+	if (kvm_test_request(KVM_REQ_PMI, vcpu))
+		return true;
+
 	if (kvm_arch_interrupt_allowed(vcpu) &&
 	    (kvm_cpu_has_interrupt(vcpu) ||
 	     kvm_guest_apic_has_interrupt(vcpu)))
@@ -13188,15 +13323,30 @@ bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm)
 }
 EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
 
+static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm)
+{
+	/*
+	 * Non-coherent DMA assignment and de-assignment will affect
+	 * whether KVM honors guest MTRRs and cause changes in memtypes
+	 * in TDP.
+	 * So, pass %true unconditionally to indicate non-coherent DMA was,
+	 * or will be involved, and that zapping SPTEs might be necessary.
+	 */
+	if (__kvm_mmu_honors_guest_mtrrs(true))
+		kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL));
+}
+
 void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
 {
-	atomic_inc(&kvm->arch.noncoherent_dma_count);
+	if (atomic_inc_return(&kvm->arch.noncoherent_dma_count) == 1)
+		kvm_noncoherent_dma_assignment_start_or_stop(kvm);
 }
 EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma);
 
 void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
 {
-	atomic_dec(&kvm->arch.noncoherent_dma_count);
+	if (!atomic_dec_return(&kvm->arch.noncoherent_dma_count))
+		kvm_noncoherent_dma_assignment_start_or_stop(kvm);
 }
 EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma);
 
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 1e7be1f6ab29..5184fde1dc54 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -293,6 +293,7 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
 void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
 
 u64 get_kvmclock_ns(struct kvm *kvm);
+uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm);
 
 int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
 	gva_t addr, void *val, unsigned int bytes,
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 40edf4d1974c..e53fad915a62 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -59,7 +59,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
 	 * This code mirrors kvm_write_wall_clock() except that it writes
 	 * directly through the pfn cache and doesn't mark the page dirty.
 	 */
-	wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
+	wall_nsec = kvm_get_wall_clock_epoch(kvm);
 
 	/* It could be invalid again already, so we need to check */
 	read_lock_irq(&gpc->lock);
@@ -98,7 +98,7 @@
 	wc_version = wc->version = (wc->version + 1) | 1;
 	smp_wmb();
 
-	wc->nsec = do_div(wall_nsec, 1000000000);
+	wc->nsec = do_div(wall_nsec, NSEC_PER_SEC);
 	wc->sec = (u32)wall_nsec;
 	*wc_sec_hi = wall_nsec >> 32;
 	smp_wmb();
@@ -134,9 +134,23 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
 {
 	struct kvm_vcpu *vcpu = container_of(timer, struct kvm_vcpu,
 					     arch.xen.timer);
+	struct kvm_xen_evtchn e;
+	int rc;
+
 	if (atomic_read(&vcpu->arch.xen.timer_pending))
 		return HRTIMER_NORESTART;
 
+	e.vcpu_id = vcpu->vcpu_id;
+	e.vcpu_idx = vcpu->vcpu_idx;
+	e.port = vcpu->arch.xen.timer_virq;
+	e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+
+	rc = kvm_xen_set_evtchn_fast(&e, vcpu->kvm);
+	if (rc != -EWOULDBLOCK) {
+		vcpu->arch.xen.timer_expires = 0;
+		return HRTIMER_NORESTART;
+	}
+
 	atomic_inc(&vcpu->arch.xen.timer_pending);
 	kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
 	kvm_vcpu_kick(vcpu);
@@ -146,6 +160,14 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
 
 static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_ns)
 {
+	/*
+	 * Avoid races with the old timer firing. Checking timer_expires
+	 * to avoid calling hrtimer_cancel() will only have false positives
+	 * so is fine.
+	 */
+	if (vcpu->arch.xen.timer_expires)
+		hrtimer_cancel(&vcpu->arch.xen.timer);
+
 	atomic_set(&vcpu->arch.xen.timer_pending, 0);
 	vcpu->arch.xen.timer_expires = guest_abs;
 
@@ -1019,9 +1041,36 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
 		break;
 
 	case KVM_XEN_VCPU_ATTR_TYPE_TIMER:
+		/*
+		 * Ensure a consistent snapshot of state is captured, with a
+		 * timer either being pending, or the event channel delivered
+		 * to the corresponding bit in the shared_info. Not still
+		 * lurking in the timer_pending flag for deferred delivery.
+		 * Purely as an optimisation, if the timer_expires field is
+		 * zero, that means the timer isn't active (or even in the
+		 * timer_pending flag) and there is no need to cancel it.
+		 */
+		if (vcpu->arch.xen.timer_expires) {
+			hrtimer_cancel(&vcpu->arch.xen.timer);
+			kvm_xen_inject_timer_irqs(vcpu);
+		}
+
 		data->u.timer.port = vcpu->arch.xen.timer_virq;
 		data->u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
 		data->u.timer.expires_ns = vcpu->arch.xen.timer_expires;
+
+		/*
+		 * The hrtimer may trigger and raise the IRQ immediately,
+		 * while the returned state causes it to be set up and
+		 * raised again on the destination system after migration.
+		 * That's fine, as the guest won't even have had a chance
+		 * to run and handle the interrupt. Asserting an already
+		 * pending event channel is idempotent.
+		 */
+		if (vcpu->arch.xen.timer_expires)
+			hrtimer_start_expires(&vcpu->arch.xen.timer,
+					      HRTIMER_MODE_ABS_HARD);
+
 		r = 0;
 		break;
 
@@ -1374,12 +1423,8 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd,
 			return true;
 		}
 
+		/* A delta <= 0 results in an immediate callback, which is what we want */
 		delta = oneshot.timeout_abs_ns - get_kvmclock_ns(vcpu->kvm);
-		if ((oneshot.flags & VCPU_SSHOTTMR_future) && delta < 0) {
-			*r = -ETIME;
-			return true;
-		}
-
 		kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, delta);
 		*r = 0;
 		return true;
diff --git a/arch/xtensa/boot/Makefile b/arch/xtensa/boot/Makefile
index a65b7a9ebff2..d8b0fadf429a 100644
--- a/arch/xtensa/boot/Makefile
+++ b/arch/xtensa/boot/Makefile
@@ -9,8 +9,7 @@
 
 # KBUILD_CFLAGS used when building rest of boot (takes effect recursively)
-KBUILD_CFLAGS	+= -fno-builtin -Iarch/$(ARCH)/boot/include
-HOSTFLAGS	+= -Iarch/$(ARCH)/boot/include
+KBUILD_CFLAGS	+= -fno-builtin
 
 subdir-y	:= lib
 targets		+= vmlinux.bin vmlinux.bin.gz
diff --git a/arch/xtensa/boot/lib/zmem.c b/arch/xtensa/boot/lib/zmem.c
index e3ecd743c515..b89189355122 100644
--- a/arch/xtensa/boot/lib/zmem.c
+++ b/arch/xtensa/boot/lib/zmem.c
@@ -4,13 +4,14 @@
 
 /* bits taken from ppc */
 extern void *avail_ram, *end_avail;
+void gunzip(void *dst, int dstlen, unsigned char *src, int *lenp);
 
-void exit (void)
+static void exit(void)
 {
 	for (;;);
 }
 
-void *zalloc(unsigned size)
+static void *zalloc(unsigned int size)
 {
 	void *p = avail_ram;
 
diff --git a/arch/xtensa/include/asm/core.h b/arch/xtensa/include/asm/core.h
index 3f5ffae89b58..6f02f6f21890 100644
--- a/arch/xtensa/include/asm/core.h
+++ b/arch/xtensa/include/asm/core.h
@@ -6,6 +6,10 @@
 
 #include <variant/core.h>
 
+#ifndef XCHAL_HAVE_DIV32
+#define XCHAL_HAVE_DIV32 0
+#endif
+
 #ifndef XCHAL_HAVE_EXCLUSIVE
 #define XCHAL_HAVE_EXCLUSIVE 0
 #endif
diff --git a/arch/xtensa/include/asm/hw_breakpoint.h b/arch/xtensa/include/asm/hw_breakpoint.h
index 9f119c1ca0b5..9ec86f440a48 100644
--- a/arch/xtensa/include/asm/hw_breakpoint.h
+++ b/arch/xtensa/include/asm/hw_breakpoint.h
@@ -48,6 +48,7 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp);
 void hw_breakpoint_pmu_read(struct perf_event *bp);
 int check_hw_breakpoint(struct pt_regs *regs);
 void clear_ptrace_hw_breakpoint(struct task_struct *tsk);
+void restore_dbreak(void);
 
 #else
 
diff --git a/arch/xtensa/include/asm/processor.h b/arch/xtensa/include/asm/processor.h
index a6d09fe04831..d008a153a2b9 100644
--- a/arch/xtensa/include/asm/processor.h
+++ b/arch/xtensa/include/asm/processor.h
@@ -14,6 +14,8 @@
 
 #include <linux/compiler.h>
 #include <linux/stringify.h>
+
+#include <asm/bootparam.h>
 #include <asm/ptrace.h>
 #include <asm/types.h>
 #include <asm/regs.h>
@@ -217,6 +219,9 @@ struct mm_struct;
 
 extern unsigned long __get_wchan(struct task_struct *p);
 
+void init_arch(bp_tag_t *bp_start);
+void do_notify_resume(struct pt_regs *regs);
+
 #define KSTK_EIP(tsk)		(task_pt_regs(tsk)->pc)
 #define KSTK_ESP(tsk)		(task_pt_regs(tsk)->areg[1])
 
diff --git a/arch/xtensa/include/asm/ptrace.h b/arch/xtensa/include/asm/ptrace.h
index 308f209a4740..a270467556dc 100644
--- a/arch/xtensa/include/asm/ptrace.h
+++ b/arch/xtensa/include/asm/ptrace.h
@@ -106,6 +106,9 @@ static inline unsigned long regs_return_value(struct pt_regs *regs)
 	return regs->areg[2];
 }
 
+int do_syscall_trace_enter(struct pt_regs *regs);
+void do_syscall_trace_leave(struct pt_regs *regs);
+
 #else	/* __ASSEMBLY__ */
 
 # include <asm/asm-offsets.h>
diff --git a/arch/xtensa/include/asm/smp.h b/arch/xtensa/include/asm/smp.h
index 5dc5bf8cdd77..e446e6fc4557 100644
--- a/arch/xtensa/include/asm/smp.h
+++ b/arch/xtensa/include/asm/smp.h
@@ -23,6 +23,7 @@ struct cpumask;
 void arch_send_call_function_ipi_mask(const struct cpumask *mask);
 void arch_send_call_function_single_ipi(int cpu);
 
+void secondary_start_kernel(void);
 void smp_init_cpus(void);
 void secondary_init_irq(void);
 void ipi_init(void);
diff --git a/arch/xtensa/include/asm/tlb.h b/arch/xtensa/include/asm/tlb.h
index 50889935138a..8c3ceb427018 100644
--- a/arch/xtensa/include/asm/tlb.h
+++ b/arch/xtensa/include/asm/tlb.h
@@ -18,4 +18,6 @@
 
 #define __pte_free_tlb(tlb, pte, address)	pte_free((tlb)->mm, pte)
 
+void check_tlb_sanity(void);
+
 #endif /* _XTENSA_TLB_H */
diff --git a/arch/xtensa/kernel/hw_breakpoint.c b/arch/xtensa/kernel/hw_breakpoint.c
index 285fb2942b06..1eeecd58eb0c 100644
--- a/arch/xtensa/kernel/hw_breakpoint.c
+++ b/arch/xtensa/kernel/hw_breakpoint.c
@@ -13,6 +13,7 @@
 #include <linux/percpu.h>
 #include <linux/perf_event.h>
 #include <asm/core.h>
+#include <asm/hw_breakpoint.h>
 
 /* Breakpoint currently in use for each IBREAKA. */
 static DEFINE_PER_CPU(struct perf_event *, bp_on_reg[XCHAL_NUM_IBREAK]);
diff --git a/arch/xtensa/kernel/irq.c b/arch/xtensa/kernel/irq.c
index 42f106004400..b1e410f6b5ab 100644
--- a/arch/xtensa/kernel/irq.c
+++ b/arch/xtensa/kernel/irq.c
@@ -28,6 +28,7 @@
 #include <asm/mxregs.h>
 #include <linux/uaccess.h>
 #include <asm/platform.h>
+#include <asm/traps.h>
 
 DECLARE_PER_CPU(unsigned long, nmi_count);
 
diff --git a/arch/xtensa/kernel/ptrace.c b/arch/xtensa/kernel/ptrace.c
index f29477162ede..9056cd1a8302 100644
--- a/arch/xtensa/kernel/ptrace.c
+++ b/arch/xtensa/kernel/ptrace.c
@@ -541,7 +541,6 @@ long arch_ptrace(struct task_struct *child, long request,
 	return ret;
 }
 
-void do_syscall_trace_leave(struct pt_regs *regs);
 int do_syscall_trace_enter(struct pt_regs *regs)
 {
 	if (regs->syscall == NO_SYSCALL)
diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c
index 5c01d7e70d90..81f0b106cfc1 100644
--- a/arch/xtensa/kernel/signal.c
+++ b/arch/xtensa/kernel/signal.c
@@ -26,6 +26,8 @@
 #include <linux/uaccess.h>
 #include <asm/cacheflush.h>
 #include <asm/coprocessor.h>
+#include <asm/processor.h>
+#include <asm/syscall.h>
 #include <asm/unistd.h>
 
 extern struct task_struct *coproc_owners[];
 
diff --git a/arch/xtensa/kernel/smp.c b/arch/xtensa/kernel/smp.c
index 07dd6baf18cf..94a23f100726 100644
--- a/arch/xtensa/kernel/smp.c
+++ b/arch/xtensa/kernel/smp.c
@@ -21,6 +21,7 @@
 #include <linux/irq.h>
 #include <linux/kdebug.h>
 #include <linux/module.h>
+#include <linux/profile.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/hotplug.h>
 #include <linux/sched/task_stack.h>
diff --git a/arch/xtensa/kernel/stacktrace.c b/arch/xtensa/kernel/stacktrace.c
index f643ea5e36da..831ffb648bda 100644
--- a/arch/xtensa/kernel/stacktrace.c
+++ b/arch/xtensa/kernel/stacktrace.c
@@ -12,6 +12,7 @@
 #include <linux/sched.h>
 #include <linux/stacktrace.h>
 
+#include <asm/ftrace.h>
 #include <asm/stacktrace.h>
 #include <asm/traps.h>
 #include <linux/uaccess.h>
diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c
index 427c125a137a..38092d21acf8 100644
--- a/arch/xtensa/kernel/traps.c
+++ b/arch/xtensa/kernel/traps.c
@@ -23,6 +23,7 @@
  * for more details.
  */
 
+#include <linux/cpu.h>
 #include <linux/kernel.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/debug.h>
diff --git a/arch/xtensa/lib/umulsidi3.S b/arch/xtensa/lib/umulsidi3.S
index 8c7a94a0c5d0..5da501b57813 100644
--- a/arch/xtensa/lib/umulsidi3.S
+++ b/arch/xtensa/lib/umulsidi3.S
@@ -3,7 +3,9 @@
 #include <asm/asmmacro.h>
 #include <asm/core.h>
 
-#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#if XCHAL_HAVE_MUL16 || XCHAL_HAVE_MUL32 || XCHAL_HAVE_MAC16
+#define XCHAL_NO_MUL 0
+#else
 #define XCHAL_NO_MUL 1
 #endif
 
diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c
index d1eb8d6c5b82..16e11b6f6f78 100644
--- a/arch/xtensa/mm/fault.c
+++ b/arch/xtensa/mm/fault.c
@@ -20,6 +20,7 @@
 #include <asm/mmu_context.h>
 #include <asm/cacheflush.h>
 #include <asm/hardirq.h>
+#include <asm/traps.h>
 
 void bad_page_fault(struct pt_regs*, unsigned long, int);
 
diff --git a/arch/xtensa/mm/tlb.c b/arch/xtensa/mm/tlb.c
index 0a11fc5f185b..4f974b74883c 100644
--- a/arch/xtensa/mm/tlb.c
+++ b/arch/xtensa/mm/tlb.c
@@ -17,6 +17,7 @@
 #include <linux/mm.h>
 #include <asm/processor.h>
 #include <asm/mmu_context.h>
+#include <asm/tlb.h>
 #include <asm/tlbflush.h>
 #include <asm/cacheflush.h>
 
diff --git a/arch/xtensa/platforms/iss/network.c b/arch/xtensa/platforms/iss/network.c
index 85c82cd42188..e89f27f2bb18 100644
--- a/arch/xtensa/platforms/iss/network.c
+++ b/arch/xtensa/platforms/iss/network.c
@@ -201,7 +201,7 @@ static int tuntap_write(struct iss_net_private *lp, struct sk_buff **skb)
 	return simc_write(lp->tp.info.tuntap.fd, (*skb)->data, (*skb)->len);
 }
 
-unsigned short tuntap_protocol(struct sk_buff *skb)
+static unsigned short tuntap_protocol(struct sk_buff *skb)
 {
 	return eth_type_trans(skb, skb->dev);
 }
@@ -441,7 +441,7 @@ static int iss_net_change_mtu(struct net_device *dev, int new_mtu)
 	return -EINVAL;
 }
 
-void iss_net_user_timer_expire(struct timer_list *unused)
+static void iss_net_user_timer_expire(struct timer_list *unused)
 {
 }
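Note on the arch/x86/kvm/x86.c noncoherent-DMA hunk above: the helpers now act only on the first registration (count 0 -> 1) and the last unregistration (count 1 -> 0), so the full kvm_zap_gfn_range() zap is not repeated for every assigned device. Below is a minimal userspace sketch of that edge-triggered refcount pattern, using C11 atomics in place of the kernel's atomic_t; all names here are illustrative assumptions, not kernel API.

#include <stdatomic.h>
#include <stdio.h>

static atomic_int noncoherent_dma_count;

static void zap_all_memtypes(void)
{
	/* Stand-in for kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL)). */
	puts("zap SPTEs so memtypes are recomputed");
}

static void register_noncoherent_dma(void)
{
	/* atomic_fetch_add() returns the old value; old 0 means 0 -> 1. */
	if (atomic_fetch_add(&noncoherent_dma_count, 1) == 0)
		zap_all_memtypes();
}

static void unregister_noncoherent_dma(void)
{
	/* Old value 1 means the count just dropped 1 -> 0. */
	if (atomic_fetch_sub(&noncoherent_dma_count, 1) == 1)
		zap_all_memtypes();
}

int main(void)
{
	register_noncoherent_dma();	/* zaps: first device assigned */
	register_noncoherent_dma();	/* no zap */
	unregister_noncoherent_dma();	/* no zap */
	unregister_noncoherent_dma();	/* zaps: last device removed */
	return 0;
}

Acting only on the 0 <-> 1 transitions keeps the expensive zap off the common path when several noncoherent devices are assigned to the same VM.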