diff options
Diffstat (limited to 'arch')
196 files changed, 5820 insertions, 3522 deletions
diff --git a/arch/Kconfig b/arch/Kconfig index f78c2be4242..8d24bacaa61 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -144,9 +144,6 @@ config HAVE_CLK config HAVE_DMA_API_DEBUG bool -config HAVE_DEFAULT_NO_SPIN_MUTEXES - bool - config HAVE_HW_BREAKPOINT bool depends on PERF_EVENTS diff --git a/arch/arm/mach-at91/at91cap9_devices.c b/arch/arm/mach-at91/at91cap9_devices.c index 9ffbf3a2dfe..21020ceb2f3 100644 --- a/arch/arm/mach-at91/at91cap9_devices.c +++ b/arch/arm/mach-at91/at91cap9_devices.c @@ -171,7 +171,7 @@ void __init at91_add_device_usba(struct usba_platform_data *data) */ usba_udc_data.pdata.vbus_pin = -EINVAL; usba_udc_data.pdata.num_ep = ARRAY_SIZE(usba_udc_ep); - memcpy(usba_udc_data.ep, usba_udc_ep, sizeof(usba_udc_ep));; + memcpy(usba_udc_data.ep, usba_udc_ep, sizeof(usba_udc_ep)); if (data && data->vbus_pin > 0) { at91_set_gpio_input(data->vbus_pin, 0); diff --git a/arch/arm/mach-at91/at91sam9g45_devices.c b/arch/arm/mach-at91/at91sam9g45_devices.c index 1e8f275c17f..5e9f8a4c38d 100644 --- a/arch/arm/mach-at91/at91sam9g45_devices.c +++ b/arch/arm/mach-at91/at91sam9g45_devices.c @@ -256,7 +256,7 @@ void __init at91_add_device_usba(struct usba_platform_data *data) { usba_udc_data.pdata.vbus_pin = -EINVAL; usba_udc_data.pdata.num_ep = ARRAY_SIZE(usba_udc_ep); - memcpy(usba_udc_data.ep, usba_udc_ep, sizeof(usba_udc_ep));; + memcpy(usba_udc_data.ep, usba_udc_ep, sizeof(usba_udc_ep)); if (data && data->vbus_pin > 0) { at91_set_gpio_input(data->vbus_pin, 0); diff --git a/arch/arm/mach-at91/at91sam9rl_devices.c b/arch/arm/mach-at91/at91sam9rl_devices.c index 53aaa94df75..c49262bddd8 100644 --- a/arch/arm/mach-at91/at91sam9rl_devices.c +++ b/arch/arm/mach-at91/at91sam9rl_devices.c @@ -145,7 +145,7 @@ void __init at91_add_device_usba(struct usba_platform_data *data) */ usba_udc_data.pdata.vbus_pin = -EINVAL; usba_udc_data.pdata.num_ep = ARRAY_SIZE(usba_udc_ep); - memcpy(usba_udc_data.ep, usba_udc_ep, sizeof(usba_udc_ep));; + memcpy(usba_udc_data.ep, usba_udc_ep, sizeof(usba_udc_ep)); if (data && data->vbus_pin > 0) { at91_set_gpio_input(data->vbus_pin, 0); diff --git a/arch/arm/mach-exynos4/Kconfig b/arch/arm/mach-exynos4/Kconfig index e849f67be47..805196207ce 100644 --- a/arch/arm/mach-exynos4/Kconfig +++ b/arch/arm/mach-exynos4/Kconfig @@ -170,6 +170,7 @@ config MACH_NURI select S3C_DEV_HSMMC3 select S3C_DEV_I2C1 select S3C_DEV_I2C5 + select S5P_DEV_USB_EHCI select EXYNOS4_SETUP_I2C1 select EXYNOS4_SETUP_I2C5 select EXYNOS4_SETUP_SDHCI diff --git a/arch/arm/mach-exynos4/Makefile b/arch/arm/mach-exynos4/Makefile index 9be104f63c0..777897551e4 100644 --- a/arch/arm/mach-exynos4/Makefile +++ b/arch/arm/mach-exynos4/Makefile @@ -54,3 +54,5 @@ obj-$(CONFIG_EXYNOS4_SETUP_I2C7) += setup-i2c7.o obj-$(CONFIG_EXYNOS4_SETUP_KEYPAD) += setup-keypad.o obj-$(CONFIG_EXYNOS4_SETUP_SDHCI) += setup-sdhci.o obj-$(CONFIG_EXYNOS4_SETUP_SDHCI_GPIO) += setup-sdhci-gpio.o + +obj-$(CONFIG_USB_SUPPORT) += usb-phy.o diff --git a/arch/arm/mach-exynos4/cpu.c b/arch/arm/mach-exynos4/cpu.c index 79301139194..08813a6f66b 100644 --- a/arch/arm/mach-exynos4/cpu.c +++ b/arch/arm/mach-exynos4/cpu.c @@ -97,7 +97,12 @@ static struct map_desc exynos4_iodesc[] __initdata = { .pfn = __phys_to_pfn(EXYNOS4_PA_SROMC), .length = SZ_4K, .type = MT_DEVICE, - }, + }, { + .virtual = (unsigned long)S5P_VA_USB_HSPHY, + .pfn = __phys_to_pfn(EXYNOS4_PA_HSPHY), + .length = SZ_4K, + .type = MT_DEVICE, + } }; static void exynos4_idle(void) diff --git a/arch/arm/mach-exynos4/include/mach/map.h b/arch/arm/mach-exynos4/include/mach/map.h index 6330b73b9ea..0009e77a05f 100644 --- a/arch/arm/mach-exynos4/include/mach/map.h +++ b/arch/arm/mach-exynos4/include/mach/map.h @@ -101,6 +101,9 @@ #define EXYNOS4_PA_SROMC 0x12570000 +#define EXYNOS4_PA_EHCI 0x12580000 +#define EXYNOS4_PA_HSPHY 0x125B0000 + #define EXYNOS4_PA_UART 0x13800000 #define EXYNOS4_PA_IIC(x) (0x13860000 + ((x) * 0x10000)) @@ -143,6 +146,7 @@ #define S5P_PA_SROMC EXYNOS4_PA_SROMC #define S5P_PA_SYSCON EXYNOS4_PA_SYSCON #define S5P_PA_TIMER EXYNOS4_PA_TIMER +#define S5P_PA_EHCI EXYNOS4_PA_EHCI #define SAMSUNG_PA_KEYPAD EXYNOS4_PA_KEYPAD diff --git a/arch/arm/mach-exynos4/include/mach/regs-pmu.h b/arch/arm/mach-exynos4/include/mach/regs-pmu.h index 62b0014d05e..a9643371f8e 100644 --- a/arch/arm/mach-exynos4/include/mach/regs-pmu.h +++ b/arch/arm/mach-exynos4/include/mach/regs-pmu.h @@ -33,6 +33,9 @@ #define S5P_EINT_WAKEUP_MASK S5P_PMUREG(0x0604) #define S5P_WAKEUP_MASK S5P_PMUREG(0x0608) +#define S5P_USBHOST_PHY_CONTROL S5P_PMUREG(0x0708) +#define S5P_USBHOST_PHY_ENABLE (1 << 0) + #define S5P_MIPI_DPHY_CONTROL(n) S5P_PMUREG(0x0710 + (n) * 4) #define S5P_MIPI_DPHY_ENABLE (1 << 0) #define S5P_MIPI_DPHY_SRESETN (1 << 1) diff --git a/arch/arm/mach-exynos4/include/mach/regs-usb-phy.h b/arch/arm/mach-exynos4/include/mach/regs-usb-phy.h new file mode 100644 index 00000000000..703118d5173 --- /dev/null +++ b/arch/arm/mach-exynos4/include/mach/regs-usb-phy.h @@ -0,0 +1,64 @@ +/* + * Copyright (C) 2011 Samsung Electronics Co.Ltd + * Author: Joonyoung Shim <jy0922.shim@samsung.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#ifndef __PLAT_S5P_REGS_USB_PHY_H +#define __PLAT_S5P_REGS_USB_PHY_H + +#define EXYNOS4_HSOTG_PHYREG(x) ((x) + S5P_VA_USB_HSPHY) + +#define EXYNOS4_PHYPWR EXYNOS4_HSOTG_PHYREG(0x00) +#define PHY1_HSIC_NORMAL_MASK (0xf << 9) +#define PHY1_HSIC1_SLEEP (1 << 12) +#define PHY1_HSIC1_FORCE_SUSPEND (1 << 11) +#define PHY1_HSIC0_SLEEP (1 << 10) +#define PHY1_HSIC0_FORCE_SUSPEND (1 << 9) + +#define PHY1_STD_NORMAL_MASK (0x7 << 6) +#define PHY1_STD_SLEEP (1 << 8) +#define PHY1_STD_ANALOG_POWERDOWN (1 << 7) +#define PHY1_STD_FORCE_SUSPEND (1 << 6) + +#define PHY0_NORMAL_MASK (0x39 << 0) +#define PHY0_SLEEP (1 << 5) +#define PHY0_OTG_DISABLE (1 << 4) +#define PHY0_ANALOG_POWERDOWN (1 << 3) +#define PHY0_FORCE_SUSPEND (1 << 0) + +#define EXYNOS4_PHYCLK EXYNOS4_HSOTG_PHYREG(0x04) +#define PHY1_COMMON_ON_N (1 << 7) +#define PHY0_COMMON_ON_N (1 << 4) +#define PHY0_ID_PULLUP (1 << 2) +#define CLKSEL_MASK (0x3 << 0) +#define CLKSEL_SHIFT (0) +#define CLKSEL_48M (0x0 << 0) +#define CLKSEL_12M (0x2 << 0) +#define CLKSEL_24M (0x3 << 0) + +#define EXYNOS4_RSTCON EXYNOS4_HSOTG_PHYREG(0x08) +#define HOST_LINK_PORT_SWRST_MASK (0xf << 6) +#define HOST_LINK_PORT2_SWRST (1 << 9) +#define HOST_LINK_PORT1_SWRST (1 << 8) +#define HOST_LINK_PORT0_SWRST (1 << 7) +#define HOST_LINK_ALL_SWRST (1 << 6) + +#define PHY1_SWRST_MASK (0x7 << 3) +#define PHY1_HSIC_SWRST (1 << 5) +#define PHY1_STD_SWRST (1 << 4) +#define PHY1_ALL_SWRST (1 << 3) + +#define PHY0_SWRST_MASK (0x7 << 0) +#define PHY0_PHYLINK_SWRST (1 << 2) +#define PHY0_HLINK_SWRST (1 << 1) +#define PHY0_SWRST (1 << 0) + +#define EXYNOS4_PHY1CON EXYNOS4_HSOTG_PHYREG(0x34) +#define FPENABLEN (1 << 0) + +#endif /* __PLAT_S5P_REGS_USB_PHY_H */ diff --git a/arch/arm/mach-exynos4/mach-nuri.c b/arch/arm/mach-exynos4/mach-nuri.c index b79ad010d19..bb5d12f43af 100644 --- a/arch/arm/mach-exynos4/mach-nuri.c +++ b/arch/arm/mach-exynos4/mach-nuri.c @@ -30,6 +30,8 @@ #include <plat/cpu.h> #include <plat/devs.h> #include <plat/sdhci.h> +#include <plat/ehci.h> +#include <plat/clock.h> #include <mach/map.h> @@ -262,6 +264,16 @@ static struct i2c_board_info i2c5_devs[] __initdata = { /* max8997, To be updated */ }; +/* USB EHCI */ +static struct s5p_ehci_platdata nuri_ehci_pdata; + +static void __init nuri_ehci_init(void) +{ + struct s5p_ehci_platdata *pdata = &nuri_ehci_pdata; + + s5p_ehci_set_platdata(pdata); +} + static struct platform_device *nuri_devices[] __initdata = { /* Samsung Platform Devices */ &emmc_fixed_voltage, @@ -270,6 +282,7 @@ static struct platform_device *nuri_devices[] __initdata = { &s3c_device_hsmmc3, &s3c_device_wdt, &s3c_device_timer[0], + &s5p_device_ehci, /* NURI Devices */ &nuri_gpio_keys, @@ -291,6 +304,9 @@ static void __init nuri_machine_init(void) i2c_register_board_info(1, i2c1_devs, ARRAY_SIZE(i2c1_devs)); i2c_register_board_info(5, i2c5_devs, ARRAY_SIZE(i2c5_devs)); + nuri_ehci_init(); + clk_xusbxti.rate = 24000000; + /* Last */ platform_add_devices(nuri_devices, ARRAY_SIZE(nuri_devices)); } diff --git a/arch/arm/mach-exynos4/usb-phy.c b/arch/arm/mach-exynos4/usb-phy.c new file mode 100644 index 00000000000..0883c1b824b --- /dev/null +++ b/arch/arm/mach-exynos4/usb-phy.c @@ -0,0 +1,136 @@ +/* + * Copyright (C) 2011 Samsung Electronics Co.Ltd + * Author: Joonyoung Shim <jy0922.shim@samsung.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + */ + +#include <linux/clk.h> +#include <linux/delay.h> +#include <linux/err.h> +#include <linux/io.h> +#include <linux/platform_device.h> +#include <mach/regs-pmu.h> +#include <mach/regs-usb-phy.h> +#include <plat/cpu.h> +#include <plat/usb-phy.h> + +static int exynos4_usb_phy1_init(struct platform_device *pdev) +{ + struct clk *otg_clk; + struct clk *xusbxti_clk; + u32 phyclk; + u32 rstcon; + int err; + + otg_clk = clk_get(&pdev->dev, "otg"); + if (IS_ERR(otg_clk)) { + dev_err(&pdev->dev, "Failed to get otg clock\n"); + return PTR_ERR(otg_clk); + } + + err = clk_enable(otg_clk); + if (err) { + clk_put(otg_clk); + return err; + } + + writel(readl(S5P_USBHOST_PHY_CONTROL) | S5P_USBHOST_PHY_ENABLE, + S5P_USBHOST_PHY_CONTROL); + + /* set clock frequency for PLL */ + phyclk = readl(EXYNOS4_PHYCLK) & ~CLKSEL_MASK; + + xusbxti_clk = clk_get(&pdev->dev, "xusbxti"); + if (xusbxti_clk && !IS_ERR(xusbxti_clk)) { + switch (clk_get_rate(xusbxti_clk)) { + case 12 * MHZ: + phyclk |= CLKSEL_12M; + break; + case 24 * MHZ: + phyclk |= CLKSEL_24M; + break; + default: + case 48 * MHZ: + /* default reference clock */ + break; + } + clk_put(xusbxti_clk); + } + + writel(phyclk, EXYNOS4_PHYCLK); + + /* floating prevention logic: disable */ + writel((readl(EXYNOS4_PHY1CON) | FPENABLEN), EXYNOS4_PHY1CON); + + /* set to normal HSIC 0 and 1 of PHY1 */ + writel((readl(EXYNOS4_PHYPWR) & ~PHY1_HSIC_NORMAL_MASK), + EXYNOS4_PHYPWR); + + /* set to normal standard USB of PHY1 */ + writel((readl(EXYNOS4_PHYPWR) & ~PHY1_STD_NORMAL_MASK), EXYNOS4_PHYPWR); + + /* reset all ports of both PHY and Link */ + rstcon = readl(EXYNOS4_RSTCON) | HOST_LINK_PORT_SWRST_MASK | + PHY1_SWRST_MASK; + writel(rstcon, EXYNOS4_RSTCON); + udelay(10); + + rstcon &= ~(HOST_LINK_PORT_SWRST_MASK | PHY1_SWRST_MASK); + writel(rstcon, EXYNOS4_RSTCON); + udelay(50); + + clk_disable(otg_clk); + clk_put(otg_clk); + + return 0; +} + +static int exynos4_usb_phy1_exit(struct platform_device *pdev) +{ + struct clk *otg_clk; + int err; + + otg_clk = clk_get(&pdev->dev, "otg"); + if (IS_ERR(otg_clk)) { + dev_err(&pdev->dev, "Failed to get otg clock\n"); + return PTR_ERR(otg_clk); + } + + err = clk_enable(otg_clk); + if (err) { + clk_put(otg_clk); + return err; + } + + writel((readl(EXYNOS4_PHYPWR) | PHY1_STD_ANALOG_POWERDOWN), + EXYNOS4_PHYPWR); + + writel(readl(S5P_USBHOST_PHY_CONTROL) & ~S5P_USBHOST_PHY_ENABLE, + S5P_USBHOST_PHY_CONTROL); + + clk_disable(otg_clk); + clk_put(otg_clk); + + return 0; +} + +int s5p_usb_phy_init(struct platform_device *pdev, int type) +{ + if (type == S5P_USB_PHY_HOST) + return exynos4_usb_phy1_init(pdev); + + return -EINVAL; +} + +int s5p_usb_phy_exit(struct platform_device *pdev, int type) +{ + if (type == S5P_USB_PHY_HOST) + return exynos4_usb_phy1_exit(pdev); + + return -EINVAL; +} diff --git a/arch/arm/mach-msm/include/mach/msm_iomap.h b/arch/arm/mach-msm/include/mach/msm_iomap.h index c98c7591f3b..2f494b6a9d0 100644 --- a/arch/arm/mach-msm/include/mach/msm_iomap.h +++ b/arch/arm/mach-msm/include/mach/msm_iomap.h @@ -55,7 +55,7 @@ #include "msm_iomap-8960.h" -/* Virtual addressses shared across all MSM targets. */ +/* Virtual addresses shared across all MSM targets. */ #define MSM_CSR_BASE IOMEM(0xE0001000) #define MSM_QGIC_DIST_BASE IOMEM(0xF0000000) #define MSM_QGIC_CPU_BASE IOMEM(0xF0001000) diff --git a/arch/arm/mach-omap2/control.h b/arch/arm/mach-omap2/control.h index c2804c1c4ef..a016c8b59e0 100644 --- a/arch/arm/mach-omap2/control.h +++ b/arch/arm/mach-omap2/control.h @@ -236,7 +236,7 @@ #define OMAP343X_CONTROL_WKUP_DEBOBS3 (OMAP343X_CONTROL_GENERAL_WKUP + 0x014) #define OMAP343X_CONTROL_WKUP_DEBOBS4 (OMAP343X_CONTROL_GENERAL_WKUP + 0x018) -/* 36xx-only RTA - Retention till Accesss control registers and bits */ +/* 36xx-only RTA - Retention till Access control registers and bits */ #define OMAP36XX_CONTROL_MEM_RTA_CTRL 0x40C #define OMAP36XX_RTA_DISABLE 0x0 diff --git a/arch/arm/mach-s3c2410/include/mach/map.h b/arch/arm/mach-s3c2410/include/mach/map.h index 25bbf5a942d..425552d84b6 100644 --- a/arch/arm/mach-s3c2410/include/mach/map.h +++ b/arch/arm/mach-s3c2410/include/mach/map.h @@ -21,6 +21,10 @@ /* USB host controller */ #define S3C2410_PA_USBHOST (0x49000000) +/* S3C2416/S3C2443/S3C2450 High-Speed USB Gadget */ +#define S3C2416_PA_HSUDC (0x49800000) +#define S3C2416_SZ_HSUDC (SZ_4K) + /* DMA controller */ #define S3C2410_PA_DMA (0x4B000000) #define S3C24XX_SZ_DMA SZ_1M diff --git a/arch/arm/mach-s3c2410/include/mach/regs-s3c2443-clock.h b/arch/arm/mach-s3c2410/include/mach/regs-s3c2443-clock.h index 44494a56e68..5e06c726583 100644 --- a/arch/arm/mach-s3c2410/include/mach/regs-s3c2443-clock.h +++ b/arch/arm/mach-s3c2410/include/mach/regs-s3c2443-clock.h @@ -37,6 +37,10 @@ #define S3C2443_SYSID S3C2443_CLKREG(0x5C) #define S3C2443_PWRCFG S3C2443_CLKREG(0x60) #define S3C2443_RSTCON S3C2443_CLKREG(0x64) +#define S3C2443_PHYCTRL S3C2443_CLKREG(0x80) +#define S3C2443_PHYPWR S3C2443_CLKREG(0x84) +#define S3C2443_URSTCON S3C2443_CLKREG(0x88) +#define S3C2443_UCLKCON S3C2443_CLKREG(0x8C) #define S3C2443_SWRST_RESET (0x533c2443) @@ -121,6 +125,27 @@ #define S3C2443_PWRCFG_SLEEP (1<<15) +#define S3C2443_PWRCFG_USBPHY (1 << 4) + +#define S3C2443_URSTCON_FUNCRST (1 << 2) +#define S3C2443_URSTCON_PHYRST (1 << 0) + +#define S3C2443_PHYCTRL_CLKSEL (1 << 3) +#define S3C2443_PHYCTRL_EXTCLK (1 << 2) +#define S3C2443_PHYCTRL_PLLSEL (1 << 1) +#define S3C2443_PHYCTRL_DSPORT (1 << 0) + +#define S3C2443_PHYPWR_COMMON_ON (1 << 31) +#define S3C2443_PHYPWR_ANALOG_PD (1 << 4) +#define S3C2443_PHYPWR_PLL_REFCLK (1 << 3) +#define S3C2443_PHYPWR_XO_ON (1 << 2) +#define S3C2443_PHYPWR_PLL_PWRDN (1 << 1) +#define S3C2443_PHYPWR_FSUSPEND (1 << 0) + +#define S3C2443_UCLKCON_DETECT_VBUS (1 << 31) +#define S3C2443_UCLKCON_FUNC_CLKEN (1 << 2) +#define S3C2443_UCLKCON_TCLKEN (1 << 0) + #include <asm/div64.h> static inline unsigned int diff --git a/arch/arm/mach-s3c2416/mach-smdk2416.c b/arch/arm/mach-s3c2416/mach-smdk2416.c index 3f83177246c..ac27ebb31c9 100644 --- a/arch/arm/mach-s3c2416/mach-smdk2416.c +++ b/arch/arm/mach-s3c2416/mach-smdk2416.c @@ -23,6 +23,7 @@ #include <linux/mtd/partitions.h> #include <linux/gpio.h> #include <linux/fb.h> +#include <linux/delay.h> #include <asm/mach/arch.h> #include <asm/mach/map.h> @@ -35,6 +36,7 @@ #include <plat/regs-serial.h> #include <mach/regs-gpio.h> #include <mach/regs-lcd.h> +#include <mach/regs-s3c2443-clock.h> #include <mach/idle.h> #include <mach/leds-gpio.h> @@ -47,6 +49,7 @@ #include <plat/cpu.h> #include <plat/nand.h> #include <plat/sdhci.h> +#include <plat/udc.h> #include <plat/regs-fb-v4.h> #include <plat/fb.h> @@ -121,6 +124,27 @@ static struct s3c2410_uartcfg smdk2416_uartcfgs[] __initdata = { } }; +void smdk2416_hsudc_gpio_init(void) +{ + s3c_gpio_setpull(S3C2410_GPH(14), S3C_GPIO_PULL_UP); + s3c_gpio_setpull(S3C2410_GPF(2), S3C_GPIO_PULL_NONE); + s3c_gpio_cfgpin(S3C2410_GPH(14), S3C_GPIO_SFN(1)); + s3c2410_modify_misccr(S3C2416_MISCCR_SEL_SUSPND, 0); +} + +void smdk2416_hsudc_gpio_uninit(void) +{ + s3c2410_modify_misccr(S3C2416_MISCCR_SEL_SUSPND, 1); + s3c_gpio_setpull(S3C2410_GPH(14), S3C_GPIO_PULL_NONE); + s3c_gpio_cfgpin(S3C2410_GPH(14), S3C_GPIO_SFN(0)); +} + +struct s3c24xx_hsudc_platdata smdk2416_hsudc_platdata = { + .epnum = 9, + .gpio_init = smdk2416_hsudc_gpio_init, + .gpio_uninit = smdk2416_hsudc_gpio_uninit, +}; + struct s3c_fb_pd_win smdk2416_fb_win[] = { [0] = { /* think this is the same as the smdk6410 */ @@ -186,6 +210,7 @@ static struct platform_device *smdk2416_devices[] __initdata = { &s3c_device_i2c0, &s3c_device_hsmmc0, &s3c_device_hsmmc1, + &s3c_device_usb_hsudc, }; static void __init smdk2416_map_io(void) @@ -203,6 +228,8 @@ static void __init smdk2416_machine_init(void) s3c_sdhci0_set_platdata(&smdk2416_hsmmc0_pdata); s3c_sdhci1_set_platdata(&smdk2416_hsmmc1_pdata); + s3c24xx_hsudc_set_platdata(&smdk2416_hsudc_platdata); + gpio_request(S3C2410_GPB(4), "USBHost Power"); gpio_direction_output(S3C2410_GPB(4), 1); diff --git a/arch/arm/mach-tegra/Kconfig b/arch/arm/mach-tegra/Kconfig index 3cdeffc97b4..5ec1846aa1d 100644 --- a/arch/arm/mach-tegra/Kconfig +++ b/arch/arm/mach-tegra/Kconfig @@ -27,12 +27,14 @@ comment "Tegra board type" config MACH_HARMONY bool "Harmony board" + select MACH_HAS_SND_SOC_TEGRA_WM8903 help Support for nVidia Harmony development platform config MACH_KAEN bool "Kaen board" select MACH_SEABOARD + select MACH_HAS_SND_SOC_TEGRA_WM8903 help Support for the Kaen version of Seaboard @@ -43,6 +45,7 @@ config MACH_PAZ00 config MACH_SEABOARD bool "Seaboard board" + select MACH_HAS_SND_SOC_TEGRA_WM8903 help Support for nVidia Seaboard development platform. It will also be included for some of the derivative boards that diff --git a/arch/arm/mach-tegra/board-harmony.c b/arch/arm/mach-tegra/board-harmony.c index 75c918a86a3..30e18bc6064 100644 --- a/arch/arm/mach-tegra/board-harmony.c +++ b/arch/arm/mach-tegra/board-harmony.c @@ -34,7 +34,7 @@ #include <asm/mach/time.h> #include <asm/setup.h> -#include <mach/harmony_audio.h> +#include <mach/tegra_wm8903_pdata.h> #include <mach/iomap.h> #include <mach/irqs.h> #include <mach/sdhci.h> @@ -67,15 +67,16 @@ static struct platform_device debug_uart = { }, }; -static struct harmony_audio_platform_data harmony_audio_pdata = { +static struct tegra_wm8903_platform_data harmony_audio_pdata = { .gpio_spkr_en = TEGRA_GPIO_SPKR_EN, .gpio_hp_det = TEGRA_GPIO_HP_DET, + .gpio_hp_mute = -1, .gpio_int_mic_en = TEGRA_GPIO_INT_MIC_EN, .gpio_ext_mic_en = TEGRA_GPIO_EXT_MIC_EN, }; static struct platform_device harmony_audio_device = { - .name = "tegra-snd-harmony", + .name = "tegra-snd-wm8903", .id = 0, .dev = { .platform_data = &harmony_audio_pdata, diff --git a/arch/arm/mach-tegra/include/mach/harmony_audio.h b/arch/arm/mach-tegra/include/mach/tegra_wm8903_pdata.h index af086500ab7..9d293344a7f 100644 --- a/arch/arm/mach-tegra/include/mach/harmony_audio.h +++ b/arch/arm/mach-tegra/include/mach/tegra_wm8903_pdata.h @@ -1,5 +1,5 @@ /* - * arch/arm/mach-tegra/include/mach/harmony_audio.h + * arch/arm/mach-tegra/include/mach/tegra_wm8903_pdata.h * * Copyright 2011 NVIDIA, Inc. * @@ -14,9 +14,10 @@ * */ -struct harmony_audio_platform_data { +struct tegra_wm8903_platform_data { int gpio_spkr_en; int gpio_hp_det; + int gpio_hp_mute; int gpio_int_mic_en; int gpio_ext_mic_en; }; diff --git a/arch/arm/mach-tegra/tegra2_clocks.c b/arch/arm/mach-tegra/tegra2_clocks.c index 4459470c052..bb618075fab 100644 --- a/arch/arm/mach-tegra/tegra2_clocks.c +++ b/arch/arm/mach-tegra/tegra2_clocks.c @@ -337,7 +337,7 @@ static int tegra2_super_clk_set_parent(struct clk *c, struct clk *p) const struct clk_mux_sel *sel; int shift; - val = clk_readl(c->reg + SUPER_CLK_MUX);; + val = clk_readl(c->reg + SUPER_CLK_MUX); BUG_ON(((val & SUPER_STATE_MASK) != SUPER_STATE_RUN) && ((val & SUPER_STATE_MASK) != SUPER_STATE_IDLE)); shift = ((val & SUPER_STATE_MASK) == SUPER_STATE_IDLE) ? diff --git a/arch/arm/mach-ux500/mbox-db5500.c b/arch/arm/mach-ux500/mbox-db5500.c index a4ffb9f4f46..2b2d51caf9d 100644 --- a/arch/arm/mach-ux500/mbox-db5500.c +++ b/arch/arm/mach-ux500/mbox-db5500.c @@ -416,8 +416,7 @@ struct mbox *mbox_setup(u8 mbox_id, mbox_recv_cb_t *mbox_cb, void *priv) dev_dbg(&(mbox->pdev->dev), "Resource name: %s start: 0x%X, end: 0x%X\n", resource->name, resource->start, resource->end); - mbox->virtbase_peer = - ioremap(resource->start, resource->end - resource->start); + mbox->virtbase_peer = ioremap(resource->start, resource_size(resource)); if (!mbox->virtbase_peer) { dev_err(&(mbox->pdev->dev), "Unable to ioremap peer mbox\n"); mbox = NULL; @@ -440,8 +439,7 @@ struct mbox *mbox_setup(u8 mbox_id, mbox_recv_cb_t *mbox_cb, void *priv) dev_dbg(&(mbox->pdev->dev), "Resource name: %s start: 0x%X, end: 0x%X\n", resource->name, resource->start, resource->end); - mbox->virtbase_local = - ioremap(resource->start, resource->end - resource->start); + mbox->virtbase_local = ioremap(resource->start, resource_size(resource)); if (!mbox->virtbase_local) { dev_err(&(mbox->pdev->dev), "Unable to ioremap local mbox\n"); mbox = NULL; diff --git a/arch/arm/plat-iop/time.c b/arch/arm/plat-iop/time.c index 07f23bb42be..7cdc5161ff2 100644 --- a/arch/arm/plat-iop/time.c +++ b/arch/arm/plat-iop/time.c @@ -17,7 +17,6 @@ #include <linux/interrupt.h> #include <linux/time.h> #include <linux/init.h> -#include <linux/sched.h> #include <linux/timex.h> #include <linux/sched.h> #include <linux/io.h> diff --git a/arch/arm/plat-mxc/cpufreq.c b/arch/arm/plat-mxc/cpufreq.c index 4268a2bdf14..74aac96cda2 100644 --- a/arch/arm/plat-mxc/cpufreq.c +++ b/arch/arm/plat-mxc/cpufreq.c @@ -153,8 +153,8 @@ static int __init mxc_cpufreq_init(struct cpufreq_policy *policy) ret = cpufreq_frequency_table_cpuinfo(policy, imx_freq_table); if (ret < 0) { - printk(KERN_ERR "%s: failed to register i.MXC CPUfreq \ - with error code %d\n", __func__, ret); + printk(KERN_ERR "%s: failed to register i.MXC CPUfreq with error code %d\n", + __func__, ret); goto err; } diff --git a/arch/arm/plat-s3c24xx/devs.c b/arch/arm/plat-s3c24xx/devs.c index 268f3ed0a10..73667994518 100644 --- a/arch/arm/plat-s3c24xx/devs.c +++ b/arch/arm/plat-s3c24xx/devs.c @@ -22,6 +22,7 @@ #include <linux/io.h> #include <linux/slab.h> #include <linux/string.h> +#include <linux/dma-mapping.h> #include <asm/mach/arch.h> #include <asm/mach/map.h> @@ -233,6 +234,46 @@ void __init s3c24xx_udc_set_platdata(struct s3c2410_udc_mach_info *pd) } } +/* USB High Speed 2.0 Device (Gadget) */ +static struct resource s3c_hsudc_resource[] = { + [0] = { + .start = S3C2416_PA_HSUDC, + .end = S3C2416_PA_HSUDC + S3C2416_SZ_HSUDC - 1, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = IRQ_USBD, + .end = IRQ_USBD, + .flags = IORESOURCE_IRQ, + } +}; + +static u64 s3c_hsudc_dmamask = DMA_BIT_MASK(32); + +struct platform_device s3c_device_usb_hsudc = { + .name = "s3c-hsudc", + .id = -1, + .num_resources = ARRAY_SIZE(s3c_hsudc_resource), + .resource = s3c_hsudc_resource, + .dev = { + .dma_mask = &s3c_hsudc_dmamask, + .coherent_dma_mask = DMA_BIT_MASK(32), + }, +}; + +void __init s3c24xx_hsudc_set_platdata(struct s3c24xx_hsudc_platdata *pd) +{ + struct s3c24xx_hsudc_platdata *npd; + + npd = kmalloc(sizeof(*npd), GFP_KERNEL); + if (npd) { + memcpy(npd, pd, sizeof(*npd)); + s3c_device_usb_hsudc.dev.platform_data = npd; + } else { + printk(KERN_ERR "no memory for udc platform data\n"); + } +} + /* IIS */ static struct resource s3c_iis_resource[] = { diff --git a/arch/arm/plat-s3c24xx/include/plat/udc.h b/arch/arm/plat-s3c24xx/include/plat/udc.h index 80457c6414a..f6388424250 100644 --- a/arch/arm/plat-s3c24xx/include/plat/udc.h +++ b/arch/arm/plat-s3c24xx/include/plat/udc.h @@ -37,4 +37,21 @@ struct s3c2410_udc_mach_info { extern void __init s3c24xx_udc_set_platdata(struct s3c2410_udc_mach_info *); +/** + * s3c24xx_hsudc_platdata - Platform data for USB High-Speed gadget controller. + * @epnum: Number of endpoints to be instantiated by the controller driver. + * @gpio_init: Platform specific USB related GPIO initialization. + * @gpio_uninit: Platform specific USB releted GPIO uninitialzation. + * + * Representation of platform data for the S3C24XX USB 2.0 High Speed gadget + * controllers. + */ +struct s3c24xx_hsudc_platdata { + unsigned int epnum; + void (*gpio_init)(void); + void (*gpio_uninit)(void); +}; + +extern void __init s3c24xx_hsudc_set_platdata(struct s3c24xx_hsudc_platdata *pd); + #endif /* __ASM_ARM_ARCH_UDC_H */ diff --git a/arch/arm/plat-s5p/Kconfig b/arch/arm/plat-s5p/Kconfig index 84922971658..6751bcf7b88 100644 --- a/arch/arm/plat-s5p/Kconfig +++ b/arch/arm/plat-s5p/Kconfig @@ -85,6 +85,11 @@ config S5P_DEV_CSIS1 help Compile in platform device definitions for MIPI-CSIS channel 1 +config S5P_DEV_USB_EHCI + bool + help + Compile in platform device definition for USB EHCI + config S5P_SETUP_MIPIPHY bool help diff --git a/arch/arm/plat-s5p/Makefile b/arch/arm/plat-s5p/Makefile index 42afff7f60b..e234cc4d49a 100644 --- a/arch/arm/plat-s5p/Makefile +++ b/arch/arm/plat-s5p/Makefile @@ -33,4 +33,5 @@ obj-$(CONFIG_S5P_DEV_FIMC3) += dev-fimc3.o obj-$(CONFIG_S5P_DEV_ONENAND) += dev-onenand.o obj-$(CONFIG_S5P_DEV_CSIS0) += dev-csis0.o obj-$(CONFIG_S5P_DEV_CSIS1) += dev-csis1.o +obj-$(CONFIG_S5P_DEV_USB_EHCI) += dev-ehci.o obj-$(CONFIG_S5P_SETUP_MIPIPHY) += setup-mipiphy.o diff --git a/arch/arm/plat-s5p/dev-ehci.c b/arch/arm/plat-s5p/dev-ehci.c new file mode 100644 index 00000000000..94080fff9e9 --- /dev/null +++ b/arch/arm/plat-s5p/dev-ehci.c @@ -0,0 +1,57 @@ +/* + * Copyright (C) 2011 Samsung Electronics Co.Ltd + * Author: Joonyoung Shim <jy0922.shim@samsung.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + */ + +#include <linux/platform_device.h> +#include <mach/irqs.h> +#include <mach/map.h> +#include <plat/devs.h> +#include <plat/ehci.h> +#include <plat/usb-phy.h> + +/* USB EHCI Host Controller registration */ +static struct resource s5p_ehci_resource[] = { + [0] = { + .start = S5P_PA_EHCI, + .end = S5P_PA_EHCI + SZ_256 - 1, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = IRQ_USB_HOST, + .end = IRQ_USB_HOST, + .flags = IORESOURCE_IRQ, + } +}; + +static u64 s5p_device_ehci_dmamask = 0xffffffffUL; + +struct platform_device s5p_device_ehci = { + .name = "s5p-ehci", + .id = -1, + .num_resources = ARRAY_SIZE(s5p_ehci_resource), + .resource = s5p_ehci_resource, + .dev = { + .dma_mask = &s5p_device_ehci_dmamask, + .coherent_dma_mask = 0xffffffffUL + } +}; + +void __init s5p_ehci_set_platdata(struct s5p_ehci_platdata *pd) +{ + struct s5p_ehci_platdata *npd; + + npd = s3c_set_platdata(pd, sizeof(struct s5p_ehci_platdata), + &s5p_device_ehci); + + if (!npd->phy_init) + npd->phy_init = s5p_usb_phy_init; + if (!npd->phy_exit) + npd->phy_exit = s5p_usb_phy_exit; +} diff --git a/arch/arm/plat-s5p/include/plat/ehci.h b/arch/arm/plat-s5p/include/plat/ehci.h new file mode 100644 index 00000000000..6ae6810c756 --- /dev/null +++ b/arch/arm/plat-s5p/include/plat/ehci.h @@ -0,0 +1,21 @@ +/* + * Copyright (C) 2011 Samsung Electronics Co.Ltd + * Author: Joonyoung Shim <jy0922.shim@samsung.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#ifndef __PLAT_S5P_EHCI_H +#define __PLAT_S5P_EHCI_H + +struct s5p_ehci_platdata { + int (*phy_init)(struct platform_device *pdev, int type); + int (*phy_exit)(struct platform_device *pdev, int type); +}; + +extern void s5p_ehci_set_platdata(struct s5p_ehci_platdata *pd); + +#endif /* __PLAT_S5P_EHCI_H */ diff --git a/arch/arm/plat-s5p/include/plat/map-s5p.h b/arch/arm/plat-s5p/include/plat/map-s5p.h index d973d39666a..a6c3d327ce7 100644 --- a/arch/arm/plat-s5p/include/plat/map-s5p.h +++ b/arch/arm/plat-s5p/include/plat/map-s5p.h @@ -39,7 +39,7 @@ #define S5P_VA_TWD S5P_VA_COREPERI(0x600) #define S5P_VA_GIC_DIST S5P_VA_COREPERI(0x1000) -#define S3C_VA_USB_HSPHY S3C_ADDR(0x02900000) +#define S5P_VA_USB_HSPHY S3C_ADDR(0x02900000) #define VA_VIC(x) (S3C_VA_IRQ + ((x) * 0x10000)) #define VA_VIC0 VA_VIC(0) diff --git a/arch/arm/plat-s5p/include/plat/usb-phy.h b/arch/arm/plat-s5p/include/plat/usb-phy.h new file mode 100644 index 00000000000..6dd6bcfca3c --- /dev/null +++ b/arch/arm/plat-s5p/include/plat/usb-phy.h @@ -0,0 +1,22 @@ +/* + * Copyright (C) 2011 Samsung Electronics Co.Ltd + * Author: Joonyoung Shim <jy0922.shim@samsung.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#ifndef __PLAT_S5P_USB_PHY_H +#define __PLAT_S5P_USB_PHY_H + +enum s5p_usb_phy_type { + S5P_USB_PHY_DEVICE, + S5P_USB_PHY_HOST, +}; + +extern int s5p_usb_phy_init(struct platform_device *pdev, int type); +extern int s5p_usb_phy_exit(struct platform_device *pdev, int type); + +#endif /* __PLAT_S5P_REGS_USB_PHY_H */ diff --git a/arch/arm/plat-samsung/include/plat/devs.h b/arch/arm/plat-samsung/include/plat/devs.h index f0da6b70fba..39818d8da42 100644 --- a/arch/arm/plat-samsung/include/plat/devs.h +++ b/arch/arm/plat-samsung/include/plat/devs.h @@ -88,6 +88,7 @@ extern struct platform_device s3c64xx_device_onenand1; extern struct platform_device s5p_device_onenand; extern struct platform_device s3c_device_usbgadget; +extern struct platform_device s3c_device_usb_hsudc; extern struct platform_device s3c_device_usb_hsotg; extern struct platform_device s5pv210_device_ac97; @@ -142,6 +143,8 @@ extern struct platform_device s5p_device_fimc3; extern struct platform_device s5p_device_mipi_csis0; extern struct platform_device s5p_device_mipi_csis1; +extern struct platform_device s5p_device_ehci; + extern struct platform_device exynos4_device_sysmmu; /* s3c2440 specific devices */ diff --git a/arch/cris/arch-v32/mach-fs/Makefile b/arch/cris/arch-v32/mach-fs/Makefile index 4ff407a1b93..41fa6a6893a 100644 --- a/arch/cris/arch-v32/mach-fs/Makefile +++ b/arch/cris/arch-v32/mach-fs/Makefile @@ -4,7 +4,7 @@ # obj-y := dma.o pinmux.o io.o arbiter.o -bj-$(CONFIG_ETRAX_VCS_SIM) += vcs_hook.o +obj-$(CONFIG_ETRAX_VCS_SIM) += vcs_hook.o obj-$(CONFIG_CPU_FREQ) += cpufreq.o clean: diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c index c04dd576f33..80241fe03f5 100644 --- a/arch/ia64/hp/common/sba_iommu.c +++ b/arch/ia64/hp/common/sba_iommu.c @@ -1064,7 +1064,7 @@ static void sba_unmap_page(struct device *dev, dma_addr_t iova, size_t size, /* ** Address does not fall w/in IOVA, must be bypassing */ - DBG_BYPASS("sba_unmap_single_atttrs() bypass addr: 0x%lx\n", + DBG_BYPASS("sba_unmap_single_attrs() bypass addr: 0x%lx\n", iova); #ifdef ENABLE_MARK_CLEAN diff --git a/arch/ia64/kvm/vti.h b/arch/ia64/kvm/vti.h index f6c5617e16a..b214b5b0432 100644 --- a/arch/ia64/kvm/vti.h +++ b/arch/ia64/kvm/vti.h @@ -83,13 +83,13 @@ union vac { unsigned long value; struct { - int a_int:1; - int a_from_int_cr:1; - int a_to_int_cr:1; - int a_from_psr:1; - int a_from_cpuid:1; - int a_cover:1; - int a_bsw:1; + unsigned int a_int:1; + unsigned int a_from_int_cr:1; + unsigned int a_to_int_cr:1; + unsigned int a_from_psr:1; + unsigned int a_from_cpuid:1; + unsigned int a_cover:1; + unsigned int a_bsw:1; long reserved:57; }; }; @@ -97,12 +97,12 @@ union vac { union vdc { unsigned long value; struct { - int d_vmsw:1; - int d_extint:1; - int d_ibr_dbr:1; - int d_pmc:1; - int d_to_pmd:1; - int d_itm:1; + unsigned int d_vmsw:1; + unsigned int d_extint:1; + unsigned int d_ibr_dbr:1; + unsigned int d_pmc:1; + unsigned int d_to_pmd:1; + unsigned int d_itm:1; long reserved:58; }; }; diff --git a/arch/m68k/include/asm/MC68EZ328.h b/arch/m68k/include/asm/MC68EZ328.h index 69b7f9139e5..d1bde58ab0d 100644 --- a/arch/m68k/include/asm/MC68EZ328.h +++ b/arch/m68k/include/asm/MC68EZ328.h @@ -1047,7 +1047,7 @@ typedef volatile struct { #define WATCHDOG_EN 0x0001 /* Watchdog Enabled */ #define WATCHDOG_ISEL 0x0002 /* Select the watchdog interrupt */ -#define WATCHDOG_INTF 0x0080 /* Watchdog interrupt occcured */ +#define WATCHDOG_INTF 0x0080 /* Watchdog interrupt occurred */ #define WATCHDOG_CNT_MASK 0x0300 /* Watchdog Counter */ #define WATCHDOG_CNT_SHIFT 8 diff --git a/arch/m68k/include/asm/MC68VZ328.h b/arch/m68k/include/asm/MC68VZ328.h index 2b9bf626a0a..6bd1bf1f85e 100644 --- a/arch/m68k/include/asm/MC68VZ328.h +++ b/arch/m68k/include/asm/MC68VZ328.h @@ -1143,7 +1143,7 @@ typedef struct { #define WATCHDOG_EN 0x0001 /* Watchdog Enabled */ #define WATCHDOG_ISEL 0x0002 /* Select the watchdog interrupt */ -#define WATCHDOG_INTF 0x0080 /* Watchdog interrupt occcured */ +#define WATCHDOG_INTF 0x0080 /* Watchdog interrupt occurred */ #define WATCHDOG_CNT_MASK 0x0300 /* Watchdog Counter */ #define WATCHDOG_CNT_SHIFT 8 diff --git a/arch/mips/ath79/Kconfig b/arch/mips/ath79/Kconfig index b05828260f7..47707410582 100644 --- a/arch/mips/ath79/Kconfig +++ b/arch/mips/ath79/Kconfig @@ -26,12 +26,17 @@ config ATH79_MACH_PB44 endmenu config SOC_AR71XX + select USB_ARCH_HAS_EHCI + select USB_ARCH_HAS_OHCI def_bool n config SOC_AR724X + select USB_ARCH_HAS_EHCI + select USB_ARCH_HAS_OHCI def_bool n config SOC_AR913X + select USB_ARCH_HAS_EHCI def_bool n config ATH79_DEV_AR913X_WMAC diff --git a/arch/mips/fw/arc/cmdline.c b/arch/mips/fw/arc/cmdline.c index 5c8603c85f2..9fdf07e50f1 100644 --- a/arch/mips/fw/arc/cmdline.c +++ b/arch/mips/fw/arc/cmdline.c @@ -5,7 +5,7 @@ * * cmdline.c: Kernel command line creation using ARCS argc/argv. * - * Copyright (C) 1996 David S. Miller (dm@engr.sgi.com) + * Copyright (C) 1996 David S. Miller (davem@davemloft.net) */ #include <linux/init.h> #include <linux/kernel.h> diff --git a/arch/mips/fw/arc/env.c b/arch/mips/fw/arc/env.c index 6f5dd42b96e..1118a26b32e 100644 --- a/arch/mips/fw/arc/env.c +++ b/arch/mips/fw/arc/env.c @@ -5,7 +5,7 @@ * * env.c: ARCS environment variable routines. * - * Copyright (C) 1996 David S. Miller (dm@engr.sgi.com) + * Copyright (C) 1996 David S. Miller (davem@davemloft.net) */ #include <linux/init.h> #include <linux/kernel.h> diff --git a/arch/mips/fw/arc/identify.c b/arch/mips/fw/arc/identify.c index 0ce9acf10c3..788060a53dc 100644 --- a/arch/mips/fw/arc/identify.c +++ b/arch/mips/fw/arc/identify.c @@ -9,7 +9,7 @@ * * This code is based on arch/mips/sgi/kernel/system.c, which is * - * Copyright (C) 1996 David S. Miller (dm@engr.sgi.com) + * Copyright (C) 1996 David S. Miller (davem@davemloft.net) */ #include <linux/init.h> #include <linux/kernel.h> diff --git a/arch/mips/fw/arc/init.c b/arch/mips/fw/arc/init.c index 3ad8788b6ea..629b24db0d3 100644 --- a/arch/mips/fw/arc/init.c +++ b/arch/mips/fw/arc/init.c @@ -5,7 +5,7 @@ * * PROM library initialisation code. * - * Copyright (C) 1996 David S. Miller (dm@engr.sgi.com) + * Copyright (C) 1996 David S. Miller (davem@davemloft.net) */ #include <linux/init.h> #include <linux/kernel.h> diff --git a/arch/mips/fw/arc/misc.c b/arch/mips/fw/arc/misc.c index e527c5fd5a3..29627fbae7a 100644 --- a/arch/mips/fw/arc/misc.c +++ b/arch/mips/fw/arc/misc.c @@ -5,7 +5,7 @@ * * Miscellaneous ARCS PROM routines. * - * Copyright (C) 1996 David S. Miller (dm@engr.sgi.com) + * Copyright (C) 1996 David S. Miller (davem@davemloft.net) * Copyright (C) 1999 Ralf Baechle (ralf@gnu.org) * Copyright (C) 1999 Silicon Graphics, Inc. */ diff --git a/arch/mips/fw/arc/salone.c b/arch/mips/fw/arc/salone.c index e6afb64723d..9b568950d1f 100644 --- a/arch/mips/fw/arc/salone.c +++ b/arch/mips/fw/arc/salone.c @@ -2,7 +2,7 @@ * Routines to load into memory and execute stand-along program images using * ARCS PROM firmware. * - * Copyright (C) 1996 David S. Miller (dm@engr.sgi.com) + * Copyright (C) 1996 David S. Miller (davem@davemloft.net) */ #include <linux/init.h> #include <asm/sgialib.h> diff --git a/arch/mips/fw/arc/time.c b/arch/mips/fw/arc/time.c index 42138c837d4..190cdb50b89 100644 --- a/arch/mips/fw/arc/time.c +++ b/arch/mips/fw/arc/time.c @@ -5,7 +5,7 @@ * * Extracting time information from ARCS prom. * - * Copyright (C) 1996 David S. Miller (dm@engr.sgi.com) + * Copyright (C) 1996 David S. Miller (davem@davemloft.net) */ #include <linux/init.h> diff --git a/arch/mips/fw/arc/tree.c b/arch/mips/fw/arc/tree.c index d68e5a59c1f..924a37dc256 100644 --- a/arch/mips/fw/arc/tree.c +++ b/arch/mips/fw/arc/tree.c @@ -5,7 +5,7 @@ * * PROM component device tree code. * - * Copyright (C) 1996 David S. Miller (dm@engr.sgi.com) + * Copyright (C) 1996 David S. Miller (davem@davemloft.net) * Copyright (C) 1999 Ralf Baechle (ralf@gnu.org) * Copyright (C) 1999 Silicon Graphics, Inc. */ diff --git a/arch/mips/include/asm/asmmacro-32.h b/arch/mips/include/asm/asmmacro-32.h index 5de3963f511..2413afe21b3 100644 --- a/arch/mips/include/asm/asmmacro-32.h +++ b/arch/mips/include/asm/asmmacro-32.h @@ -1,7 +1,7 @@ /* * asmmacro.h: Assembler macros to make things easier to read. * - * Copyright (C) 1996 David S. Miller (dm@engr.sgi.com) + * Copyright (C) 1996 David S. Miller (davem@davemloft.net) * Copyright (C) 1998, 1999, 2003 Ralf Baechle */ #ifndef _ASM_ASMMACRO_32_H diff --git a/arch/mips/include/asm/asmmacro-64.h b/arch/mips/include/asm/asmmacro-64.h index 225feefcb25..08a527dfe4a 100644 --- a/arch/mips/include/asm/asmmacro-64.h +++ b/arch/mips/include/asm/asmmacro-64.h @@ -1,7 +1,7 @@ /* * asmmacro.h: Assembler macros to make things easier to read. * - * Copyright (C) 1996 David S. Miller (dm@engr.sgi.com) + * Copyright (C) 1996 David S. Miller (davem@davemloft.net) * Copyright (C) 1998, 1999 Ralf Baechle * Copyright (C) 1999 Silicon Graphics, Inc. */ diff --git a/arch/mips/include/asm/cpu.h b/arch/mips/include/asm/cpu.h index 34c0d3cb116..5f95a4bfc73 100644 --- a/arch/mips/include/asm/cpu.h +++ b/arch/mips/include/asm/cpu.h @@ -2,7 +2,7 @@ * cpu.h: Values of the PRId register used to match up * various MIPS cpu types. * - * Copyright (C) 1996 David S. Miller (dm@engr.sgi.com) + * Copyright (C) 1996 David S. Miller (davem@davemloft.net) * Copyright (C) 2004 Maciej W. Rozycki */ #ifndef _ASM_CPU_H diff --git a/arch/mips/include/asm/r4kcache.h b/arch/mips/include/asm/r4kcache.h index 387bf59f1e3..54ea47da59a 100644 --- a/arch/mips/include/asm/r4kcache.h +++ b/arch/mips/include/asm/r4kcache.h @@ -5,7 +5,7 @@ * * Inline assembly cache operations. * - * Copyright (C) 1996 David S. Miller (dm@engr.sgi.com) + * Copyright (C) 1996 David S. Miller (davem@davemloft.net) * Copyright (C) 1997 - 2002 Ralf Baechle (ralf@gnu.org) * Copyright (C) 2004 Ralf Baechle (ralf@linux-mips.org) */ diff --git a/arch/mips/include/asm/sgialib.h b/arch/mips/include/asm/sgialib.h index 2a2f1bddc27..f5811576945 100644 --- a/arch/mips/include/asm/sgialib.h +++ b/arch/mips/include/asm/sgialib.h @@ -5,7 +5,7 @@ * * SGI ARCS firmware interface library for the Linux kernel. * - * Copyright (C) 1996 David S. Miller (dm@engr.sgi.com) + * Copyright (C) 1996 David S. Miller (davem@davemloft.net) * Copyright (C) 2001, 2002 Ralf Baechle (ralf@gnu.org) */ #ifndef _ASM_SGIALIB_H diff --git a/arch/mips/include/asm/sgiarcs.h b/arch/mips/include/asm/sgiarcs.h index 721327f8860..14934295143 100644 --- a/arch/mips/include/asm/sgiarcs.h +++ b/arch/mips/include/asm/sgiarcs.h @@ -5,7 +5,7 @@ * * ARC firmware interface defines. * - * Copyright (C) 1996 David S. Miller (dm@engr.sgi.com) + * Copyright (C) 1996 David S. Miller (davem@davemloft.net) * Copyright (C) 1999, 2001 Ralf Baechle (ralf@gnu.org) * Copyright (C) 1999 Silicon Graphics, Inc. */ diff --git a/arch/mips/kernel/octeon_switch.S b/arch/mips/kernel/octeon_switch.S index dd18b26a358..ce89c806170 100644 --- a/arch/mips/kernel/octeon_switch.S +++ b/arch/mips/kernel/octeon_switch.S @@ -4,7 +4,7 @@ * for more details. * * Copyright (C) 1994, 1995, 1996, 1998, 1999, 2002, 2003 Ralf Baechle - * Copyright (C) 1996 David S. Miller (dm@engr.sgi.com) + * Copyright (C) 1996 David S. Miller (davem@davemloft.net) * Copyright (C) 1994, 1995, 1996, by Andreas Busse * Copyright (C) 1999 Silicon Graphics, Inc. * Copyright (C) 2000 MIPS Technologies, Inc. diff --git a/arch/mips/kernel/r2300_fpu.S b/arch/mips/kernel/r2300_fpu.S index ac68e68339d..61c8a0f2a60 100644 --- a/arch/mips/kernel/r2300_fpu.S +++ b/arch/mips/kernel/r2300_fpu.S @@ -6,7 +6,7 @@ * Copyright (C) 1996, 1998 by Ralf Baechle * * Multi-arch abstraction and asm macros for easier reading: - * Copyright (C) 1996 David S. Miller (dm@engr.sgi.com) + * Copyright (C) 1996 David S. Miller (davem@davemloft.net) * * Further modifications to make this work: * Copyright (c) 1998 Harald Koerfgen diff --git a/arch/mips/kernel/r2300_switch.S b/arch/mips/kernel/r2300_switch.S index 698414b7a25..293898391e6 100644 --- a/arch/mips/kernel/r2300_switch.S +++ b/arch/mips/kernel/r2300_switch.S @@ -5,7 +5,7 @@ * Copyright (C) 1994, 1995, 1996 by Andreas Busse * * Multi-cpu abstraction and macros for easier reading: - * Copyright (C) 1996 David S. Miller (dm@engr.sgi.com) + * Copyright (C) 1996 David S. Miller (davem@davemloft.net) * * Further modifications to make this work: * Copyright (c) 1998-2000 Harald Koerfgen diff --git a/arch/mips/kernel/r4k_fpu.S b/arch/mips/kernel/r4k_fpu.S index dbd42adc52e..55ffe149dae 100644 --- a/arch/mips/kernel/r4k_fpu.S +++ b/arch/mips/kernel/r4k_fpu.S @@ -6,7 +6,7 @@ * Copyright (C) 1996, 98, 99, 2000, 01 Ralf Baechle * * Multi-arch abstraction and asm macros for easier reading: - * Copyright (C) 1996 David S. Miller (dm@engr.sgi.com) + * Copyright (C) 1996 David S. Miller (davem@davemloft.net) * * Carsten Langgaard, carstenl@mips.com * Copyright (C) 2000 MIPS Technologies, Inc. diff --git a/arch/mips/kernel/r4k_switch.S b/arch/mips/kernel/r4k_switch.S index 8893ee1a236..9414f935446 100644 --- a/arch/mips/kernel/r4k_switch.S +++ b/arch/mips/kernel/r4k_switch.S @@ -4,7 +4,7 @@ * for more details. * * Copyright (C) 1994, 1995, 1996, 1998, 1999, 2002, 2003 Ralf Baechle - * Copyright (C) 1996 David S. Miller (dm@engr.sgi.com) + * Copyright (C) 1996 David S. Miller (davem@davemloft.net) * Copyright (C) 1994, 1995, 1996, by Andreas Busse * Copyright (C) 1999 Silicon Graphics, Inc. * Copyright (C) 2000 MIPS Technologies, Inc. diff --git a/arch/mips/kernel/r6000_fpu.S b/arch/mips/kernel/r6000_fpu.S index 43cda53f5af..da0fbe46d83 100644 --- a/arch/mips/kernel/r6000_fpu.S +++ b/arch/mips/kernel/r6000_fpu.S @@ -8,7 +8,7 @@ * Copyright (C) 1996 by Ralf Baechle * * Multi-arch abstraction and asm macros for easier reading: - * Copyright (C) 1996 David S. Miller (dm@engr.sgi.com) + * Copyright (C) 1996 David S. Miller (davem@davemloft.net) */ #include <asm/asm.h> #include <asm/fpregdef.h> diff --git a/arch/mips/mm/c-r3k.c b/arch/mips/mm/c-r3k.c index 54e5f7b9f44..e6b0efd3f6a 100644 --- a/arch/mips/mm/c-r3k.c +++ b/arch/mips/mm/c-r3k.c @@ -1,7 +1,7 @@ /* * r2300.c: R2000 and R3000 specific mmu/cache code. * - * Copyright (C) 1996 David S. Miller (dm@engr.sgi.com) + * Copyright (C) 1996 David S. Miller (davem@davemloft.net) * * with a lot of changes to make this thing work for R3000s * Tx39XX R4k style caches added. HK diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c index d9bc5d3593b..eeb642e4066 100644 --- a/arch/mips/mm/c-r4k.c +++ b/arch/mips/mm/c-r4k.c @@ -3,7 +3,7 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. * - * Copyright (C) 1996 David S. Miller (dm@engr.sgi.com) + * Copyright (C) 1996 David S. Miller (davem@davemloft.net) * Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002 Ralf Baechle (ralf@gnu.org) * Copyright (C) 1999, 2000 Silicon Graphics, Inc. */ diff --git a/arch/mips/mm/c-tx39.c b/arch/mips/mm/c-tx39.c index 6515b441871..d352fad3e45 100644 --- a/arch/mips/mm/c-tx39.c +++ b/arch/mips/mm/c-tx39.c @@ -1,7 +1,7 @@ /* * r2300.c: R2000 and R3000 specific mmu/cache code. * - * Copyright (C) 1996 David S. Miller (dm@engr.sgi.com) + * Copyright (C) 1996 David S. Miller (davem@davemloft.net) * * with a lot of changes to make this thing work for R3000s * Tx39XX R4k style caches added. HK diff --git a/arch/mips/mm/sc-ip22.c b/arch/mips/mm/sc-ip22.c index 13adb578211..a6bd11fba7b 100644 --- a/arch/mips/mm/sc-ip22.c +++ b/arch/mips/mm/sc-ip22.c @@ -2,7 +2,7 @@ * sc-ip22.c: Indy cache management functions. * * Copyright (C) 1997, 2001 Ralf Baechle (ralf@gnu.org), - * derived from r4xx0.c by David S. Miller (dm@engr.sgi.com). + * derived from r4xx0.c by David S. Miller (davem@davemloft.net). */ #include <linux/init.h> #include <linux/kernel.h> diff --git a/arch/mips/mm/sc-r5k.c b/arch/mips/mm/sc-r5k.c index f330d38e557..ae1e533a096 100644 --- a/arch/mips/mm/sc-r5k.c +++ b/arch/mips/mm/sc-r5k.c @@ -1,6 +1,6 @@ /* * Copyright (C) 1997, 2001 Ralf Baechle (ralf@gnu.org), - * derived from r4xx0.c by David S. Miller (dm@engr.sgi.com). + * derived from r4xx0.c by David S. Miller (davem@davemloft.net). */ #include <linux/init.h> #include <linux/kernel.h> diff --git a/arch/mips/mm/tlb-r3k.c b/arch/mips/mm/tlb-r3k.c index 0f5ab236ab6..40424affef8 100644 --- a/arch/mips/mm/tlb-r3k.c +++ b/arch/mips/mm/tlb-r3k.c @@ -1,7 +1,7 @@ /* * r2300.c: R2000 and R3000 specific mmu/cache code. * - * Copyright (C) 1996 David S. Miller (dm@engr.sgi.com) + * Copyright (C) 1996 David S. Miller (davem@davemloft.net) * * with a lot of changes to make this thing work for R3000s * Tx39XX R4k style caches added. HK diff --git a/arch/mips/mm/tlb-r4k.c b/arch/mips/mm/tlb-r4k.c index c618eed933a..ba40325caea 100644 --- a/arch/mips/mm/tlb-r4k.c +++ b/arch/mips/mm/tlb-r4k.c @@ -3,7 +3,7 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. * - * Copyright (C) 1996 David S. Miller (dm@engr.sgi.com) + * Copyright (C) 1996 David S. Miller (davem@davemloft.net) * Copyright (C) 1997, 1998, 1999, 2000 Ralf Baechle ralf@gnu.org * Carsten Langgaard, carstenl@mips.com * Copyright (C) 2002 MIPS Technologies, Inc. All rights reserved. diff --git a/arch/mips/mm/tlb-r8k.c b/arch/mips/mm/tlb-r8k.c index 2b82f23df1a..3d95f76c106 100644 --- a/arch/mips/mm/tlb-r8k.c +++ b/arch/mips/mm/tlb-r8k.c @@ -3,7 +3,7 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. * - * Copyright (C) 1996 David S. Miller (dm@engr.sgi.com) + * Copyright (C) 1996 David S. Miller (davem@davemloft.net) * Copyright (C) 1997, 1998, 1999, 2000 Ralf Baechle ralf@gnu.org * Carsten Langgaard, carstenl@mips.com * Copyright (C) 2002 MIPS Technologies, Inc. All rights reserved. diff --git a/arch/mips/sgi-ip22/ip22-hpc.c b/arch/mips/sgi-ip22/ip22-hpc.c index 5c00cdd20d8..bb70589b5f7 100644 --- a/arch/mips/sgi-ip22/ip22-hpc.c +++ b/arch/mips/sgi-ip22/ip22-hpc.c @@ -1,7 +1,7 @@ /* * ip22-hpc.c: Routines for generic manipulation of the HPC controllers. * - * Copyright (C) 1996 David S. Miller (dm@engr.sgi.com) + * Copyright (C) 1996 David S. Miller (davem@davemloft.net) * Copyright (C) 1998 Ralf Baechle */ diff --git a/arch/mips/sgi-ip22/ip22-int.c b/arch/mips/sgi-ip22/ip22-int.c index 476423a0129..b4d08e4d2ea 100644 --- a/arch/mips/sgi-ip22/ip22-int.c +++ b/arch/mips/sgi-ip22/ip22-int.c @@ -2,7 +2,7 @@ * ip22-int.c: Routines for generic manipulation of the INT[23] ASIC * found on INDY and Indigo2 workstations. * - * Copyright (C) 1996 David S. Miller (dm@engr.sgi.com) + * Copyright (C) 1996 David S. Miller (davem@davemloft.net) * Copyright (C) 1997, 1998 Ralf Baechle (ralf@gnu.org) * Copyright (C) 1999 Andrew R. Baker (andrewb@uab.edu) * - Indigo2 changes diff --git a/arch/mips/sgi-ip22/ip22-mc.c b/arch/mips/sgi-ip22/ip22-mc.c index 5268ac187bb..d22262ee685 100644 --- a/arch/mips/sgi-ip22/ip22-mc.c +++ b/arch/mips/sgi-ip22/ip22-mc.c @@ -1,7 +1,7 @@ /* * ip22-mc.c: Routines for manipulating SGI Memory Controller. * - * Copyright (C) 1996 David S. Miller (dm@engr.sgi.com) + * Copyright (C) 1996 David S. Miller (davem@davemloft.net) * Copyright (C) 1999 Andrew R. Baker (andrewb@uab.edu) - Indigo2 changes * Copyright (C) 2003 Ladislav Michl (ladis@linux-mips.org) * Copyright (C) 2004 Peter Fuerst (pf@net.alphadv.de) - IP28 diff --git a/arch/mips/sgi-ip22/ip22-setup.c b/arch/mips/sgi-ip22/ip22-setup.c index 5deeb68b6c9..5e662134947 100644 --- a/arch/mips/sgi-ip22/ip22-setup.c +++ b/arch/mips/sgi-ip22/ip22-setup.c @@ -1,7 +1,7 @@ /* * ip22-setup.c: SGI specific setup, including init of the feature struct. * - * Copyright (C) 1996 David S. Miller (dm@engr.sgi.com) + * Copyright (C) 1996 David S. Miller (davem@davemloft.net) * Copyright (C) 1997, 1998 Ralf Baechle (ralf@gnu.org) */ #include <linux/init.h> diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index d18328b3f93..da601dd34c0 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -3,6 +3,7 @@ #include <linux/mm.h> #include <linux/uaccess.h> +#include <asm/tlbflush.h> /* The usual comment is "Caches aren't brain-dead on the <architecture>". * Unfortunately, that doesn't apply to PA-RISC. */ @@ -112,8 +113,10 @@ void flush_dcache_page_asm(unsigned long phys_addr, unsigned long vaddr); static inline void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr) { - if (PageAnon(page)) + if (PageAnon(page)) { + flush_tlb_page(vma, vmaddr); flush_dcache_page_asm(page_to_phys(page), vmaddr); + } } #ifdef CONFIG_DEBUG_RODATA diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 5d7b8ce9fdf..22dadeb5869 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -177,7 +177,10 @@ struct vm_area_struct; #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED) #define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY) -#define _PAGE_KERNEL (_PAGE_PRESENT | _PAGE_EXEC | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED) +#define _PAGE_KERNEL_RO (_PAGE_PRESENT | _PAGE_READ | _PAGE_DIRTY | _PAGE_ACCESSED) +#define _PAGE_KERNEL_EXEC (_PAGE_KERNEL_RO | _PAGE_EXEC) +#define _PAGE_KERNEL_RWX (_PAGE_KERNEL_EXEC | _PAGE_WRITE) +#define _PAGE_KERNEL (_PAGE_KERNEL_RO | _PAGE_WRITE) /* The pgd/pmd contains a ptr (in phys addr space); since all pgds/pmds * are page-aligned, we don't care about the PAGE_OFFSET bits, except @@ -208,7 +211,9 @@ struct vm_area_struct; #define PAGE_COPY PAGE_EXECREAD #define PAGE_RWX __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_WRITE | _PAGE_EXEC |_PAGE_ACCESSED) #define PAGE_KERNEL __pgprot(_PAGE_KERNEL) -#define PAGE_KERNEL_RO __pgprot(_PAGE_KERNEL & ~_PAGE_WRITE) +#define PAGE_KERNEL_EXEC __pgprot(_PAGE_KERNEL_EXEC) +#define PAGE_KERNEL_RWX __pgprot(_PAGE_KERNEL_RWX) +#define PAGE_KERNEL_RO __pgprot(_PAGE_KERNEL_RO) #define PAGE_KERNEL_UNC __pgprot(_PAGE_KERNEL | _PAGE_NO_CACHE) #define PAGE_GATEWAY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_GATEWAY| _PAGE_READ) diff --git a/arch/parisc/include/asm/unistd.h b/arch/parisc/include/asm/unistd.h index 3eb82c2a5ec..9cbc2c3bf63 100644 --- a/arch/parisc/include/asm/unistd.h +++ b/arch/parisc/include/asm/unistd.h @@ -814,8 +814,14 @@ #define __NR_recvmmsg (__NR_Linux + 319) #define __NR_accept4 (__NR_Linux + 320) #define __NR_prlimit64 (__NR_Linux + 321) - -#define __NR_Linux_syscalls (__NR_prlimit64 + 1) +#define __NR_fanotify_init (__NR_Linux + 322) +#define __NR_fanotify_mark (__NR_Linux + 323) +#define __NR_clock_adjtime (__NR_Linux + 324) +#define __NR_name_to_handle_at (__NR_Linux + 325) +#define __NR_open_by_handle_at (__NR_Linux + 326) +#define __NR_syncfs (__NR_Linux + 327) + +#define __NR_Linux_syscalls (__NR_syncfs + 1) #define __IGNORE_select /* newselect */ diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index 3f11331c277..83335f3da5f 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -304,10 +304,20 @@ void flush_dcache_page(struct page *page) offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT; addr = mpnt->vm_start + offset; + /* The TLB is the engine of coherence on parisc: The + * CPU is entitled to speculate any page with a TLB + * mapping, so here we kill the mapping then flush the + * page along a special flush only alias mapping. + * This guarantees that the page is no-longer in the + * cache for any process and nor may it be + * speculatively read in (until the user or kernel + * specifically accesses it, of course) */ + + flush_tlb_page(mpnt, addr); if (old_addr == 0 || (old_addr & (SHMLBA - 1)) != (addr & (SHMLBA - 1))) { __flush_cache_page(mpnt, addr, page_to_phys(page)); if (old_addr) - printk(KERN_ERR "INEQUIVALENT ALIASES 0x%lx and 0x%lx in file %s\n", old_addr, addr, mpnt->vm_file ? mpnt->vm_file->f_path.dentry->d_name.name : "(null)"); + printk(KERN_ERR "INEQUIVALENT ALIASES 0x%lx and 0x%lx in file %s\n", old_addr, addr, mpnt->vm_file ? (char *)mpnt->vm_file->f_path.dentry->d_name.name : "(null)"); old_addr = addr; } } @@ -499,6 +509,7 @@ flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, unsigned long { BUG_ON(!vma->vm_mm->context); + flush_tlb_page(vma, vmaddr); __flush_cache_page(vma, vmaddr, page_to_phys(pfn_to_page(pfn))); } diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S index ead8d2a1034..6f059443914 100644 --- a/arch/parisc/kernel/entry.S +++ b/arch/parisc/kernel/entry.S @@ -692,6 +692,9 @@ ENTRY(fault_vector_11) END(fault_vector_11) #endif + /* Fault vector is separately protected and *must* be on its own page */ + .align PAGE_SIZE +ENTRY(end_fault_vector) .import handle_interruption,code .import do_cpu_irq_mask,code diff --git a/arch/parisc/kernel/head.S b/arch/parisc/kernel/head.S index 145c5e4caaa..37aabd772fb 100644 --- a/arch/parisc/kernel/head.S +++ b/arch/parisc/kernel/head.S @@ -106,8 +106,9 @@ $bss_loop: #endif - /* Now initialize the PTEs themselves */ - ldo 0+_PAGE_KERNEL(%r0),%r3 /* Hardwired 0 phys addr start */ + /* Now initialize the PTEs themselves. We use RWX for + * everything ... it will get remapped correctly later */ + ldo 0+_PAGE_KERNEL_RWX(%r0),%r3 /* Hardwired 0 phys addr start */ ldi (1<<(KERNEL_INITIAL_ORDER-PAGE_SHIFT)),%r11 /* PFN count */ load32 PA(pg0),%r1 diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c index 6e81bb596e5..cedbbb8b18d 100644 --- a/arch/parisc/kernel/module.c +++ b/arch/parisc/kernel/module.c @@ -61,8 +61,10 @@ #include <linux/string.h> #include <linux/kernel.h> #include <linux/bug.h> +#include <linux/mm.h> #include <linux/slab.h> +#include <asm/pgtable.h> #include <asm/unwind.h> #if 0 @@ -214,7 +216,13 @@ void *module_alloc(unsigned long size) { if (size == 0) return NULL; - return vmalloc(size); + /* using RWX means less protection for modules, but it's + * easier than trying to map the text, data, init_text and + * init_data correctly */ + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + GFP_KERNEL | __GFP_HIGHMEM, + PAGE_KERNEL_RWX, -1, + __builtin_return_address(0)); } #ifndef CONFIG_64BIT diff --git a/arch/parisc/kernel/pacache.S b/arch/parisc/kernel/pacache.S index a85823668cb..93ff3d90edd 100644 --- a/arch/parisc/kernel/pacache.S +++ b/arch/parisc/kernel/pacache.S @@ -817,10 +817,7 @@ ENTRY(purge_kernel_dcache_page) .procend ENDPROC(purge_kernel_dcache_page) - - .export flush_user_dcache_range_asm - -flush_user_dcache_range_asm: +ENTRY(flush_user_dcache_range_asm) .proc .callinfo NO_CALLS .entry @@ -839,6 +836,7 @@ flush_user_dcache_range_asm: .exit .procend +ENDPROC(flush_user_dcache_range_asm) ENTRY(flush_kernel_dcache_range_asm) .proc diff --git a/arch/parisc/kernel/sys_parisc32.c b/arch/parisc/kernel/sys_parisc32.c index 88a0ad14a9c..dc9a6246232 100644 --- a/arch/parisc/kernel/sys_parisc32.c +++ b/arch/parisc/kernel/sys_parisc32.c @@ -228,3 +228,11 @@ asmlinkage long compat_sys_fallocate(int fd, int mode, u32 offhi, u32 offlo, return sys_fallocate(fd, mode, ((loff_t)offhi << 32) | offlo, ((loff_t)lenhi << 32) | lenlo); } + +asmlinkage long compat_sys_fanotify_mark(int fan_fd, int flags, u32 mask_hi, + u32 mask_lo, int fd, + const char __user *pathname) +{ + return sys_fanotify_mark(fan_fd, flags, ((u64)mask_hi << 32) | mask_lo, + fd, pathname); +} diff --git a/arch/parisc/kernel/syscall_table.S b/arch/parisc/kernel/syscall_table.S index 4be85ee10b8..a5b02ce4d41 100644 --- a/arch/parisc/kernel/syscall_table.S +++ b/arch/parisc/kernel/syscall_table.S @@ -420,6 +420,12 @@ ENTRY_COMP(recvmmsg) ENTRY_SAME(accept4) /* 320 */ ENTRY_SAME(prlimit64) + ENTRY_SAME(fanotify_init) + ENTRY_COMP(fanotify_mark) + ENTRY_COMP(clock_adjtime) + ENTRY_SAME(name_to_handle_at) /* 325 */ + ENTRY_COMP(open_by_handle_at) + ENTRY_SAME(syncfs) /* Nothing yet */ diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S index 2d9a5c7c76f..e1a55849bfa 100644 --- a/arch/parisc/kernel/vmlinux.lds.S +++ b/arch/parisc/kernel/vmlinux.lds.S @@ -137,6 +137,7 @@ SECTIONS . = ALIGN(16384); __init_begin = .; INIT_TEXT_SECTION(16384) + . = ALIGN(PAGE_SIZE); INIT_DATA_SECTION(16) /* we have to discard exit text and such at runtime, not link time */ .exit.text : diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index b1d126258de..5fa1e273006 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -371,24 +371,158 @@ static void __init setup_bootmem(void) request_resource(&sysram_resources[0], &pdcdata_resource); } +static void __init map_pages(unsigned long start_vaddr, + unsigned long start_paddr, unsigned long size, + pgprot_t pgprot, int force) +{ + pgd_t *pg_dir; + pmd_t *pmd; + pte_t *pg_table; + unsigned long end_paddr; + unsigned long start_pmd; + unsigned long start_pte; + unsigned long tmp1; + unsigned long tmp2; + unsigned long address; + unsigned long vaddr; + unsigned long ro_start; + unsigned long ro_end; + unsigned long fv_addr; + unsigned long gw_addr; + extern const unsigned long fault_vector_20; + extern void * const linux_gateway_page; + + ro_start = __pa((unsigned long)_text); + ro_end = __pa((unsigned long)&data_start); + fv_addr = __pa((unsigned long)&fault_vector_20) & PAGE_MASK; + gw_addr = __pa((unsigned long)&linux_gateway_page) & PAGE_MASK; + + end_paddr = start_paddr + size; + + pg_dir = pgd_offset_k(start_vaddr); + +#if PTRS_PER_PMD == 1 + start_pmd = 0; +#else + start_pmd = ((start_vaddr >> PMD_SHIFT) & (PTRS_PER_PMD - 1)); +#endif + start_pte = ((start_vaddr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)); + + address = start_paddr; + vaddr = start_vaddr; + while (address < end_paddr) { +#if PTRS_PER_PMD == 1 + pmd = (pmd_t *)__pa(pg_dir); +#else + pmd = (pmd_t *)pgd_address(*pg_dir); + + /* + * pmd is physical at this point + */ + + if (!pmd) { + pmd = (pmd_t *) alloc_bootmem_low_pages_node(NODE_DATA(0), PAGE_SIZE << PMD_ORDER); + pmd = (pmd_t *) __pa(pmd); + } + + pgd_populate(NULL, pg_dir, __va(pmd)); +#endif + pg_dir++; + + /* now change pmd to kernel virtual addresses */ + + pmd = (pmd_t *)__va(pmd) + start_pmd; + for (tmp1 = start_pmd; tmp1 < PTRS_PER_PMD; tmp1++, pmd++) { + + /* + * pg_table is physical at this point + */ + + pg_table = (pte_t *)pmd_address(*pmd); + if (!pg_table) { + pg_table = (pte_t *) + alloc_bootmem_low_pages_node(NODE_DATA(0), PAGE_SIZE); + pg_table = (pte_t *) __pa(pg_table); + } + + pmd_populate_kernel(NULL, pmd, __va(pg_table)); + + /* now change pg_table to kernel virtual addresses */ + + pg_table = (pte_t *) __va(pg_table) + start_pte; + for (tmp2 = start_pte; tmp2 < PTRS_PER_PTE; tmp2++, pg_table++) { + pte_t pte; + + /* + * Map the fault vector writable so we can + * write the HPMC checksum. + */ + if (force) + pte = __mk_pte(address, pgprot); + else if (core_kernel_text(vaddr) && + address != fv_addr) + pte = __mk_pte(address, PAGE_KERNEL_EXEC); + else +#if defined(CONFIG_PARISC_PAGE_SIZE_4KB) + if (address >= ro_start && address < ro_end + && address != fv_addr + && address != gw_addr) + pte = __mk_pte(address, PAGE_KERNEL_RO); + else +#endif + pte = __mk_pte(address, pgprot); + + if (address >= end_paddr) { + if (force) + break; + else + pte_val(pte) = 0; + } + + set_pte(pg_table, pte); + + address += PAGE_SIZE; + vaddr += PAGE_SIZE; + } + start_pte = 0; + + if (address >= end_paddr) + break; + } + start_pmd = 0; + } +} + void free_initmem(void) { unsigned long addr; unsigned long init_begin = (unsigned long)__init_begin; unsigned long init_end = (unsigned long)__init_end; -#ifdef CONFIG_DEBUG_KERNEL + /* The init text pages are marked R-X. We have to + * flush the icache and mark them RW- + * + * This is tricky, because map_pages is in the init section. + * Do a dummy remap of the data section first (the data + * section is already PAGE_KERNEL) to pull in the TLB entries + * for map_kernel */ + map_pages(init_begin, __pa(init_begin), init_end - init_begin, + PAGE_KERNEL_RWX, 1); + /* now remap at PAGE_KERNEL since the TLB is pre-primed to execute + * map_pages */ + map_pages(init_begin, __pa(init_begin), init_end - init_begin, + PAGE_KERNEL, 1); + + /* force the kernel to see the new TLB entries */ + __flush_tlb_range(0, init_begin, init_end); /* Attempt to catch anyone trying to execute code here * by filling the page with BRK insns. */ memset((void *)init_begin, 0x00, init_end - init_begin); + /* finally dump all the instructions which were cached, since the + * pages are no-longer executable */ flush_icache_range(init_begin, init_end); -#endif - /* align __init_begin and __init_end to page size, - ignoring linker script where we might have tried to save RAM */ - init_begin = PAGE_ALIGN(init_begin); - init_end = PAGE_ALIGN(init_end); for (addr = init_begin; addr < init_end; addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); init_page_count(virt_to_page(addr)); @@ -618,114 +752,6 @@ void show_mem(unsigned int filter) #endif } - -static void __init map_pages(unsigned long start_vaddr, unsigned long start_paddr, unsigned long size, pgprot_t pgprot) -{ - pgd_t *pg_dir; - pmd_t *pmd; - pte_t *pg_table; - unsigned long end_paddr; - unsigned long start_pmd; - unsigned long start_pte; - unsigned long tmp1; - unsigned long tmp2; - unsigned long address; - unsigned long ro_start; - unsigned long ro_end; - unsigned long fv_addr; - unsigned long gw_addr; - extern const unsigned long fault_vector_20; - extern void * const linux_gateway_page; - - ro_start = __pa((unsigned long)_text); - ro_end = __pa((unsigned long)&data_start); - fv_addr = __pa((unsigned long)&fault_vector_20) & PAGE_MASK; - gw_addr = __pa((unsigned long)&linux_gateway_page) & PAGE_MASK; - - end_paddr = start_paddr + size; - - pg_dir = pgd_offset_k(start_vaddr); - -#if PTRS_PER_PMD == 1 - start_pmd = 0; -#else - start_pmd = ((start_vaddr >> PMD_SHIFT) & (PTRS_PER_PMD - 1)); -#endif - start_pte = ((start_vaddr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)); - - address = start_paddr; - while (address < end_paddr) { -#if PTRS_PER_PMD == 1 - pmd = (pmd_t *)__pa(pg_dir); -#else - pmd = (pmd_t *)pgd_address(*pg_dir); - - /* - * pmd is physical at this point - */ - - if (!pmd) { - pmd = (pmd_t *) alloc_bootmem_low_pages_node(NODE_DATA(0),PAGE_SIZE << PMD_ORDER); - pmd = (pmd_t *) __pa(pmd); - } - - pgd_populate(NULL, pg_dir, __va(pmd)); -#endif - pg_dir++; - - /* now change pmd to kernel virtual addresses */ - - pmd = (pmd_t *)__va(pmd) + start_pmd; - for (tmp1 = start_pmd; tmp1 < PTRS_PER_PMD; tmp1++,pmd++) { - - /* - * pg_table is physical at this point - */ - - pg_table = (pte_t *)pmd_address(*pmd); - if (!pg_table) { - pg_table = (pte_t *) - alloc_bootmem_low_pages_node(NODE_DATA(0),PAGE_SIZE); - pg_table = (pte_t *) __pa(pg_table); - } - - pmd_populate_kernel(NULL, pmd, __va(pg_table)); - - /* now change pg_table to kernel virtual addresses */ - - pg_table = (pte_t *) __va(pg_table) + start_pte; - for (tmp2 = start_pte; tmp2 < PTRS_PER_PTE; tmp2++,pg_table++) { - pte_t pte; - - /* - * Map the fault vector writable so we can - * write the HPMC checksum. - */ -#if defined(CONFIG_PARISC_PAGE_SIZE_4KB) - if (address >= ro_start && address < ro_end - && address != fv_addr - && address != gw_addr) - pte = __mk_pte(address, PAGE_KERNEL_RO); - else -#endif - pte = __mk_pte(address, pgprot); - - if (address >= end_paddr) - pte_val(pte) = 0; - - set_pte(pg_table, pte); - - address += PAGE_SIZE; - } - start_pte = 0; - - if (address >= end_paddr) - break; - } - start_pmd = 0; - } -} - /* * pagetable_init() sets up the page tables * @@ -750,14 +776,14 @@ static void __init pagetable_init(void) size = pmem_ranges[range].pages << PAGE_SHIFT; map_pages((unsigned long)__va(start_paddr), start_paddr, - size, PAGE_KERNEL); + size, PAGE_KERNEL, 0); } #ifdef CONFIG_BLK_DEV_INITRD if (initrd_end && initrd_end > mem_limit) { printk(KERN_INFO "initrd: mapping %08lx-%08lx\n", initrd_start, initrd_end); map_pages(initrd_start, __pa(initrd_start), - initrd_end - initrd_start, PAGE_KERNEL); + initrd_end - initrd_start, PAGE_KERNEL, 0); } #endif @@ -782,7 +808,7 @@ static void __init gateway_init(void) */ map_pages(linux_gateway_page_addr, __pa(&linux_gateway_page), - PAGE_SIZE, PAGE_GATEWAY); + PAGE_SIZE, PAGE_GATEWAY, 1); } #ifdef CONFIG_HPUX diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h index 18ea6963ad7..d2ca5ed3877 100644 --- a/arch/powerpc/include/asm/kvm.h +++ b/arch/powerpc/include/asm/kvm.h @@ -45,6 +45,114 @@ struct kvm_regs { __u64 gpr[32]; }; +#define KVM_SREGS_E_IMPL_NONE 0 +#define KVM_SREGS_E_IMPL_FSL 1 + +#define KVM_SREGS_E_FSL_PIDn (1 << 0) /* PID1/PID2 */ + +/* + * Feature bits indicate which sections of the sregs struct are valid, + * both in KVM_GET_SREGS and KVM_SET_SREGS. On KVM_SET_SREGS, registers + * corresponding to unset feature bits will not be modified. This allows + * restoring a checkpoint made without that feature, while keeping the + * default values of the new registers. + * + * KVM_SREGS_E_BASE contains: + * CSRR0/1 (refers to SRR2/3 on 40x) + * ESR + * DEAR + * MCSR + * TSR + * TCR + * DEC + * TB + * VRSAVE (USPRG0) + */ +#define KVM_SREGS_E_BASE (1 << 0) + +/* + * KVM_SREGS_E_ARCH206 contains: + * + * PIR + * MCSRR0/1 + * DECAR + * IVPR + */ +#define KVM_SREGS_E_ARCH206 (1 << 1) + +/* + * Contains EPCR, plus the upper half of 64-bit registers + * that are 32-bit on 32-bit implementations. + */ +#define KVM_SREGS_E_64 (1 << 2) + +#define KVM_SREGS_E_SPRG8 (1 << 3) +#define KVM_SREGS_E_MCIVPR (1 << 4) + +/* + * IVORs are used -- contains IVOR0-15, plus additional IVORs + * in combination with an appropriate feature bit. + */ +#define KVM_SREGS_E_IVOR (1 << 5) + +/* + * Contains MAS0-4, MAS6-7, TLBnCFG, MMUCFG. + * Also TLBnPS if MMUCFG[MAVN] = 1. + */ +#define KVM_SREGS_E_ARCH206_MMU (1 << 6) + +/* DBSR, DBCR, IAC, DAC, DVC */ +#define KVM_SREGS_E_DEBUG (1 << 7) + +/* Enhanced debug -- DSRR0/1, SPRG9 */ +#define KVM_SREGS_E_ED (1 << 8) + +/* Embedded Floating Point (SPE) -- IVOR32-34 if KVM_SREGS_E_IVOR */ +#define KVM_SREGS_E_SPE (1 << 9) + +/* External Proxy (EXP) -- EPR */ +#define KVM_SREGS_EXP (1 << 10) + +/* External PID (E.PD) -- EPSC/EPLC */ +#define KVM_SREGS_E_PD (1 << 11) + +/* Processor Control (E.PC) -- IVOR36-37 if KVM_SREGS_E_IVOR */ +#define KVM_SREGS_E_PC (1 << 12) + +/* Page table (E.PT) -- EPTCFG */ +#define KVM_SREGS_E_PT (1 << 13) + +/* Embedded Performance Monitor (E.PM) -- IVOR35 if KVM_SREGS_E_IVOR */ +#define KVM_SREGS_E_PM (1 << 14) + +/* + * Special updates: + * + * Some registers may change even while a vcpu is not running. + * To avoid losing these changes, by default these registers are + * not updated by KVM_SET_SREGS. To force an update, set the bit + * in u.e.update_special corresponding to the register to be updated. + * + * The update_special field is zero on return from KVM_GET_SREGS. + * + * When restoring a checkpoint, the caller can set update_special + * to 0xffffffff to ensure that everything is restored, even new features + * that the caller doesn't know about. + */ +#define KVM_SREGS_E_UPDATE_MCSR (1 << 0) +#define KVM_SREGS_E_UPDATE_TSR (1 << 1) +#define KVM_SREGS_E_UPDATE_DEC (1 << 2) +#define KVM_SREGS_E_UPDATE_DBSR (1 << 3) + +/* + * In KVM_SET_SREGS, reserved/pad fields must be left untouched from a + * previous KVM_GET_REGS. + * + * Unless otherwise indicated, setting any register with KVM_SET_SREGS + * directly sets its value. It does not trigger any special semantics such + * as write-one-to-clear. Calling KVM_SET_SREGS on an unmodified struct + * just received from KVM_GET_SREGS is always a no-op. + */ struct kvm_sregs { __u32 pvr; union { @@ -62,6 +170,82 @@ struct kvm_sregs { __u64 dbat[8]; } ppc32; } s; + struct { + union { + struct { /* KVM_SREGS_E_IMPL_FSL */ + __u32 features; /* KVM_SREGS_E_FSL_ */ + __u32 svr; + __u64 mcar; + __u32 hid0; + + /* KVM_SREGS_E_FSL_PIDn */ + __u32 pid1, pid2; + } fsl; + __u8 pad[256]; + } impl; + + __u32 features; /* KVM_SREGS_E_ */ + __u32 impl_id; /* KVM_SREGS_E_IMPL_ */ + __u32 update_special; /* KVM_SREGS_E_UPDATE_ */ + __u32 pir; /* read-only */ + __u64 sprg8; + __u64 sprg9; /* E.ED */ + __u64 csrr0; + __u64 dsrr0; /* E.ED */ + __u64 mcsrr0; + __u32 csrr1; + __u32 dsrr1; /* E.ED */ + __u32 mcsrr1; + __u32 esr; + __u64 dear; + __u64 ivpr; + __u64 mcivpr; + __u64 mcsr; /* KVM_SREGS_E_UPDATE_MCSR */ + + __u32 tsr; /* KVM_SREGS_E_UPDATE_TSR */ + __u32 tcr; + __u32 decar; + __u32 dec; /* KVM_SREGS_E_UPDATE_DEC */ + + /* + * Userspace can read TB directly, but the + * value reported here is consistent with "dec". + * + * Read-only. + */ + __u64 tb; + + __u32 dbsr; /* KVM_SREGS_E_UPDATE_DBSR */ + __u32 dbcr[3]; + __u32 iac[4]; + __u32 dac[2]; + __u32 dvc[2]; + __u8 num_iac; /* read-only */ + __u8 num_dac; /* read-only */ + __u8 num_dvc; /* read-only */ + __u8 pad; + + __u32 epr; /* EXP */ + __u32 vrsave; /* a.k.a. USPRG0 */ + __u32 epcr; /* KVM_SREGS_E_64 */ + + __u32 mas0; + __u32 mas1; + __u64 mas2; + __u64 mas7_3; + __u32 mas4; + __u32 mas6; + + __u32 ivor_low[16]; /* IVOR0-15 */ + __u32 ivor_high[18]; /* IVOR32+, plus room to expand */ + + __u32 mmucfg; /* read-only */ + __u32 eptcfg; /* E.PT, read-only */ + __u32 tlbcfg[4];/* read-only */ + __u32 tlbps[4]; /* read-only */ + + __u32 eplc, epsc; /* E.PD */ + } e; __u8 pad[1020]; } u; }; diff --git a/arch/powerpc/include/asm/kvm_44x.h b/arch/powerpc/include/asm/kvm_44x.h index d22d39942a9..a0e57618ff3 100644 --- a/arch/powerpc/include/asm/kvm_44x.h +++ b/arch/powerpc/include/asm/kvm_44x.h @@ -61,7 +61,6 @@ static inline struct kvmppc_vcpu_44x *to_44x(struct kvm_vcpu *vcpu) return container_of(vcpu, struct kvmppc_vcpu_44x, vcpu); } -void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 new_pid); void kvmppc_44x_tlb_put(struct kvm_vcpu *vcpu); void kvmppc_44x_tlb_load(struct kvm_vcpu *vcpu); diff --git a/arch/powerpc/include/asm/kvm_e500.h b/arch/powerpc/include/asm/kvm_e500.h index 7fea26fffb2..7a2a565f88c 100644 --- a/arch/powerpc/include/asm/kvm_e500.h +++ b/arch/powerpc/include/asm/kvm_e500.h @@ -43,6 +43,7 @@ struct kvmppc_vcpu_e500 { u32 host_pid[E500_PID_NUM]; u32 pid[E500_PID_NUM]; + u32 svr; u32 mas0; u32 mas1; @@ -58,6 +59,7 @@ struct kvmppc_vcpu_e500 { u32 hid1; u32 tlb0cfg; u32 tlb1cfg; + u64 mcar; struct kvm_vcpu vcpu; }; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index bba3b9b72a3..186f150b9b8 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -223,6 +223,7 @@ struct kvm_vcpu_arch { ulong hflags; ulong guest_owned_ext; #endif + u32 vrsave; /* also USPRG0 */ u32 mmucr; ulong sprg4; ulong sprg5; @@ -232,6 +233,9 @@ struct kvm_vcpu_arch { ulong csrr1; ulong dsrr0; ulong dsrr1; + ulong mcsrr0; + ulong mcsrr1; + ulong mcsr; ulong esr; u32 dec; u32 decar; @@ -255,6 +259,7 @@ struct kvm_vcpu_arch { u32 dbsr; #ifdef CONFIG_KVM_EXIT_TIMING + struct mutex exit_timing_lock; struct kvmppc_exit_timing timing_exit; struct kvmppc_exit_timing timing_last_enter; u32 last_exit_type; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index ecb3bc74c34..9345238edec 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -61,6 +61,7 @@ extern int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu); extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu); extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu); +extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb); /* Core-specific hooks */ @@ -142,4 +143,12 @@ static inline u32 kvmppc_set_field(u64 inst, int msb, int lsb, int value) return r; } +void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); +int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); + +void kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); +int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); + +void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid); + #endif /* __POWERPC_KVM_PPC_H__ */ diff --git a/arch/powerpc/include/asm/pte-hash64-64k.h b/arch/powerpc/include/asm/pte-hash64-64k.h index c4490f9c67c..59247e816ac 100644 --- a/arch/powerpc/include/asm/pte-hash64-64k.h +++ b/arch/powerpc/include/asm/pte-hash64-64k.h @@ -22,7 +22,7 @@ #define _PAGE_HASHPTE _PAGE_HPTE_SUB /* Note the full page bits must be in the same location as for normal - * 4k pages as the same asssembly will be used to insert 64K pages + * 4k pages as the same assembly will be used to insert 64K pages * wether the kernel has CONFIG_PPC_64K_PAGES or not */ #define _PAGE_F_SECOND 0x00008000 /* full page: hidx bits */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 6887661ac07..36e1c8a29be 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -396,6 +396,7 @@ int main(void) DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack)); DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid)); DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr)); + DEFINE(VCPU_VRSAVE, offsetof(struct kvm_vcpu, arch.vrsave)); DEFINE(VCPU_SPRG4, offsetof(struct kvm_vcpu, arch.sprg4)); DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5)); DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6)); diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c index bd9d35f59cf..76a6e40a6f7 100644 --- a/arch/powerpc/kernel/kgdb.c +++ b/arch/powerpc/kernel/kgdb.c @@ -142,7 +142,7 @@ static int kgdb_singlestep(struct pt_regs *regs) return 0; /* - * On Book E and perhaps other processsors, singlestep is handled on + * On Book E and perhaps other processors, singlestep is handled on * the critical exception stack. This causes current_thread_info() * to fail, since it it locates the thread_info by masking off * the low bits of the current stack pointer. We work around diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c index 74d0e742114..da3a1225c0a 100644 --- a/arch/powerpc/kvm/44x.c +++ b/arch/powerpc/kvm/44x.c @@ -107,6 +107,16 @@ int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu, return 0; } +void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + kvmppc_get_sregs_ivor(vcpu, sregs); +} + +int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + return kvmppc_set_sregs_ivor(vcpu, sregs); +} + struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) { struct kvmppc_vcpu_44x *vcpu_44x; diff --git a/arch/powerpc/kvm/44x_emulate.c b/arch/powerpc/kvm/44x_emulate.c index 65ea083a5b2..549bb2c9a47 100644 --- a/arch/powerpc/kvm/44x_emulate.c +++ b/arch/powerpc/kvm/44x_emulate.c @@ -158,7 +158,6 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) emulated = kvmppc_booke_emulate_mtspr(vcpu, sprn, rs); } - kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS); return emulated; } @@ -179,7 +178,6 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, rt); } - kvmppc_set_exit_type(vcpu, EMULATED_MFSPR_EXITS); return emulated; } diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index ef76acb455c..8462b3a1c1c 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -569,6 +569,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) kvmppc_set_msr(vcpu, regs->msr); vcpu->arch.shared->srr0 = regs->srr0; vcpu->arch.shared->srr1 = regs->srr1; + kvmppc_set_pid(vcpu, regs->pid); vcpu->arch.shared->sprg0 = regs->sprg0; vcpu->arch.shared->sprg1 = regs->sprg1; vcpu->arch.shared->sprg2 = regs->sprg2; @@ -584,16 +585,165 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) return 0; } +static void get_sregs_base(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + u64 tb = get_tb(); + + sregs->u.e.features |= KVM_SREGS_E_BASE; + + sregs->u.e.csrr0 = vcpu->arch.csrr0; + sregs->u.e.csrr1 = vcpu->arch.csrr1; + sregs->u.e.mcsr = vcpu->arch.mcsr; + sregs->u.e.esr = vcpu->arch.esr; + sregs->u.e.dear = vcpu->arch.shared->dar; + sregs->u.e.tsr = vcpu->arch.tsr; + sregs->u.e.tcr = vcpu->arch.tcr; + sregs->u.e.dec = kvmppc_get_dec(vcpu, tb); + sregs->u.e.tb = tb; + sregs->u.e.vrsave = vcpu->arch.vrsave; +} + +static int set_sregs_base(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + if (!(sregs->u.e.features & KVM_SREGS_E_BASE)) + return 0; + + vcpu->arch.csrr0 = sregs->u.e.csrr0; + vcpu->arch.csrr1 = sregs->u.e.csrr1; + vcpu->arch.mcsr = sregs->u.e.mcsr; + vcpu->arch.esr = sregs->u.e.esr; + vcpu->arch.shared->dar = sregs->u.e.dear; + vcpu->arch.vrsave = sregs->u.e.vrsave; + vcpu->arch.tcr = sregs->u.e.tcr; + + if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_DEC) + vcpu->arch.dec = sregs->u.e.dec; + + kvmppc_emulate_dec(vcpu); + + if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) { + /* + * FIXME: existing KVM timer handling is incomplete. + * TSR cannot be read by the guest, and its value in + * vcpu->arch is always zero. For now, just handle + * the case where the caller is trying to inject a + * decrementer interrupt. + */ + + if ((sregs->u.e.tsr & TSR_DIS) && + (vcpu->arch.tcr & TCR_DIE)) + kvmppc_core_queue_dec(vcpu); + } + + return 0; +} + +static void get_sregs_arch206(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + sregs->u.e.features |= KVM_SREGS_E_ARCH206; + + sregs->u.e.pir = 0; + sregs->u.e.mcsrr0 = vcpu->arch.mcsrr0; + sregs->u.e.mcsrr1 = vcpu->arch.mcsrr1; + sregs->u.e.decar = vcpu->arch.decar; + sregs->u.e.ivpr = vcpu->arch.ivpr; +} + +static int set_sregs_arch206(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + if (!(sregs->u.e.features & KVM_SREGS_E_ARCH206)) + return 0; + + if (sregs->u.e.pir != 0) + return -EINVAL; + + vcpu->arch.mcsrr0 = sregs->u.e.mcsrr0; + vcpu->arch.mcsrr1 = sregs->u.e.mcsrr1; + vcpu->arch.decar = sregs->u.e.decar; + vcpu->arch.ivpr = sregs->u.e.ivpr; + + return 0; +} + +void kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + sregs->u.e.features |= KVM_SREGS_E_IVOR; + + sregs->u.e.ivor_low[0] = vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL]; + sregs->u.e.ivor_low[1] = vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK]; + sregs->u.e.ivor_low[2] = vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE]; + sregs->u.e.ivor_low[3] = vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE]; + sregs->u.e.ivor_low[4] = vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL]; + sregs->u.e.ivor_low[5] = vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT]; + sregs->u.e.ivor_low[6] = vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM]; + sregs->u.e.ivor_low[7] = vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL]; + sregs->u.e.ivor_low[8] = vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL]; + sregs->u.e.ivor_low[9] = vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL]; + sregs->u.e.ivor_low[10] = vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER]; + sregs->u.e.ivor_low[11] = vcpu->arch.ivor[BOOKE_IRQPRIO_FIT]; + sregs->u.e.ivor_low[12] = vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG]; + sregs->u.e.ivor_low[13] = vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS]; + sregs->u.e.ivor_low[14] = vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS]; + sregs->u.e.ivor_low[15] = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG]; +} + +int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + if (!(sregs->u.e.features & KVM_SREGS_E_IVOR)) + return 0; + + vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL] = sregs->u.e.ivor_low[0]; + vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK] = sregs->u.e.ivor_low[1]; + vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE] = sregs->u.e.ivor_low[2]; + vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE] = sregs->u.e.ivor_low[3]; + vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL] = sregs->u.e.ivor_low[4]; + vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT] = sregs->u.e.ivor_low[5]; + vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM] = sregs->u.e.ivor_low[6]; + vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL] = sregs->u.e.ivor_low[7]; + vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL] = sregs->u.e.ivor_low[8]; + vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL] = sregs->u.e.ivor_low[9]; + vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER] = sregs->u.e.ivor_low[10]; + vcpu->arch.ivor[BOOKE_IRQPRIO_FIT] = sregs->u.e.ivor_low[11]; + vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG] = sregs->u.e.ivor_low[12]; + vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS] = sregs->u.e.ivor_low[13]; + vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS] = sregs->u.e.ivor_low[14]; + vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = sregs->u.e.ivor_low[15]; + + return 0; +} + int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { - return -ENOTSUPP; + sregs->pvr = vcpu->arch.pvr; + + get_sregs_base(vcpu, sregs); + get_sregs_arch206(vcpu, sregs); + kvmppc_core_get_sregs(vcpu, sregs); + return 0; } int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { - return -ENOTSUPP; + int ret; + + if (vcpu->arch.pvr != sregs->pvr) + return -EINVAL; + + ret = set_sregs_base(vcpu, sregs); + if (ret < 0) + return ret; + + ret = set_sregs_arch206(vcpu, sregs); + if (ret < 0) + return ret; + + return kvmppc_core_set_sregs(vcpu, sregs); } int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S index 1cc471faac2..b58ccae9590 100644 --- a/arch/powerpc/kvm/booke_interrupts.S +++ b/arch/powerpc/kvm/booke_interrupts.S @@ -380,7 +380,6 @@ lightweight_exit: * because host interrupt handlers would get confused. */ lwz r1, VCPU_GPR(r1)(r4) - /* XXX handle USPRG0 */ /* Host interrupt handlers may have clobbered these guest-readable * SPRGs, so we need to reload them here with the guest's values. */ lwz r3, VCPU_SPRG4(r4) diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index e3768ee9b59..318dbc61ba4 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -63,6 +63,7 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) /* Registers init */ vcpu->arch.pvr = mfspr(SPRN_PVR); + vcpu_e500->svr = mfspr(SPRN_SVR); /* Since booke kvm only support one core, update all vcpus' PIR to 0 */ vcpu->vcpu_id = 0; @@ -96,6 +97,81 @@ int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu, return 0; } +void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + + sregs->u.e.features |= KVM_SREGS_E_ARCH206_MMU | KVM_SREGS_E_SPE | + KVM_SREGS_E_PM; + sregs->u.e.impl_id = KVM_SREGS_E_IMPL_FSL; + + sregs->u.e.impl.fsl.features = 0; + sregs->u.e.impl.fsl.svr = vcpu_e500->svr; + sregs->u.e.impl.fsl.hid0 = vcpu_e500->hid0; + sregs->u.e.impl.fsl.mcar = vcpu_e500->mcar; + + sregs->u.e.mas0 = vcpu_e500->mas0; + sregs->u.e.mas1 = vcpu_e500->mas1; + sregs->u.e.mas2 = vcpu_e500->mas2; + sregs->u.e.mas7_3 = ((u64)vcpu_e500->mas7 << 32) | vcpu_e500->mas3; + sregs->u.e.mas4 = vcpu_e500->mas4; + sregs->u.e.mas6 = vcpu_e500->mas6; + + sregs->u.e.mmucfg = mfspr(SPRN_MMUCFG); + sregs->u.e.tlbcfg[0] = vcpu_e500->tlb0cfg; + sregs->u.e.tlbcfg[1] = vcpu_e500->tlb1cfg; + sregs->u.e.tlbcfg[2] = 0; + sregs->u.e.tlbcfg[3] = 0; + + sregs->u.e.ivor_high[0] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL]; + sregs->u.e.ivor_high[1] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA]; + sregs->u.e.ivor_high[2] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND]; + sregs->u.e.ivor_high[3] = + vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR]; + + kvmppc_get_sregs_ivor(vcpu, sregs); +} + +int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + + if (sregs->u.e.impl_id == KVM_SREGS_E_IMPL_FSL) { + vcpu_e500->svr = sregs->u.e.impl.fsl.svr; + vcpu_e500->hid0 = sregs->u.e.impl.fsl.hid0; + vcpu_e500->mcar = sregs->u.e.impl.fsl.mcar; + } + + if (sregs->u.e.features & KVM_SREGS_E_ARCH206_MMU) { + vcpu_e500->mas0 = sregs->u.e.mas0; + vcpu_e500->mas1 = sregs->u.e.mas1; + vcpu_e500->mas2 = sregs->u.e.mas2; + vcpu_e500->mas7 = sregs->u.e.mas7_3 >> 32; + vcpu_e500->mas3 = (u32)sregs->u.e.mas7_3; + vcpu_e500->mas4 = sregs->u.e.mas4; + vcpu_e500->mas6 = sregs->u.e.mas6; + } + + if (!(sregs->u.e.features & KVM_SREGS_E_IVOR)) + return 0; + + if (sregs->u.e.features & KVM_SREGS_E_SPE) { + vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL] = + sregs->u.e.ivor_high[0]; + vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA] = + sregs->u.e.ivor_high[1]; + vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND] = + sregs->u.e.ivor_high[2]; + } + + if (sregs->u.e.features & KVM_SREGS_E_PM) { + vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] = + sregs->u.e.ivor_high[3]; + } + + return kvmppc_set_sregs_ivor(vcpu, sregs); +} + struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) { struct kvmppc_vcpu_e500 *vcpu_e500; diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index 8e3edfbc963..69cd665a0ca 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved. + * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved. * * Author: Yu Liu, <yu.liu@freescale.com> * @@ -78,8 +78,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) switch (sprn) { case SPRN_PID: - vcpu_e500->pid[0] = vcpu->arch.shadow_pid = - vcpu->arch.pid = spr_val; + kvmppc_set_pid(vcpu, spr_val); break; case SPRN_PID1: vcpu_e500->pid[1] = spr_val; break; @@ -175,6 +174,8 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) kvmppc_set_gpr(vcpu, rt, vcpu_e500->hid0); break; case SPRN_HID1: kvmppc_set_gpr(vcpu, rt, vcpu_e500->hid1); break; + case SPRN_SVR: + kvmppc_set_gpr(vcpu, rt, vcpu_e500->svr); break; case SPRN_MMUCSR0: kvmppc_set_gpr(vcpu, rt, 0); break; diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index d6d6d47a75a..b18fe353397 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved. + * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved. * * Author: Yu Liu, yu.liu@freescale.com * @@ -24,6 +24,7 @@ #include "../mm/mmu_decl.h" #include "e500_tlb.h" #include "trace.h" +#include "timing.h" #define to_htlb1_esel(esel) (tlb1_entry_num - (esel) - 1) @@ -506,6 +507,7 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) vcpu_e500->mas7 = 0; } + kvmppc_set_exit_type(vcpu, EMULATED_TLBSX_EXITS); return EMULATE_DONE; } @@ -571,6 +573,7 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) write_host_tlbe(vcpu_e500, stlbsel, sesel); } + kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS); return EMULATE_DONE; } @@ -672,6 +675,14 @@ int kvmppc_e500_tlb_search(struct kvm_vcpu *vcpu, return -1; } +void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + + vcpu_e500->pid[0] = vcpu->arch.shadow_pid = + vcpu->arch.pid = pid; +} + void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500) { struct tlbe *tlbe; diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index c64fd2909bb..141dce3c681 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -114,6 +114,12 @@ void kvmppc_emulate_dec(struct kvm_vcpu *vcpu) } } +u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb) +{ + u64 jd = tb - vcpu->arch.dec_jiffies; + return vcpu->arch.dec - jd; +} + /* XXX to do: * lhax * lhaux @@ -279,11 +285,8 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) case SPRN_DEC: { - u64 jd = get_tb() - vcpu->arch.dec_jiffies; - kvmppc_set_gpr(vcpu, rt, vcpu->arch.dec - jd); - pr_debug("mfDEC: %x - %llx = %lx\n", - vcpu->arch.dec, jd, - kvmppc_get_gpr(vcpu, rt)); + kvmppc_set_gpr(vcpu, rt, + kvmppc_get_dec(vcpu, get_tb())); break; } default: @@ -294,6 +297,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) } break; } + kvmppc_set_exit_type(vcpu, EMULATED_MFSPR_EXITS); break; case OP_31_XOP_STHX: @@ -363,6 +367,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) printk("mtspr: unknown spr %x\n", sprn); break; } + kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS); break; case OP_31_XOP_DCBI: diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 99758460efd..616dd516ca1 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -175,7 +175,11 @@ int kvm_dev_ioctl_check_extension(long ext) int r; switch (ext) { +#ifdef CONFIG_BOOKE + case KVM_CAP_PPC_BOOKE_SREGS: +#else case KVM_CAP_PPC_SEGSTATE: +#endif case KVM_CAP_PPC_PAIRED_SINGLES: case KVM_CAP_PPC_UNSET_IRQ: case KVM_CAP_PPC_IRQ_LEVEL: @@ -284,6 +288,10 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) tasklet_init(&vcpu->arch.tasklet, kvmppc_decrementer_func, (ulong)vcpu); vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup; +#ifdef CONFIG_KVM_EXIT_TIMING + mutex_init(&vcpu->arch.exit_timing_lock); +#endif + return 0; } @@ -294,12 +302,25 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { +#ifdef CONFIG_BOOKE + /* + * vrsave (formerly usprg0) isn't used by Linux, but may + * be used by the guest. + * + * On non-booke this is associated with Altivec and + * is handled by code in book3s.c. + */ + mtspr(SPRN_VRSAVE, vcpu->arch.vrsave); +#endif kvmppc_core_vcpu_load(vcpu, cpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { kvmppc_core_vcpu_put(vcpu); +#ifdef CONFIG_BOOKE + vcpu->arch.vrsave = mfspr(SPRN_VRSAVE); +#endif } int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, diff --git a/arch/powerpc/kvm/timing.c b/arch/powerpc/kvm/timing.c index a021f5827a3..319177df958 100644 --- a/arch/powerpc/kvm/timing.c +++ b/arch/powerpc/kvm/timing.c @@ -34,8 +34,8 @@ void kvmppc_init_timing_stats(struct kvm_vcpu *vcpu) { int i; - /* pause guest execution to avoid concurrent updates */ - mutex_lock(&vcpu->mutex); + /* Take a lock to avoid concurrent updates */ + mutex_lock(&vcpu->arch.exit_timing_lock); vcpu->arch.last_exit_type = 0xDEAD; for (i = 0; i < __NUMBER_OF_KVM_EXIT_TYPES; i++) { @@ -49,7 +49,7 @@ void kvmppc_init_timing_stats(struct kvm_vcpu *vcpu) vcpu->arch.timing_exit.tv64 = 0; vcpu->arch.timing_last_enter.tv64 = 0; - mutex_unlock(&vcpu->mutex); + mutex_unlock(&vcpu->arch.exit_timing_lock); } static void add_exit_timing(struct kvm_vcpu *vcpu, u64 duration, int type) @@ -65,6 +65,8 @@ static void add_exit_timing(struct kvm_vcpu *vcpu, u64 duration, int type) return; } + mutex_lock(&vcpu->arch.exit_timing_lock); + vcpu->arch.timing_count_type[type]++; /* sum */ @@ -93,6 +95,8 @@ static void add_exit_timing(struct kvm_vcpu *vcpu, u64 duration, int type) vcpu->arch.timing_min_duration[type] = duration; if (unlikely(duration > vcpu->arch.timing_max_duration[type])) vcpu->arch.timing_max_duration[type] = duration; + + mutex_unlock(&vcpu->arch.exit_timing_lock); } void kvmppc_update_timing_stats(struct kvm_vcpu *vcpu) @@ -147,17 +151,30 @@ static int kvmppc_exit_timing_show(struct seq_file *m, void *private) { struct kvm_vcpu *vcpu = m->private; int i; + u64 min, max, sum, sum_quad; seq_printf(m, "%s", "type count min max sum sum_squared\n"); + for (i = 0; i < __NUMBER_OF_KVM_EXIT_TYPES; i++) { + + min = vcpu->arch.timing_min_duration[i]; + do_div(min, tb_ticks_per_usec); + max = vcpu->arch.timing_max_duration[i]; + do_div(max, tb_ticks_per_usec); + sum = vcpu->arch.timing_sum_duration[i]; + do_div(sum, tb_ticks_per_usec); + sum_quad = vcpu->arch.timing_sum_quad_duration[i]; + do_div(sum_quad, tb_ticks_per_usec); + seq_printf(m, "%12s %10d %10lld %10lld %20lld %20lld\n", kvm_exit_names[i], vcpu->arch.timing_count_type[i], - vcpu->arch.timing_min_duration[i], - vcpu->arch.timing_max_duration[i], - vcpu->arch.timing_sum_duration[i], - vcpu->arch.timing_sum_quad_duration[i]); + min, + max, + sum, + sum_quad); + } return 0; } diff --git a/arch/s390/hypfs/hypfs.h b/arch/s390/hypfs/hypfs.h index 80c1526f2af..d9df5a060a8 100644 --- a/arch/s390/hypfs/hypfs.h +++ b/arch/s390/hypfs/hypfs.h @@ -47,7 +47,7 @@ struct hypfs_dbfs_data { void *buf; void *buf_free_ptr; size_t size; - struct hypfs_dbfs_file *dbfs_file;; + struct hypfs_dbfs_file *dbfs_file; struct kref kref; }; diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index e560d102215..63a027c9ada 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -25,6 +25,10 @@ config SPARC select HAVE_DMA_ATTRS select HAVE_DMA_API_DEBUG select HAVE_ARCH_JUMP_LABEL + select HAVE_GENERIC_HARDIRQS + select GENERIC_HARDIRQS_NO_DEPRECATED + select GENERIC_IRQ_SHOW + select USE_GENERIC_SMP_HELPERS if SMP config SPARC32 def_bool !64BIT @@ -43,15 +47,12 @@ config SPARC64 select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD select HAVE_SYSCALL_TRACEPOINTS - select USE_GENERIC_SMP_HELPERS if SMP select RTC_DRV_CMOS select RTC_DRV_BQ4802 select RTC_DRV_SUN4V select RTC_DRV_STARFIRE select HAVE_PERF_EVENTS select PERF_USE_VMALLOC - select HAVE_GENERIC_HARDIRQS - select GENERIC_IRQ_SHOW select IRQ_PREFLOW_FASTEOI config ARCH_DEFCONFIG diff --git a/arch/sparc/include/asm/cpudata_32.h b/arch/sparc/include/asm/cpudata_32.h index 31d48a0e32c..a4c5a938b93 100644 --- a/arch/sparc/include/asm/cpudata_32.h +++ b/arch/sparc/include/asm/cpudata_32.h @@ -16,6 +16,10 @@ typedef struct { unsigned long clock_tick; unsigned int multiplier; unsigned int counter; +#ifdef CONFIG_SMP + unsigned int irq_resched_count; + unsigned int irq_call_count; +#endif int prom_node; int mid; int next; @@ -23,5 +27,6 @@ typedef struct { DECLARE_PER_CPU(cpuinfo_sparc, __cpu_data); #define cpu_data(__cpu) per_cpu(__cpu_data, (__cpu)) +#define local_cpu_data() __get_cpu_var(__cpu_data) #endif /* _SPARC_CPUDATA_H */ diff --git a/arch/sparc/include/asm/floppy_32.h b/arch/sparc/include/asm/floppy_32.h index 86666f70322..482c79e2a41 100644 --- a/arch/sparc/include/asm/floppy_32.h +++ b/arch/sparc/include/asm/floppy_32.h @@ -281,28 +281,27 @@ static inline void sun_fd_enable_dma(void) pdma_areasize = pdma_size; } -/* Our low-level entry point in arch/sparc/kernel/entry.S */ -extern int sparc_floppy_request_irq(int irq, unsigned long flags, - irq_handler_t irq_handler); +extern int sparc_floppy_request_irq(unsigned int irq, + irq_handler_t irq_handler); static int sun_fd_request_irq(void) { static int once = 0; - int error; - if(!once) { + if (!once) { once = 1; - error = sparc_floppy_request_irq(FLOPPY_IRQ, - IRQF_DISABLED, - floppy_interrupt); - return ((error == 0) ? 0 : -1); - } else return 0; + return sparc_floppy_request_irq(FLOPPY_IRQ, floppy_interrupt); + } else { + return 0; + } } static struct linux_prom_registers fd_regs[2]; static int sun_floppy_init(void) { + struct platform_device *op; + struct device_node *dp; char state[128]; phandle tnode, fd_node; int num_regs; @@ -310,7 +309,6 @@ static int sun_floppy_init(void) use_virtual_dma = 1; - FLOPPY_IRQ = 11; /* Forget it if we aren't on a machine that could possibly * ever have a floppy drive. */ @@ -349,6 +347,26 @@ static int sun_floppy_init(void) sun_fdc = (struct sun_flpy_controller *) of_ioremap(&r, 0, fd_regs[0].reg_size, "floppy"); + /* Look up irq in platform_device. + * We try "SUNW,fdtwo" and "fd" + */ + for_each_node_by_name(dp, "SUNW,fdtwo") { + op = of_find_device_by_node(dp); + if (op) + break; + } + if (!op) { + for_each_node_by_name(dp, "fd") { + op = of_find_device_by_node(dp); + if (op) + break; + } + } + if (!op) + goto no_sun_fdc; + + FLOPPY_IRQ = op->archdata.irqs[0]; + /* Last minute sanity check... */ if(sun_fdc->status_82072 == 0xff) { sun_fdc = NULL; diff --git a/arch/sparc/include/asm/io.h b/arch/sparc/include/asm/io.h index a34b2994937..f6902cf3cbe 100644 --- a/arch/sparc/include/asm/io.h +++ b/arch/sparc/include/asm/io.h @@ -5,4 +5,17 @@ #else #include <asm/io_32.h> #endif + +/* + * Defines used for both SPARC32 and SPARC64 + */ + +/* Big endian versions of memory read/write routines */ +#define readb_be(__addr) __raw_readb(__addr) +#define readw_be(__addr) __raw_readw(__addr) +#define readl_be(__addr) __raw_readl(__addr) +#define writeb_be(__b, __addr) __raw_writeb(__b, __addr) +#define writel_be(__w, __addr) __raw_writel(__w, __addr) +#define writew_be(__l, __addr) __raw_writew(__l, __addr) + #endif diff --git a/arch/sparc/include/asm/irq_32.h b/arch/sparc/include/asm/irq_32.h index eced3e3ebd3..2ae3acaeb1b 100644 --- a/arch/sparc/include/asm/irq_32.h +++ b/arch/sparc/include/asm/irq_32.h @@ -6,7 +6,11 @@ #ifndef _SPARC_IRQ_H #define _SPARC_IRQ_H -#define NR_IRQS 16 +/* Allocated number of logical irq numbers. + * sun4d boxes (ss2000e) should be OK with ~32. + * Be on the safe side and make room for 64 + */ +#define NR_IRQS 64 #include <linux/interrupt.h> diff --git a/arch/sparc/include/asm/leon.h b/arch/sparc/include/asm/leon.h index c04f96fb753..6bdaf1e43d2 100644 --- a/arch/sparc/include/asm/leon.h +++ b/arch/sparc/include/asm/leon.h @@ -52,29 +52,6 @@ #define LEON_DIAGF_VALID 0x2000 #define LEON_DIAGF_VALID_SHIFT 13 -/* - * Interrupt Sources - * - * The interrupt source numbers directly map to the trap type and to - * the bits used in the Interrupt Clear, Interrupt Force, Interrupt Mask, - * and the Interrupt Pending Registers. - */ -#define LEON_INTERRUPT_CORRECTABLE_MEMORY_ERROR 1 -#define LEON_INTERRUPT_UART_1_RX_TX 2 -#define LEON_INTERRUPT_UART_0_RX_TX 3 -#define LEON_INTERRUPT_EXTERNAL_0 4 -#define LEON_INTERRUPT_EXTERNAL_1 5 -#define LEON_INTERRUPT_EXTERNAL_2 6 -#define LEON_INTERRUPT_EXTERNAL_3 7 -#define LEON_INTERRUPT_TIMER1 8 -#define LEON_INTERRUPT_TIMER2 9 -#define LEON_INTERRUPT_EMPTY1 10 -#define LEON_INTERRUPT_EMPTY2 11 -#define LEON_INTERRUPT_OPEN_ETH 12 -#define LEON_INTERRUPT_EMPTY4 13 -#define LEON_INTERRUPT_EMPTY5 14 -#define LEON_INTERRUPT_EMPTY6 15 - /* irq masks */ #define LEON_HARD_INT(x) (1 << (x)) /* irq 0-15 */ #define LEON_IRQMASK_R 0x0000fffe /* bit 15- 1 of lregs.irqmask */ @@ -183,7 +160,6 @@ static inline void leon_srmmu_enabletlb(void) /* macro access for leon_readnobuffer_reg() */ #define LEON_BYPASSCACHE_LOAD_VA(x) leon_readnobuffer_reg((unsigned long)(x)) -extern void sparc_leon_eirq_register(int eirq); extern void leon_init(void); extern void leon_switch_mm(void); extern void leon_init_IRQ(void); @@ -239,8 +215,8 @@ static inline int sparc_leon3_cpuid(void) #endif /*!__ASSEMBLY__*/ #ifdef CONFIG_SMP -# define LEON3_IRQ_RESCHEDULE 13 -# define LEON3_IRQ_TICKER (leon_percpu_timer_dev[0].irq) +# define LEON3_IRQ_IPI_DEFAULT 13 +# define LEON3_IRQ_TICKER (leon3_ticker_irq) # define LEON3_IRQ_CROSS_CALL 15 #endif @@ -339,9 +315,9 @@ struct leon2_cacheregs { #include <linux/interrupt.h> struct device_node; -extern int sparc_leon_eirq_get(int eirq, int cpu); -extern irqreturn_t sparc_leon_eirq_isr(int dummy, void *dev_id); -extern void sparc_leon_eirq_register(int eirq); +extern unsigned int leon_build_device_irq(unsigned int real_irq, + irq_flow_handler_t flow_handler, + const char *name, int do_ack); extern void leon_clear_clock_irq(void); extern void leon_load_profile_irq(int cpu, unsigned int limit); extern void leon_init_timers(irq_handler_t counter_fn); @@ -358,6 +334,7 @@ extern void leon3_getCacheRegs(struct leon3_cacheregs *regs); extern int leon_flush_needed(void); extern void leon_switch_mm(void); extern int srmmu_swprobe_trace; +extern int leon3_ticker_irq; #ifdef CONFIG_SMP extern int leon_smp_nrcpus(void); @@ -366,17 +343,19 @@ extern void leon_smp_done(void); extern void leon_boot_cpus(void); extern int leon_boot_one_cpu(int i); void leon_init_smp(void); -extern void cpu_probe(void); extern void cpu_idle(void); extern void init_IRQ(void); extern void cpu_panic(void); extern int __leon_processor_id(void); void leon_enable_irq_cpu(unsigned int irq_nr, unsigned int cpu); +extern irqreturn_t leon_percpu_timer_interrupt(int irq, void *unused); -extern unsigned int real_irq_entry[], smpleon_ticker[]; +extern unsigned int real_irq_entry[]; +extern unsigned int smpleon_ipi[]; extern unsigned int patchme_maybe_smp_msg[]; extern unsigned int t_nmi[], linux_trap_ipi15_leon[]; extern unsigned int linux_trap_ipi15_sun4m[]; +extern int leon_ipi_irq; #endif /* CONFIG_SMP */ diff --git a/arch/sparc/include/asm/pcic.h b/arch/sparc/include/asm/pcic.h index f20ef562b26..7eb5d78f521 100644 --- a/arch/sparc/include/asm/pcic.h +++ b/arch/sparc/include/asm/pcic.h @@ -29,11 +29,17 @@ struct linux_pcic { int pcic_imdim; }; -extern int pcic_probe(void); -/* Erm... MJ redefined pcibios_present() so that it does not work early. */ +#ifdef CONFIG_PCI extern int pcic_present(void); +extern int pcic_probe(void); +extern void pci_time_init(void); extern void sun4m_pci_init_IRQ(void); - +#else +static inline int pcic_present(void) { return 0; } +static inline int pcic_probe(void) { return 0; } +static inline void pci_time_init(void) {} +static inline void sun4m_pci_init_IRQ(void) {} +#endif #endif /* Size of PCI I/O space which we relocate. */ diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index 303bd4dc829..5b31a8e8982 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -8,6 +8,8 @@ * Copyright (C) 1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz) */ +#include <linux/const.h> + #ifndef __ASSEMBLY__ #include <asm-generic/4level-fixup.h> @@ -456,9 +458,9 @@ extern int io_remap_pfn_range(struct vm_area_struct *vma, #endif /* !(__ASSEMBLY__) */ -#define VMALLOC_START 0xfe600000 +#define VMALLOC_START _AC(0xfe600000,UL) /* XXX Alter this when I get around to fixing sun4c - Anton */ -#define VMALLOC_END 0xffc00000 +#define VMALLOC_END _AC(0xffc00000,UL) /* We provide our own get_unmapped_area to cope with VA holes for userland */ diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index f8dddb7045b..b77128c8052 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -699,6 +699,9 @@ extern pmd_t swapper_low_pmd_dir[2048]; extern void paging_init(void); extern unsigned long find_ecache_flush_span(unsigned long size); +struct seq_file; +extern void mmu_info(struct seq_file *); + /* These do nothing with the way I have things setup. */ #define mmu_lockarea(vaddr, len) (vaddr) #define mmu_unlockarea(vaddr, len) do { } while(0) diff --git a/arch/sparc/include/asm/setup.h b/arch/sparc/include/asm/setup.h index 2643c62f4ac..64718ba2643 100644 --- a/arch/sparc/include/asm/setup.h +++ b/arch/sparc/include/asm/setup.h @@ -11,4 +11,16 @@ # define COMMAND_LINE_SIZE 256 #endif +#ifdef __KERNEL__ + +#ifdef CONFIG_SPARC32 +/* The CPU that was used for booting + * Only sun4d + leon may have boot_cpu_id != 0 + */ +extern unsigned char boot_cpu_id; +extern unsigned char boot_cpu_id4; +#endif + +#endif /* __KERNEL__ */ + #endif /* _SPARC_SETUP_H */ diff --git a/arch/sparc/include/asm/smp_32.h b/arch/sparc/include/asm/smp_32.h index d82d7f4c0a7..093f10843ff 100644 --- a/arch/sparc/include/asm/smp_32.h +++ b/arch/sparc/include/asm/smp_32.h @@ -50,42 +50,38 @@ void smp_callin(void); void smp_boot_cpus(void); void smp_store_cpu_info(int); +void smp_resched_interrupt(void); +void smp_call_function_single_interrupt(void); +void smp_call_function_interrupt(void); + struct seq_file; void smp_bogo(struct seq_file *); void smp_info(struct seq_file *); BTFIXUPDEF_CALL(void, smp_cross_call, smpfunc_t, cpumask_t, unsigned long, unsigned long, unsigned long, unsigned long) BTFIXUPDEF_CALL(int, __hard_smp_processor_id, void) +BTFIXUPDEF_CALL(void, smp_ipi_resched, int); +BTFIXUPDEF_CALL(void, smp_ipi_single, int); +BTFIXUPDEF_CALL(void, smp_ipi_mask_one, int); BTFIXUPDEF_BLACKBOX(hard_smp_processor_id) BTFIXUPDEF_BLACKBOX(load_current) #define smp_cross_call(func,mask,arg1,arg2,arg3,arg4) BTFIXUP_CALL(smp_cross_call)(func,mask,arg1,arg2,arg3,arg4) -static inline void xc0(smpfunc_t func) { smp_cross_call(func, cpu_online_map, 0, 0, 0, 0); } +static inline void xc0(smpfunc_t func) { smp_cross_call(func, *cpu_online_mask, 0, 0, 0, 0); } static inline void xc1(smpfunc_t func, unsigned long arg1) -{ smp_cross_call(func, cpu_online_map, arg1, 0, 0, 0); } +{ smp_cross_call(func, *cpu_online_mask, arg1, 0, 0, 0); } static inline void xc2(smpfunc_t func, unsigned long arg1, unsigned long arg2) -{ smp_cross_call(func, cpu_online_map, arg1, arg2, 0, 0); } +{ smp_cross_call(func, *cpu_online_mask, arg1, arg2, 0, 0); } static inline void xc3(smpfunc_t func, unsigned long arg1, unsigned long arg2, unsigned long arg3) -{ smp_cross_call(func, cpu_online_map, arg1, arg2, arg3, 0); } +{ smp_cross_call(func, *cpu_online_mask, arg1, arg2, arg3, 0); } static inline void xc4(smpfunc_t func, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4) -{ smp_cross_call(func, cpu_online_map, arg1, arg2, arg3, arg4); } - -static inline int smp_call_function(void (*func)(void *info), void *info, int wait) -{ - xc1((smpfunc_t)func, (unsigned long)info); - return 0; -} +{ smp_cross_call(func, *cpu_online_mask, arg1, arg2, arg3, arg4); } -static inline int smp_call_function_single(int cpuid, void (*func) (void *info), - void *info, int wait) -{ - smp_cross_call((smpfunc_t)func, cpumask_of_cpu(cpuid), - (unsigned long) info, 0, 0, 0); - return 0; -} +extern void arch_send_call_function_single_ipi(int cpu); +extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); static inline int cpu_logical_map(int cpu) { @@ -135,6 +131,11 @@ static inline int hard_smp_processor_id(void) __asm__ __volatile__("lda [%g0] ASI_M_VIKING_TMP1, %0\n\t" "nop; nop" : "=&r" (cpuid)); + - leon + __asm__ __volatile__( "rd %asr17, %0\n\t" + "srl %0, 0x1c, %0\n\t" + "nop\n\t" : + "=&r" (cpuid)); See btfixup.h and btfixupprep.c to understand how a blackbox works. */ __asm__ __volatile__("sethi %%hi(___b_hard_smp_processor_id), %0\n\t" diff --git a/arch/sparc/include/asm/smp_64.h b/arch/sparc/include/asm/smp_64.h index f49e11cd4de..20bca895071 100644 --- a/arch/sparc/include/asm/smp_64.h +++ b/arch/sparc/include/asm/smp_64.h @@ -49,6 +49,10 @@ extern void cpu_play_dead(void); extern void smp_fetch_global_regs(void); +struct seq_file; +void smp_bogo(struct seq_file *); +void smp_info(struct seq_file *); + #ifdef CONFIG_HOTPLUG_CPU extern int __cpu_disable(void); extern void __cpu_die(unsigned int cpu); diff --git a/arch/sparc/include/asm/spinlock_32.h b/arch/sparc/include/asm/spinlock_32.h index 7f9b9dba38a..5f5b8bf3f50 100644 --- a/arch/sparc/include/asm/spinlock_32.h +++ b/arch/sparc/include/asm/spinlock_32.h @@ -9,6 +9,7 @@ #ifndef __ASSEMBLY__ #include <asm/psr.h> +#include <asm/processor.h> /* for cpu_relax */ #define arch_spin_is_locked(lock) (*((volatile unsigned char *)(lock)) != 0) diff --git a/arch/sparc/include/asm/system_32.h b/arch/sparc/include/asm/system_32.h index 890036b3689..47a7e862474 100644 --- a/arch/sparc/include/asm/system_32.h +++ b/arch/sparc/include/asm/system_32.h @@ -15,11 +15,6 @@ #include <linux/irqflags.h> -static inline unsigned int probe_irq_mask(unsigned long val) -{ - return 0; -} - /* * Sparc (general) CPU types */ diff --git a/arch/sparc/include/asm/system_64.h b/arch/sparc/include/asm/system_64.h index e3b65d8cf41..3c96d3bb9f1 100644 --- a/arch/sparc/include/asm/system_64.h +++ b/arch/sparc/include/asm/system_64.h @@ -29,10 +29,6 @@ enum sparc_cpu { /* This cannot ever be a sun4c :) That's just history. */ #define ARCH_SUN4C 0 -extern const char *sparc_cpu_type; -extern const char *sparc_fpu_type; -extern const char *sparc_pmu_type; - extern char reboot_command[]; /* These are here in an effort to more fully work around Spitfire Errata diff --git a/arch/sparc/include/asm/winmacro.h b/arch/sparc/include/asm/winmacro.h index 5b0a06dc3bc..a9be04b0d04 100644 --- a/arch/sparc/include/asm/winmacro.h +++ b/arch/sparc/include/asm/winmacro.h @@ -103,6 +103,7 @@ st %scratch, [%cur_reg + TI_W_SAVED]; #ifdef CONFIG_SMP +/* Results of LOAD_CURRENT() after BTFIXUP for SUN4M, SUN4D & LEON (comments) */ #define LOAD_CURRENT4M(dest_reg, idreg) \ rd %tbr, %idreg; \ sethi %hi(current_set), %dest_reg; \ @@ -118,6 +119,14 @@ or %dest_reg, %lo(C_LABEL(current_set)), %dest_reg; \ ld [%idreg + %dest_reg], %dest_reg; +#define LOAD_CURRENT_LEON(dest_reg, idreg) \ + rd %asr17, %idreg; \ + sethi %hi(current_set), %dest_reg; \ + srl %idreg, 0x1c, %idreg; \ + or %dest_reg, %lo(current_set), %dest_reg; \ + sll %idreg, 0x2, %idreg; \ + ld [%idreg + %dest_reg], %dest_reg; + /* Blackbox - take care with this... - check smp4m and smp4d before changing this. */ #define LOAD_CURRENT(dest_reg, idreg) \ sethi %hi(___b_load_current), %idreg; \ diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile index 99aa4db6e9c..9cff2709a96 100644 --- a/arch/sparc/kernel/Makefile +++ b/arch/sparc/kernel/Makefile @@ -71,10 +71,6 @@ obj-$(CONFIG_SPARC64) += pcr.o obj-$(CONFIG_SPARC64) += nmi.o obj-$(CONFIG_SPARC64_SMP) += cpumap.o -# sparc32 do not use GENERIC_HARDIRQS but uses the generic devres implementation -obj-$(CONFIG_SPARC32) += devres.o -devres-y := ../../../kernel/irq/devres.o - obj-y += dma.o obj-$(CONFIG_SPARC32_PCI) += pcic.o diff --git a/arch/sparc/kernel/cpu.c b/arch/sparc/kernel/cpu.c index 7925c54f413..138dbbc8dc8 100644 --- a/arch/sparc/kernel/cpu.c +++ b/arch/sparc/kernel/cpu.c @@ -4,6 +4,7 @@ * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) */ +#include <linux/seq_file.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> @@ -11,7 +12,9 @@ #include <linux/threads.h> #include <asm/spitfire.h> +#include <asm/pgtable.h> #include <asm/oplib.h> +#include <asm/setup.h> #include <asm/page.h> #include <asm/head.h> #include <asm/psr.h> @@ -23,6 +26,9 @@ DEFINE_PER_CPU(cpuinfo_sparc, __cpu_data) = { 0 }; EXPORT_PER_CPU_SYMBOL(__cpu_data); +int ncpus_probed; +unsigned int fsr_storage; + struct cpu_info { int psr_vers; const char *name; @@ -247,13 +253,12 @@ static const struct manufacturer_info __initconst manufacturer_info[] = { * machine type value into consideration too. I will fix this. */ -const char *sparc_cpu_type; -const char *sparc_fpu_type; +static const char *sparc_cpu_type; +static const char *sparc_fpu_type; const char *sparc_pmu_type; -unsigned int fsr_storage; -static void set_cpu_and_fpu(int psr_impl, int psr_vers, int fpu_vers) +static void __init set_cpu_and_fpu(int psr_impl, int psr_vers, int fpu_vers) { const struct manufacturer_info *manuf; int i; @@ -313,7 +318,123 @@ static void set_cpu_and_fpu(int psr_impl, int psr_vers, int fpu_vers) } #ifdef CONFIG_SPARC32 -void __cpuinit cpu_probe(void) +static int show_cpuinfo(struct seq_file *m, void *__unused) +{ + seq_printf(m, + "cpu\t\t: %s\n" + "fpu\t\t: %s\n" + "promlib\t\t: Version %d Revision %d\n" + "prom\t\t: %d.%d\n" + "type\t\t: %s\n" + "ncpus probed\t: %d\n" + "ncpus active\t: %d\n" +#ifndef CONFIG_SMP + "CPU0Bogo\t: %lu.%02lu\n" + "CPU0ClkTck\t: %ld\n" +#endif + , + sparc_cpu_type, + sparc_fpu_type , + romvec->pv_romvers, + prom_rev, + romvec->pv_printrev >> 16, + romvec->pv_printrev & 0xffff, + &cputypval[0], + ncpus_probed, + num_online_cpus() +#ifndef CONFIG_SMP + , cpu_data(0).udelay_val/(500000/HZ), + (cpu_data(0).udelay_val/(5000/HZ)) % 100, + cpu_data(0).clock_tick +#endif + ); + +#ifdef CONFIG_SMP + smp_bogo(m); +#endif + mmu_info(m); +#ifdef CONFIG_SMP + smp_info(m); +#endif + return 0; +} +#endif /* CONFIG_SPARC32 */ + +#ifdef CONFIG_SPARC64 +unsigned int dcache_parity_tl1_occurred; +unsigned int icache_parity_tl1_occurred; + + +static int show_cpuinfo(struct seq_file *m, void *__unused) +{ + seq_printf(m, + "cpu\t\t: %s\n" + "fpu\t\t: %s\n" + "pmu\t\t: %s\n" + "prom\t\t: %s\n" + "type\t\t: %s\n" + "ncpus probed\t: %d\n" + "ncpus active\t: %d\n" + "D$ parity tl1\t: %u\n" + "I$ parity tl1\t: %u\n" +#ifndef CONFIG_SMP + "Cpu0ClkTck\t: %016lx\n" +#endif + , + sparc_cpu_type, + sparc_fpu_type, + sparc_pmu_type, + prom_version, + ((tlb_type == hypervisor) ? + "sun4v" : + "sun4u"), + ncpus_probed, + num_online_cpus(), + dcache_parity_tl1_occurred, + icache_parity_tl1_occurred +#ifndef CONFIG_SMP + , cpu_data(0).clock_tick +#endif + ); +#ifdef CONFIG_SMP + smp_bogo(m); +#endif + mmu_info(m); +#ifdef CONFIG_SMP + smp_info(m); +#endif + return 0; +} +#endif /* CONFIG_SPARC64 */ + +static void *c_start(struct seq_file *m, loff_t *pos) +{ + /* The pointer we are returning is arbitrary, + * it just has to be non-NULL and not IS_ERR + * in the success case. + */ + return *pos == 0 ? &c_start : NULL; +} + +static void *c_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return c_start(m, pos); +} + +static void c_stop(struct seq_file *m, void *v) +{ +} + +const struct seq_operations cpuinfo_op = { + .start =c_start, + .next = c_next, + .stop = c_stop, + .show = show_cpuinfo, +}; + +#ifdef CONFIG_SPARC32 +static int __init cpu_type_probe(void) { int psr_impl, psr_vers, fpu_vers; int psr; @@ -332,8 +453,12 @@ void __cpuinit cpu_probe(void) put_psr(psr); set_cpu_and_fpu(psr_impl, psr_vers, fpu_vers); + + return 0; } -#else +#endif /* CONFIG_SPARC32 */ + +#ifdef CONFIG_SPARC64 static void __init sun4v_cpu_probe(void) { switch (sun4v_chip_type) { @@ -374,6 +499,6 @@ static int __init cpu_type_probe(void) } return 0; } +#endif /* CONFIG_SPARC64 */ early_initcall(cpu_type_probe); -#endif diff --git a/arch/sparc/kernel/cpumap.c b/arch/sparc/kernel/cpumap.c index 8de64c8126b..d91fd782743 100644 --- a/arch/sparc/kernel/cpumap.c +++ b/arch/sparc/kernel/cpumap.c @@ -202,7 +202,7 @@ static struct cpuinfo_tree *build_cpuinfo_tree(void) new_tree->total_nodes = n; memcpy(&new_tree->level, tmp_level, sizeof(tmp_level)); - prev_cpu = cpu = first_cpu(cpu_online_map); + prev_cpu = cpu = cpumask_first(cpu_online_mask); /* Initialize all levels in the tree with the first CPU */ for (level = CPUINFO_LVL_PROC; level >= CPUINFO_LVL_ROOT; level--) { @@ -381,7 +381,7 @@ static int simple_map_to_cpu(unsigned int index) } /* Impossible, since num_online_cpus() <= num_possible_cpus() */ - return first_cpu(cpu_online_map); + return cpumask_first(cpu_online_mask); } static int _map_to_cpu(unsigned int index) diff --git a/arch/sparc/kernel/devices.c b/arch/sparc/kernel/devices.c index d2eddd6647c..113c052c304 100644 --- a/arch/sparc/kernel/devices.c +++ b/arch/sparc/kernel/devices.c @@ -20,7 +20,6 @@ #include <asm/system.h> #include <asm/cpudata.h> -extern void cpu_probe(void); extern void clock_stop_probe(void); /* tadpole.c */ extern void sun4c_probe_memerr_reg(void); @@ -115,7 +114,7 @@ int cpu_get_hwmid(phandle prom_node) void __init device_scan(void) { - prom_printf("Booting Linux...\n"); + printk(KERN_NOTICE "Booting Linux...\n"); #ifndef CONFIG_SMP { @@ -133,7 +132,6 @@ void __init device_scan(void) } #endif /* !CONFIG_SMP */ - cpu_probe(); { extern void auxio_probe(void); extern void auxio_power_probe(void); diff --git a/arch/sparc/kernel/ds.c b/arch/sparc/kernel/ds.c index 3add4de8a1a..dd1342c0a3b 100644 --- a/arch/sparc/kernel/ds.c +++ b/arch/sparc/kernel/ds.c @@ -497,7 +497,7 @@ static void dr_cpu_init_response(struct ds_data *resp, u64 req_num, tag->num_records = ncpus; i = 0; - for_each_cpu_mask(cpu, *mask) { + for_each_cpu(cpu, mask) { ent[i].cpu = cpu; ent[i].result = DR_CPU_RES_OK; ent[i].stat = default_stat; @@ -534,7 +534,7 @@ static int __cpuinit dr_cpu_configure(struct ds_info *dp, int resp_len, ncpus, cpu; unsigned long flags; - ncpus = cpus_weight(*mask); + ncpus = cpumask_weight(mask); resp_len = dr_cpu_size_response(ncpus); resp = kzalloc(resp_len, GFP_KERNEL); if (!resp) @@ -547,7 +547,7 @@ static int __cpuinit dr_cpu_configure(struct ds_info *dp, mdesc_populate_present_mask(mask); mdesc_fill_in_cpu_data(mask); - for_each_cpu_mask(cpu, *mask) { + for_each_cpu(cpu, mask) { int err; printk(KERN_INFO "ds-%llu: Starting cpu %d...\n", @@ -593,7 +593,7 @@ static int dr_cpu_unconfigure(struct ds_info *dp, int resp_len, ncpus, cpu; unsigned long flags; - ncpus = cpus_weight(*mask); + ncpus = cpumask_weight(mask); resp_len = dr_cpu_size_response(ncpus); resp = kzalloc(resp_len, GFP_KERNEL); if (!resp) @@ -603,7 +603,7 @@ static int dr_cpu_unconfigure(struct ds_info *dp, resp_len, ncpus, mask, DR_CPU_STAT_UNCONFIGURED); - for_each_cpu_mask(cpu, *mask) { + for_each_cpu(cpu, mask) { int err; printk(KERN_INFO "ds-%llu: Shutting down cpu %d...\n", @@ -649,13 +649,13 @@ static void __cpuinit dr_cpu_data(struct ds_info *dp, purge_dups(cpu_list, tag->num_records); - cpus_clear(mask); + cpumask_clear(&mask); for (i = 0; i < tag->num_records; i++) { if (cpu_list[i] == CPU_SENTINEL) continue; if (cpu_list[i] < nr_cpu_ids) - cpu_set(cpu_list[i], mask); + cpumask_set_cpu(cpu_list[i], &mask); } if (tag->type == DR_CPU_CONFIGURE) diff --git a/arch/sparc/kernel/entry.S b/arch/sparc/kernel/entry.S index 6da784a5612..8341963f4c8 100644 --- a/arch/sparc/kernel/entry.S +++ b/arch/sparc/kernel/entry.S @@ -269,19 +269,22 @@ smp4m_ticker: /* Here is where we check for possible SMP IPI passed to us * on some level other than 15 which is the NMI and only used * for cross calls. That has a separate entry point below. + * + * IPIs are sent on Level 12, 13 and 14. See IRQ_IPI_*. */ maybe_smp4m_msg: GET_PROCESSOR4M_ID(o3) sethi %hi(sun4m_irq_percpu), %l5 sll %o3, 2, %o3 or %l5, %lo(sun4m_irq_percpu), %o5 - sethi %hi(0x40000000), %o2 + sethi %hi(0x70000000), %o2 ! Check all soft-IRQs ld [%o5 + %o3], %o1 ld [%o1 + 0x00], %o3 ! sun4m_irq_percpu[cpu]->pending andcc %o3, %o2, %g0 be,a smp4m_ticker cmp %l7, 14 - st %o2, [%o1 + 0x04] ! sun4m_irq_percpu[cpu]->clear=0x40000000 + /* Soft-IRQ IPI */ + st %o2, [%o1 + 0x04] ! sun4m_irq_percpu[cpu]->clear=0x70000000 WRITE_PAUSE ld [%o1 + 0x00], %g0 ! sun4m_irq_percpu[cpu]->pending WRITE_PAUSE @@ -290,9 +293,27 @@ maybe_smp4m_msg: WRITE_PAUSE wr %l4, PSR_ET, %psr WRITE_PAUSE - call smp_reschedule_irq + sll %o2, 28, %o2 ! shift for simpler checks below +maybe_smp4m_msg_check_single: + andcc %o2, 0x1, %g0 + beq,a maybe_smp4m_msg_check_mask + andcc %o2, 0x2, %g0 + call smp_call_function_single_interrupt nop - + andcc %o2, 0x2, %g0 +maybe_smp4m_msg_check_mask: + beq,a maybe_smp4m_msg_check_resched + andcc %o2, 0x4, %g0 + call smp_call_function_interrupt + nop + andcc %o2, 0x4, %g0 +maybe_smp4m_msg_check_resched: + /* rescheduling is done in RESTORE_ALL regardless, but incr stats */ + beq,a maybe_smp4m_msg_out + nop + call smp_resched_interrupt + nop +maybe_smp4m_msg_out: RESTORE_ALL .align 4 @@ -401,18 +422,18 @@ linux_trap_ipi15_sun4d: 1: b,a 1b #ifdef CONFIG_SPARC_LEON - - .globl smpleon_ticker - /* SMP per-cpu ticker interrupts are handled specially. */ -smpleon_ticker: + .globl smpleon_ipi + .extern leon_ipi_interrupt + /* SMP per-cpu IPI interrupts are handled specially. */ +smpleon_ipi: SAVE_ALL or %l0, PSR_PIL, %g2 wr %g2, 0x0, %psr WRITE_PAUSE wr %g2, PSR_ET, %psr WRITE_PAUSE - call leon_percpu_timer_interrupt - add %sp, STACKFRAME_SZ, %o0 + call leonsmp_ipi_interrupt + add %sp, STACKFRAME_SZ, %o1 ! pt_regs wr %l0, PSR_ET, %psr WRITE_PAUSE RESTORE_ALL diff --git a/arch/sparc/kernel/head_32.S b/arch/sparc/kernel/head_32.S index 59423491cef..58778575983 100644 --- a/arch/sparc/kernel/head_32.S +++ b/arch/sparc/kernel/head_32.S @@ -810,31 +810,25 @@ found_version: got_prop: #ifdef CONFIG_SPARC_LEON /* no cpu-type check is needed, it is a SPARC-LEON */ -#ifdef CONFIG_SMP - ba leon_smp_init - nop - .global leon_smp_init -leon_smp_init: - sethi %hi(boot_cpu_id), %g1 ! master always 0 - stb %g0, [%g1 + %lo(boot_cpu_id)] - sethi %hi(boot_cpu_id4), %g1 ! master always 0 - stb %g0, [%g1 + %lo(boot_cpu_id4)] + sethi %hi(boot_cpu_id), %g2 ! boot-cpu index - rd %asr17,%g1 - srl %g1,28,%g1 +#ifdef CONFIG_SMP + ldub [%g2 + %lo(boot_cpu_id)], %g1 + cmp %g1, 0xff ! unset means first CPU + bne leon_smp_cpu_startup ! continue only with master + nop +#endif + /* Get CPU-ID from most significant 4-bit of ASR17 */ + rd %asr17, %g1 + srl %g1, 28, %g1 - cmp %g0,%g1 - beq sun4c_continue_boot !continue with master - nop + /* Update boot_cpu_id only on boot cpu */ + stub %g1, [%g2 + %lo(boot_cpu_id)] - ba leon_smp_cpu_startup - nop -#else ba sun4c_continue_boot nop #endif -#endif set cputypval, %o2 ldub [%o2 + 0x4], %l1 @@ -893,9 +887,6 @@ sun4d_init: sta %g4, [%g0] ASI_M_VIKING_TMP1 sethi %hi(boot_cpu_id), %g5 stb %g4, [%g5 + %lo(boot_cpu_id)] - sll %g4, 2, %g4 - sethi %hi(boot_cpu_id4), %g5 - stb %g4, [%g5 + %lo(boot_cpu_id4)] #endif /* Fall through to sun4m_init */ @@ -1024,14 +1015,28 @@ sun4c_continue_boot: bl 1b add %o0, 0x1, %o0 + /* If boot_cpu_id has not been setup by machine specific + * init-code above we default it to zero. + */ + sethi %hi(boot_cpu_id), %g2 + ldub [%g2 + %lo(boot_cpu_id)], %g3 + cmp %g3, 0xff + bne 1f + nop + mov %g0, %g3 + stub %g3, [%g2 + %lo(boot_cpu_id)] + +1: /* boot_cpu_id set. calculate boot_cpu_id4 = boot_cpu_id*4 */ + sll %g3, 2, %g3 + sethi %hi(boot_cpu_id4), %g2 + stub %g3, [%g2 + %lo(boot_cpu_id4)] + /* Initialize the uwinmask value for init task just in case. * But first make current_set[boot_cpu_id] point to something useful. */ set init_thread_union, %g6 set current_set, %g2 #ifdef CONFIG_SMP - sethi %hi(boot_cpu_id4), %g3 - ldub [%g3 + %lo(boot_cpu_id4)], %g3 st %g6, [%g2] add %g2, %g3, %g2 #endif diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c index c6ce9a6a479..1c9c80a1a86 100644 --- a/arch/sparc/kernel/ioport.c +++ b/arch/sparc/kernel/ioport.c @@ -50,10 +50,15 @@ #include <asm/io-unit.h> #include <asm/leon.h> +/* This function must make sure that caches and memory are coherent after DMA + * On LEON systems without cache snooping it flushes the entire D-CACHE. + */ #ifndef CONFIG_SPARC_LEON -#define mmu_inval_dma_area(p, l) /* Anton pulled it out for 2.4.0-xx */ +static inline void dma_make_coherent(unsigned long pa, unsigned long len) +{ +} #else -static inline void mmu_inval_dma_area(void *va, unsigned long len) +static inline void dma_make_coherent(unsigned long pa, unsigned long len) { if (!sparc_leon3_snooping_enabled()) leon_flush_dcache_all(); @@ -284,7 +289,6 @@ static void *sbus_alloc_coherent(struct device *dev, size_t len, printk("sbus_alloc_consistent: cannot occupy 0x%lx", len_total); goto err_nova; } - mmu_inval_dma_area((void *)va, len_total); // XXX The mmu_map_dma_area does this for us below, see comments. // sparc_mapiorange(0, virt_to_phys(va), res->start, len_total); @@ -336,7 +340,6 @@ static void sbus_free_coherent(struct device *dev, size_t n, void *p, release_resource(res); kfree(res); - /* mmu_inval_dma_area(va, n); */ /* it's consistent, isn't it */ pgv = virt_to_page(p); mmu_unmap_dma_area(dev, ba, n); @@ -463,7 +466,6 @@ static void *pci32_alloc_coherent(struct device *dev, size_t len, printk("pci_alloc_consistent: cannot occupy 0x%lx", len_total); goto err_nova; } - mmu_inval_dma_area(va, len_total); sparc_mapiorange(0, virt_to_phys(va), res->start, len_total); *pba = virt_to_phys(va); /* equals virt_to_bus (R.I.P.) for us. */ @@ -489,7 +491,6 @@ static void pci32_free_coherent(struct device *dev, size_t n, void *p, dma_addr_t ba) { struct resource *res; - void *pgp; if ((res = _sparc_find_resource(&_sparc_dvma, (unsigned long)p)) == NULL) { @@ -509,14 +510,12 @@ static void pci32_free_coherent(struct device *dev, size_t n, void *p, return; } - pgp = phys_to_virt(ba); /* bus_to_virt actually */ - mmu_inval_dma_area(pgp, n); + dma_make_coherent(ba, n); sparc_unmapiorange((unsigned long)p, n); release_resource(res); kfree(res); - - free_pages((unsigned long)pgp, get_order(n)); + free_pages((unsigned long)phys_to_virt(ba), get_order(n)); } /* @@ -535,7 +534,7 @@ static void pci32_unmap_page(struct device *dev, dma_addr_t ba, size_t size, enum dma_data_direction dir, struct dma_attrs *attrs) { if (dir != PCI_DMA_TODEVICE) - mmu_inval_dma_area(phys_to_virt(ba), PAGE_ALIGN(size)); + dma_make_coherent(ba, PAGE_ALIGN(size)); } /* Map a set of buffers described by scatterlist in streaming @@ -562,8 +561,7 @@ static int pci32_map_sg(struct device *device, struct scatterlist *sgl, /* IIep is write-through, not flushing. */ for_each_sg(sgl, sg, nents, n) { - BUG_ON(page_address(sg_page(sg)) == NULL); - sg->dma_address = virt_to_phys(sg_virt(sg)); + sg->dma_address = sg_phys(sg); sg->dma_length = sg->length; } return nents; @@ -582,9 +580,7 @@ static void pci32_unmap_sg(struct device *dev, struct scatterlist *sgl, if (dir != PCI_DMA_TODEVICE) { for_each_sg(sgl, sg, nents, n) { - BUG_ON(page_address(sg_page(sg)) == NULL); - mmu_inval_dma_area(page_address(sg_page(sg)), - PAGE_ALIGN(sg->length)); + dma_make_coherent(sg_phys(sg), PAGE_ALIGN(sg->length)); } } } @@ -603,8 +599,7 @@ static void pci32_sync_single_for_cpu(struct device *dev, dma_addr_t ba, size_t size, enum dma_data_direction dir) { if (dir != PCI_DMA_TODEVICE) { - mmu_inval_dma_area(phys_to_virt(ba), - PAGE_ALIGN(size)); + dma_make_coherent(ba, PAGE_ALIGN(size)); } } @@ -612,8 +607,7 @@ static void pci32_sync_single_for_device(struct device *dev, dma_addr_t ba, size_t size, enum dma_data_direction dir) { if (dir != PCI_DMA_TODEVICE) { - mmu_inval_dma_area(phys_to_virt(ba), - PAGE_ALIGN(size)); + dma_make_coherent(ba, PAGE_ALIGN(size)); } } @@ -631,9 +625,7 @@ static void pci32_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, if (dir != PCI_DMA_TODEVICE) { for_each_sg(sgl, sg, nents, n) { - BUG_ON(page_address(sg_page(sg)) == NULL); - mmu_inval_dma_area(page_address(sg_page(sg)), - PAGE_ALIGN(sg->length)); + dma_make_coherent(sg_phys(sg), PAGE_ALIGN(sg->length)); } } } @@ -646,9 +638,7 @@ static void pci32_sync_sg_for_device(struct device *device, struct scatterlist * if (dir != PCI_DMA_TODEVICE) { for_each_sg(sgl, sg, nents, n) { - BUG_ON(page_address(sg_page(sg)) == NULL); - mmu_inval_dma_area(page_address(sg_page(sg)), - PAGE_ALIGN(sg->length)); + dma_make_coherent(sg_phys(sg), PAGE_ALIGN(sg->length)); } } } diff --git a/arch/sparc/kernel/irq.h b/arch/sparc/kernel/irq.h index 008453b798e..100b9c204e7 100644 --- a/arch/sparc/kernel/irq.h +++ b/arch/sparc/kernel/irq.h @@ -2,6 +2,23 @@ #include <asm/btfixup.h> +struct irq_bucket { + struct irq_bucket *next; + unsigned int real_irq; + unsigned int irq; + unsigned int pil; +}; + +#define SUN4D_MAX_BOARD 10 +#define SUN4D_MAX_IRQ ((SUN4D_MAX_BOARD + 2) << 5) + +/* Map between the irq identifier used in hw to the + * irq_bucket. The map is sufficient large to hold + * the sun4d hw identifiers. + */ +extern struct irq_bucket *irq_map[SUN4D_MAX_IRQ]; + + /* sun4m specific type definitions */ /* This maps direct to CPU specific interrupt registers */ @@ -35,6 +52,10 @@ struct sparc_irq_config { }; extern struct sparc_irq_config sparc_irq_config; +unsigned int irq_alloc(unsigned int real_irq, unsigned int pil); +void irq_link(unsigned int irq); +void irq_unlink(unsigned int irq); +void handler_irq(unsigned int pil, struct pt_regs *regs); /* Dave Redman (djhr@tadpole.co.uk) * changed these to function pointers.. it saves cycles and will allow @@ -44,33 +65,9 @@ extern struct sparc_irq_config sparc_irq_config; * Changed these to btfixup entities... It saves cycles :) */ -BTFIXUPDEF_CALL(void, disable_irq, unsigned int) -BTFIXUPDEF_CALL(void, enable_irq, unsigned int) -BTFIXUPDEF_CALL(void, disable_pil_irq, unsigned int) -BTFIXUPDEF_CALL(void, enable_pil_irq, unsigned int) BTFIXUPDEF_CALL(void, clear_clock_irq, void) BTFIXUPDEF_CALL(void, load_profile_irq, int, unsigned int) -static inline void __disable_irq(unsigned int irq) -{ - BTFIXUP_CALL(disable_irq)(irq); -} - -static inline void __enable_irq(unsigned int irq) -{ - BTFIXUP_CALL(enable_irq)(irq); -} - -static inline void disable_pil_irq(unsigned int irq) -{ - BTFIXUP_CALL(disable_pil_irq)(irq); -} - -static inline void enable_pil_irq(unsigned int irq) -{ - BTFIXUP_CALL(enable_pil_irq)(irq); -} - static inline void clear_clock_irq(void) { BTFIXUP_CALL(clear_clock_irq)(); @@ -89,4 +86,10 @@ BTFIXUPDEF_CALL(void, set_irq_udt, int) #define set_cpu_int(cpu,level) BTFIXUP_CALL(set_cpu_int)(cpu,level) #define clear_cpu_int(cpu,level) BTFIXUP_CALL(clear_cpu_int)(cpu,level) #define set_irq_udt(cpu) BTFIXUP_CALL(set_irq_udt)(cpu) + +/* All SUN4D IPIs are sent on this IRQ, may be shared with hard IRQs */ +#define SUN4D_IPI_IRQ 14 + +extern void sun4d_ipi_interrupt(void); + #endif diff --git a/arch/sparc/kernel/irq_32.c b/arch/sparc/kernel/irq_32.c index 7c93df4099c..9b89d842913 100644 --- a/arch/sparc/kernel/irq_32.c +++ b/arch/sparc/kernel/irq_32.c @@ -15,6 +15,7 @@ #include <linux/seq_file.h> #include <asm/cacheflush.h> +#include <asm/cpudata.h> #include <asm/pcic.h> #include <asm/leon.h> @@ -101,284 +102,173 @@ EXPORT_SYMBOL(arch_local_irq_restore); * directed CPU interrupts using the existing enable/disable irq code * with tweaks. * + * Sun4d complicates things even further. IRQ numbers are arbitrary + * 32-bit values in that case. Since this is similar to sparc64, + * we adopt a virtual IRQ numbering scheme as is done there. + * Virutal interrupt numbers are allocated by build_irq(). So NR_IRQS + * just becomes a limit of how many interrupt sources we can handle in + * a single system. Even fully loaded SS2000 machines top off at + * about 32 interrupt sources or so, therefore a NR_IRQS value of 64 + * is more than enough. + * + * We keep a map of per-PIL enable interrupts. These get wired + * up via the irq_chip->startup() method which gets invoked by + * the generic IRQ layer during request_irq(). */ +/* Table of allocated irqs. Unused entries has irq == 0 */ +static struct irq_bucket irq_table[NR_IRQS]; +/* Protect access to irq_table */ +static DEFINE_SPINLOCK(irq_table_lock); -/* - * Dave Redman (djhr@tadpole.co.uk) - * - * There used to be extern calls and hard coded values here.. very sucky! - * instead, because some of the devices attach very early, I do something - * equally sucky but at least we'll never try to free statically allocated - * space or call kmalloc before kmalloc_init :(. - * - * In fact it's the timer10 that attaches first.. then timer14 - * then kmalloc_init is called.. then the tty interrupts attach. - * hmmm.... - * - */ -#define MAX_STATIC_ALLOC 4 -struct irqaction static_irqaction[MAX_STATIC_ALLOC]; -int static_irq_count; - -static struct { - struct irqaction *action; - int flags; -} sparc_irq[NR_IRQS]; -#define SPARC_IRQ_INPROGRESS 1 - -/* Used to protect the IRQ action lists */ -DEFINE_SPINLOCK(irq_action_lock); +/* Map between the irq identifier used in hw to the irq_bucket. */ +struct irq_bucket *irq_map[SUN4D_MAX_IRQ]; +/* Protect access to irq_map */ +static DEFINE_SPINLOCK(irq_map_lock); -int show_interrupts(struct seq_file *p, void *v) +/* Allocate a new irq from the irq_table */ +unsigned int irq_alloc(unsigned int real_irq, unsigned int pil) { - int i = *(loff_t *)v; - struct irqaction *action; unsigned long flags; -#ifdef CONFIG_SMP - int j; -#endif + unsigned int i; + + spin_lock_irqsave(&irq_table_lock, flags); + for (i = 1; i < NR_IRQS; i++) { + if (irq_table[i].real_irq == real_irq && irq_table[i].pil == pil) + goto found; + } - if (sparc_cpu_model == sun4d) - return show_sun4d_interrupts(p, v); + for (i = 1; i < NR_IRQS; i++) { + if (!irq_table[i].irq) + break; + } - spin_lock_irqsave(&irq_action_lock, flags); if (i < NR_IRQS) { - action = sparc_irq[i].action; - if (!action) - goto out_unlock; - seq_printf(p, "%3d: ", i); -#ifndef CONFIG_SMP - seq_printf(p, "%10u ", kstat_irqs(i)); -#else - for_each_online_cpu(j) { - seq_printf(p, "%10u ", - kstat_cpu(j).irqs[i]); - } -#endif - seq_printf(p, " %c %s", - (action->flags & IRQF_DISABLED) ? '+' : ' ', - action->name); - for (action = action->next; action; action = action->next) { - seq_printf(p, ",%s %s", - (action->flags & IRQF_DISABLED) ? " +" : "", - action->name); - } - seq_putc(p, '\n'); + irq_table[i].real_irq = real_irq; + irq_table[i].irq = i; + irq_table[i].pil = pil; + } else { + printk(KERN_ERR "IRQ: Out of virtual IRQs.\n"); + i = 0; } -out_unlock: - spin_unlock_irqrestore(&irq_action_lock, flags); - return 0; +found: + spin_unlock_irqrestore(&irq_table_lock, flags); + + return i; } -void free_irq(unsigned int irq, void *dev_id) +/* Based on a single pil handler_irq may need to call several + * interrupt handlers. Use irq_map as entry to irq_table, + * and let each entry in irq_table point to the next entry. + */ +void irq_link(unsigned int irq) { - struct irqaction *action; - struct irqaction **actionp; + struct irq_bucket *p; unsigned long flags; - unsigned int cpu_irq; - - if (sparc_cpu_model == sun4d) { - sun4d_free_irq(irq, dev_id); - return; - } - cpu_irq = irq & (NR_IRQS - 1); - if (cpu_irq > 14) { /* 14 irq levels on the sparc */ - printk(KERN_ERR "Trying to free bogus IRQ %d\n", irq); - return; - } + unsigned int pil; - spin_lock_irqsave(&irq_action_lock, flags); + BUG_ON(irq >= NR_IRQS); - actionp = &sparc_irq[cpu_irq].action; - action = *actionp; + spin_lock_irqsave(&irq_map_lock, flags); - if (!action->handler) { - printk(KERN_ERR "Trying to free free IRQ%d\n", irq); - goto out_unlock; - } - if (dev_id) { - for (; action; action = action->next) { - if (action->dev_id == dev_id) - break; - actionp = &action->next; - } - if (!action) { - printk(KERN_ERR "Trying to free free shared IRQ%d\n", - irq); - goto out_unlock; - } - } else if (action->flags & IRQF_SHARED) { - printk(KERN_ERR "Trying to free shared IRQ%d with NULL device ID\n", - irq); - goto out_unlock; - } - if (action->flags & SA_STATIC_ALLOC) { - /* - * This interrupt is marked as specially allocated - * so it is a bad idea to free it. - */ - printk(KERN_ERR "Attempt to free statically allocated IRQ%d (%s)\n", - irq, action->name); - goto out_unlock; - } - - *actionp = action->next; + p = &irq_table[irq]; + pil = p->pil; + BUG_ON(pil > SUN4D_MAX_IRQ); + p->next = irq_map[pil]; + irq_map[pil] = p; - spin_unlock_irqrestore(&irq_action_lock, flags); + spin_unlock_irqrestore(&irq_map_lock, flags); +} - synchronize_irq(irq); +void irq_unlink(unsigned int irq) +{ + struct irq_bucket *p, **pnext; + unsigned long flags; - spin_lock_irqsave(&irq_action_lock, flags); + BUG_ON(irq >= NR_IRQS); - kfree(action); + spin_lock_irqsave(&irq_map_lock, flags); - if (!sparc_irq[cpu_irq].action) - __disable_irq(irq); + p = &irq_table[irq]; + BUG_ON(p->pil > SUN4D_MAX_IRQ); + pnext = &irq_map[p->pil]; + while (*pnext != p) + pnext = &(*pnext)->next; + *pnext = p->next; -out_unlock: - spin_unlock_irqrestore(&irq_action_lock, flags); + spin_unlock_irqrestore(&irq_map_lock, flags); } -EXPORT_SYMBOL(free_irq); - -/* - * This is called when we want to synchronize with - * interrupts. We may for example tell a device to - * stop sending interrupts: but to make sure there - * are no interrupts that are executing on another - * CPU we need to call this function. - */ -#ifdef CONFIG_SMP -void synchronize_irq(unsigned int irq) -{ - unsigned int cpu_irq; - cpu_irq = irq & (NR_IRQS - 1); - while (sparc_irq[cpu_irq].flags & SPARC_IRQ_INPROGRESS) - cpu_relax(); -} -EXPORT_SYMBOL(synchronize_irq); -#endif /* SMP */ -void unexpected_irq(int irq, void *dev_id, struct pt_regs *regs) +/* /proc/interrupts printing */ +int arch_show_interrupts(struct seq_file *p, int prec) { - int i; - struct irqaction *action; - unsigned int cpu_irq; + int j; - cpu_irq = irq & (NR_IRQS - 1); - action = sparc_irq[cpu_irq].action; - - printk(KERN_ERR "IO device interrupt, irq = %d\n", irq); - printk(KERN_ERR "PC = %08lx NPC = %08lx FP=%08lx\n", regs->pc, - regs->npc, regs->u_regs[14]); - if (action) { - printk(KERN_ERR "Expecting: "); - for (i = 0; i < 16; i++) - if (action->handler) - printk(KERN_CONT "[%s:%d:0x%x] ", action->name, - i, (unsigned int)action->handler); - } - printk(KERN_ERR "AIEEE\n"); - panic("bogus interrupt received"); +#ifdef CONFIG_SMP + seq_printf(p, "RES: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", cpu_data(j).irq_resched_count); + seq_printf(p, " IPI rescheduling interrupts\n"); + seq_printf(p, "CAL: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", cpu_data(j).irq_call_count); + seq_printf(p, " IPI function call interrupts\n"); +#endif + seq_printf(p, "NMI: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", cpu_data(j).counter); + seq_printf(p, " Non-maskable interrupts\n"); + return 0; } -void handler_irq(int pil, struct pt_regs *regs) +void handler_irq(unsigned int pil, struct pt_regs *regs) { struct pt_regs *old_regs; - struct irqaction *action; - int cpu = smp_processor_id(); + struct irq_bucket *p; + BUG_ON(pil > 15); old_regs = set_irq_regs(regs); irq_enter(); - disable_pil_irq(pil); -#ifdef CONFIG_SMP - /* Only rotate on lower priority IRQs (scsi, ethernet, etc.). */ - if ((sparc_cpu_model==sun4m) && (pil < 10)) - smp4m_irq_rotate(cpu); -#endif - action = sparc_irq[pil].action; - sparc_irq[pil].flags |= SPARC_IRQ_INPROGRESS; - kstat_cpu(cpu).irqs[pil]++; - do { - if (!action || !action->handler) - unexpected_irq(pil, NULL, regs); - action->handler(pil, action->dev_id); - action = action->next; - } while (action); - sparc_irq[pil].flags &= ~SPARC_IRQ_INPROGRESS; - enable_pil_irq(pil); + + p = irq_map[pil]; + while (p) { + struct irq_bucket *next = p->next; + + generic_handle_irq(p->irq); + p = next; + } irq_exit(); set_irq_regs(old_regs); } #if defined(CONFIG_BLK_DEV_FD) || defined(CONFIG_BLK_DEV_FD_MODULE) +static unsigned int floppy_irq; -/* - * Fast IRQs on the Sparc can only have one routine attached to them, - * thus no sharing possible. - */ -static int request_fast_irq(unsigned int irq, - void (*handler)(void), - unsigned long irqflags, const char *devname) +int sparc_floppy_request_irq(unsigned int irq, irq_handler_t irq_handler) { - struct irqaction *action; - unsigned long flags; unsigned int cpu_irq; - int ret; + int err; + #if defined CONFIG_SMP && !defined CONFIG_SPARC_LEON struct tt_entry *trap_table; #endif - cpu_irq = irq & (NR_IRQS - 1); - if (cpu_irq > 14) { - ret = -EINVAL; - goto out; - } - if (!handler) { - ret = -EINVAL; - goto out; - } - spin_lock_irqsave(&irq_action_lock, flags); + err = request_irq(irq, irq_handler, 0, "floppy", NULL); + if (err) + return -1; - action = sparc_irq[cpu_irq].action; - if (action) { - if (action->flags & IRQF_SHARED) - panic("Trying to register fast irq when already shared.\n"); - if (irqflags & IRQF_SHARED) - panic("Trying to register fast irq as shared.\n"); + /* Save for later use in floppy interrupt handler */ + floppy_irq = irq; - /* Anyway, someone already owns it so cannot be made fast. */ - printk(KERN_ERR "request_fast_irq: Trying to register yet already owned.\n"); - ret = -EBUSY; - goto out_unlock; - } - - /* - * If this is flagged as statically allocated then we use our - * private struct which is never freed. - */ - if (irqflags & SA_STATIC_ALLOC) { - if (static_irq_count < MAX_STATIC_ALLOC) - action = &static_irqaction[static_irq_count++]; - else - printk(KERN_ERR "Fast IRQ%d (%s) SA_STATIC_ALLOC failed using kmalloc\n", - irq, devname); - } - - if (action == NULL) - action = kmalloc(sizeof(struct irqaction), GFP_ATOMIC); - if (!action) { - ret = -ENOMEM; - goto out_unlock; - } + cpu_irq = (irq & (NR_IRQS - 1)); /* Dork with trap table if we get this far. */ #define INSTANTIATE(table) \ table[SP_TRAP_IRQ1+(cpu_irq-1)].inst_one = SPARC_RD_PSR_L0; \ table[SP_TRAP_IRQ1+(cpu_irq-1)].inst_two = \ - SPARC_BRANCH((unsigned long) handler, \ + SPARC_BRANCH((unsigned long) floppy_hardint, \ (unsigned long) &table[SP_TRAP_IRQ1+(cpu_irq-1)].inst_two);\ table[SP_TRAP_IRQ1+(cpu_irq-1)].inst_three = SPARC_RD_WIM_L3; \ table[SP_TRAP_IRQ1+(cpu_irq-1)].inst_four = SPARC_NOP; @@ -399,22 +289,9 @@ static int request_fast_irq(unsigned int irq, * writing we have no CPU-neutral interface to fine-grained flushes. */ flush_cache_all(); - - action->flags = irqflags; - action->name = devname; - action->dev_id = NULL; - action->next = NULL; - - sparc_irq[cpu_irq].action = action; - - __enable_irq(irq); - - ret = 0; -out_unlock: - spin_unlock_irqrestore(&irq_action_lock, flags); -out: - return ret; + return 0; } +EXPORT_SYMBOL(sparc_floppy_request_irq); /* * These variables are used to access state from the assembler @@ -440,154 +317,23 @@ EXPORT_SYMBOL(pdma_base); unsigned long pdma_areasize; EXPORT_SYMBOL(pdma_areasize); -static irq_handler_t floppy_irq_handler; - +/* Use the generic irq support to call floppy_interrupt + * which was setup using request_irq() in sparc_floppy_request_irq(). + * We only have one floppy interrupt so we do not need to check + * for additional handlers being wired up by irq_link() + */ void sparc_floppy_irq(int irq, void *dev_id, struct pt_regs *regs) { struct pt_regs *old_regs; - int cpu = smp_processor_id(); old_regs = set_irq_regs(regs); - disable_pil_irq(irq); irq_enter(); - kstat_cpu(cpu).irqs[irq]++; - floppy_irq_handler(irq, dev_id); + generic_handle_irq(floppy_irq); irq_exit(); - enable_pil_irq(irq); set_irq_regs(old_regs); - /* - * XXX Eek, it's totally changed with preempt_count() and such - * if (softirq_pending(cpu)) - * do_softirq(); - */ -} - -int sparc_floppy_request_irq(int irq, unsigned long flags, - irq_handler_t irq_handler) -{ - floppy_irq_handler = irq_handler; - return request_fast_irq(irq, floppy_hardint, flags, "floppy"); } -EXPORT_SYMBOL(sparc_floppy_request_irq); - #endif -int request_irq(unsigned int irq, - irq_handler_t handler, - unsigned long irqflags, const char *devname, void *dev_id) -{ - struct irqaction *action, **actionp; - unsigned long flags; - unsigned int cpu_irq; - int ret; - - if (sparc_cpu_model == sun4d) - return sun4d_request_irq(irq, handler, irqflags, devname, dev_id); - - cpu_irq = irq & (NR_IRQS - 1); - if (cpu_irq > 14) { - ret = -EINVAL; - goto out; - } - if (!handler) { - ret = -EINVAL; - goto out; - } - - spin_lock_irqsave(&irq_action_lock, flags); - - actionp = &sparc_irq[cpu_irq].action; - action = *actionp; - if (action) { - if (!(action->flags & IRQF_SHARED) || !(irqflags & IRQF_SHARED)) { - ret = -EBUSY; - goto out_unlock; - } - if ((action->flags & IRQF_DISABLED) != (irqflags & IRQF_DISABLED)) { - printk(KERN_ERR "Attempt to mix fast and slow interrupts on IRQ%d denied\n", - irq); - ret = -EBUSY; - goto out_unlock; - } - for ( ; action; action = *actionp) - actionp = &action->next; - } - - /* If this is flagged as statically allocated then we use our - * private struct which is never freed. - */ - if (irqflags & SA_STATIC_ALLOC) { - if (static_irq_count < MAX_STATIC_ALLOC) - action = &static_irqaction[static_irq_count++]; - else - printk(KERN_ERR "Request for IRQ%d (%s) SA_STATIC_ALLOC failed using kmalloc\n", - irq, devname); - } - if (action == NULL) - action = kmalloc(sizeof(struct irqaction), GFP_ATOMIC); - if (!action) { - ret = -ENOMEM; - goto out_unlock; - } - - action->handler = handler; - action->flags = irqflags; - action->name = devname; - action->next = NULL; - action->dev_id = dev_id; - - *actionp = action; - - __enable_irq(irq); - - ret = 0; -out_unlock: - spin_unlock_irqrestore(&irq_action_lock, flags); -out: - return ret; -} -EXPORT_SYMBOL(request_irq); - -void disable_irq_nosync(unsigned int irq) -{ - __disable_irq(irq); -} -EXPORT_SYMBOL(disable_irq_nosync); - -void disable_irq(unsigned int irq) -{ - __disable_irq(irq); -} -EXPORT_SYMBOL(disable_irq); - -void enable_irq(unsigned int irq) -{ - __enable_irq(irq); -} -EXPORT_SYMBOL(enable_irq); - -/* - * We really don't need these at all on the Sparc. We only have - * stubs here because they are exported to modules. - */ -unsigned long probe_irq_on(void) -{ - return 0; -} -EXPORT_SYMBOL(probe_irq_on); - -int probe_irq_off(unsigned long mask) -{ - return 0; -} -EXPORT_SYMBOL(probe_irq_off); - -static unsigned int build_device_irq(struct platform_device *op, - unsigned int real_irq) -{ - return real_irq; -} - /* djhr * This could probably be made indirect too and assigned in the CPU * bits of the code. That would be much nicer I think and would also @@ -598,8 +344,6 @@ static unsigned int build_device_irq(struct platform_device *op, void __init init_IRQ(void) { - sparc_irq_config.build_device_irq = build_device_irq; - switch (sparc_cpu_model) { case sun4c: case sun4: @@ -607,14 +351,11 @@ void __init init_IRQ(void) break; case sun4m: -#ifdef CONFIG_PCI pcic_probe(); - if (pcic_present()) { + if (pcic_present()) sun4m_pci_init_IRQ(); - break; - } -#endif - sun4m_init_IRQ(); + else + sun4m_init_IRQ(); break; case sun4d: @@ -632,9 +373,3 @@ void __init init_IRQ(void) btfixup(); } -#ifdef CONFIG_PROC_FS -void init_irq_proc(void) -{ - /* For now, nothing... */ -} -#endif /* CONFIG_PROC_FS */ diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c index b1d275ce343..4e78862d12f 100644 --- a/arch/sparc/kernel/irq_64.c +++ b/arch/sparc/kernel/irq_64.c @@ -224,13 +224,13 @@ static int irq_choose_cpu(unsigned int irq, const struct cpumask *affinity) int cpuid; cpumask_copy(&mask, affinity); - if (cpus_equal(mask, cpu_online_map)) { + if (cpumask_equal(&mask, cpu_online_mask)) { cpuid = map_to_cpu(irq); } else { cpumask_t tmp; - cpus_and(tmp, cpu_online_map, mask); - cpuid = cpus_empty(tmp) ? map_to_cpu(irq) : first_cpu(tmp); + cpumask_and(&tmp, cpu_online_mask, &mask); + cpuid = cpumask_empty(&tmp) ? map_to_cpu(irq) : cpumask_first(&tmp); } return cpuid; diff --git a/arch/sparc/kernel/kernel.h b/arch/sparc/kernel/kernel.h index 24ad449886b..6f6544cfa0e 100644 --- a/arch/sparc/kernel/kernel.h +++ b/arch/sparc/kernel/kernel.h @@ -6,11 +6,9 @@ #include <asm/traps.h> /* cpu.c */ -extern const char *sparc_cpu_type; extern const char *sparc_pmu_type; -extern const char *sparc_fpu_type; - extern unsigned int fsr_storage; +extern int ncpus_probed; #ifdef CONFIG_SPARC32 /* cpu.c */ @@ -37,6 +35,7 @@ extern void sun4c_init_IRQ(void); extern unsigned int lvl14_resolution; extern void sun4m_init_IRQ(void); +extern void sun4m_unmask_profile_irq(void); extern void sun4m_clear_profile_irq(int cpu); /* sun4d_irq.c */ diff --git a/arch/sparc/kernel/leon_kernel.c b/arch/sparc/kernel/leon_kernel.c index 2969f777fa1..2f538ac2e13 100644 --- a/arch/sparc/kernel/leon_kernel.c +++ b/arch/sparc/kernel/leon_kernel.c @@ -19,53 +19,70 @@ #include <asm/leon_amba.h> #include <asm/traps.h> #include <asm/cacheflush.h> +#include <asm/smp.h> +#include <asm/setup.h> #include "prom.h" #include "irq.h" struct leon3_irqctrl_regs_map *leon3_irqctrl_regs; /* interrupt controller base address */ struct leon3_gptimer_regs_map *leon3_gptimer_regs; /* timer controller base address */ -struct amba_apb_device leon_percpu_timer_dev[16]; int leondebug_irq_disable; int leon_debug_irqout; static int dummy_master_l10_counter; unsigned long amba_system_id; +static DEFINE_SPINLOCK(leon_irq_lock); unsigned long leon3_gptimer_irq; /* interrupt controller irq number */ unsigned long leon3_gptimer_idx; /* Timer Index (0..6) within Timer Core */ +int leon3_ticker_irq; /* Timer ticker IRQ */ unsigned int sparc_leon_eirq; -#define LEON_IMASK ((&leon3_irqctrl_regs->mask[0])) +#define LEON_IMASK(cpu) (&leon3_irqctrl_regs->mask[cpu]) +#define LEON_IACK (&leon3_irqctrl_regs->iclear) +#define LEON_DO_ACK_HW 1 -/* Return the IRQ of the pending IRQ on the extended IRQ controller */ -int sparc_leon_eirq_get(int eirq, int cpu) +/* Return the last ACKed IRQ by the Extended IRQ controller. It has already + * been (automatically) ACKed when the CPU takes the trap. + */ +static inline unsigned int leon_eirq_get(int cpu) { return LEON3_BYPASS_LOAD_PA(&leon3_irqctrl_regs->intid[cpu]) & 0x1f; } -irqreturn_t sparc_leon_eirq_isr(int dummy, void *dev_id) +/* Handle one or multiple IRQs from the extended interrupt controller */ +static void leon_handle_ext_irq(unsigned int irq, struct irq_desc *desc) { - printk(KERN_ERR "sparc_leon_eirq_isr: ERROR EXTENDED IRQ\n"); - return IRQ_HANDLED; + unsigned int eirq; + int cpu = sparc_leon3_cpuid(); + + eirq = leon_eirq_get(cpu); + if ((eirq & 0x10) && irq_map[eirq]->irq) /* bit4 tells if IRQ happened */ + generic_handle_irq(irq_map[eirq]->irq); } /* The extended IRQ controller has been found, this function registers it */ -void sparc_leon_eirq_register(int eirq) +void leon_eirq_setup(unsigned int eirq) { - int irq; + unsigned long mask, oldmask; + unsigned int veirq; - /* Register a "BAD" handler for this interrupt, it should never happen */ - irq = request_irq(eirq, sparc_leon_eirq_isr, - (IRQF_DISABLED | SA_STATIC_ALLOC), "extirq", NULL); - - if (irq) { - printk(KERN_ERR - "sparc_leon_eirq_register: unable to attach IRQ%d\n", - eirq); - } else { - sparc_leon_eirq = eirq; + if (eirq < 1 || eirq > 0xf) { + printk(KERN_ERR "LEON EXT IRQ NUMBER BAD: %d\n", eirq); + return; } + veirq = leon_build_device_irq(eirq, leon_handle_ext_irq, "extirq", 0); + + /* + * Unmask the Extended IRQ, the IRQs routed through the Ext-IRQ + * controller have a mask-bit of their own, so this is safe. + */ + irq_link(veirq); + mask = 1 << eirq; + oldmask = LEON3_BYPASS_LOAD_PA(LEON_IMASK(boot_cpu_id)); + LEON3_BYPASS_STORE_PA(LEON_IMASK(boot_cpu_id), (oldmask | mask)); + sparc_leon_eirq = eirq; } static inline unsigned long get_irqmask(unsigned int irq) @@ -83,35 +100,151 @@ static inline unsigned long get_irqmask(unsigned int irq) return mask; } -static void leon_enable_irq(unsigned int irq_nr) +#ifdef CONFIG_SMP +static int irq_choose_cpu(const struct cpumask *affinity) { - unsigned long mask, flags; - mask = get_irqmask(irq_nr); - local_irq_save(flags); - LEON3_BYPASS_STORE_PA(LEON_IMASK, - (LEON3_BYPASS_LOAD_PA(LEON_IMASK) | (mask))); - local_irq_restore(flags); + cpumask_t mask; + + cpus_and(mask, cpu_online_map, *affinity); + if (cpus_equal(mask, cpu_online_map) || cpus_empty(mask)) + return boot_cpu_id; + else + return first_cpu(mask); } +#else +#define irq_choose_cpu(affinity) boot_cpu_id +#endif -static void leon_disable_irq(unsigned int irq_nr) +static int leon_set_affinity(struct irq_data *data, const struct cpumask *dest, + bool force) { - unsigned long mask, flags; - mask = get_irqmask(irq_nr); - local_irq_save(flags); - LEON3_BYPASS_STORE_PA(LEON_IMASK, - (LEON3_BYPASS_LOAD_PA(LEON_IMASK) & ~(mask))); - local_irq_restore(flags); + unsigned long mask, oldmask, flags; + int oldcpu, newcpu; + + mask = (unsigned long)data->chip_data; + oldcpu = irq_choose_cpu(data->affinity); + newcpu = irq_choose_cpu(dest); + + if (oldcpu == newcpu) + goto out; + + /* unmask on old CPU first before enabling on the selected CPU */ + spin_lock_irqsave(&leon_irq_lock, flags); + oldmask = LEON3_BYPASS_LOAD_PA(LEON_IMASK(oldcpu)); + LEON3_BYPASS_STORE_PA(LEON_IMASK(oldcpu), (oldmask & ~mask)); + oldmask = LEON3_BYPASS_LOAD_PA(LEON_IMASK(newcpu)); + LEON3_BYPASS_STORE_PA(LEON_IMASK(newcpu), (oldmask | mask)); + spin_unlock_irqrestore(&leon_irq_lock, flags); +out: + return IRQ_SET_MASK_OK; +} + +static void leon_unmask_irq(struct irq_data *data) +{ + unsigned long mask, oldmask, flags; + int cpu; + + mask = (unsigned long)data->chip_data; + cpu = irq_choose_cpu(data->affinity); + spin_lock_irqsave(&leon_irq_lock, flags); + oldmask = LEON3_BYPASS_LOAD_PA(LEON_IMASK(cpu)); + LEON3_BYPASS_STORE_PA(LEON_IMASK(cpu), (oldmask | mask)); + spin_unlock_irqrestore(&leon_irq_lock, flags); +} + +static void leon_mask_irq(struct irq_data *data) +{ + unsigned long mask, oldmask, flags; + int cpu; + + mask = (unsigned long)data->chip_data; + cpu = irq_choose_cpu(data->affinity); + spin_lock_irqsave(&leon_irq_lock, flags); + oldmask = LEON3_BYPASS_LOAD_PA(LEON_IMASK(cpu)); + LEON3_BYPASS_STORE_PA(LEON_IMASK(cpu), (oldmask & ~mask)); + spin_unlock_irqrestore(&leon_irq_lock, flags); +} + +static unsigned int leon_startup_irq(struct irq_data *data) +{ + irq_link(data->irq); + leon_unmask_irq(data); + return 0; +} +static void leon_shutdown_irq(struct irq_data *data) +{ + leon_mask_irq(data); + irq_unlink(data->irq); +} + +/* Used by external level sensitive IRQ handlers on the LEON: ACK IRQ ctrl */ +static void leon_eoi_irq(struct irq_data *data) +{ + unsigned long mask = (unsigned long)data->chip_data; + + if (mask & LEON_DO_ACK_HW) + LEON3_BYPASS_STORE_PA(LEON_IACK, mask & ~LEON_DO_ACK_HW); +} + +static struct irq_chip leon_irq = { + .name = "leon", + .irq_startup = leon_startup_irq, + .irq_shutdown = leon_shutdown_irq, + .irq_mask = leon_mask_irq, + .irq_unmask = leon_unmask_irq, + .irq_eoi = leon_eoi_irq, + .irq_set_affinity = leon_set_affinity, +}; + +/* + * Build a LEON IRQ for the edge triggered LEON IRQ controller: + * Edge (normal) IRQ - handle_simple_irq, ack=DONT-CARE, never ack + * Level IRQ (PCI|Level-GPIO) - handle_fasteoi_irq, ack=1, ack after ISR + * Per-CPU Edge - handle_percpu_irq, ack=0 + */ +unsigned int leon_build_device_irq(unsigned int real_irq, + irq_flow_handler_t flow_handler, + const char *name, int do_ack) +{ + unsigned int irq; + unsigned long mask; + + irq = 0; + mask = get_irqmask(real_irq); + if (mask == 0) + goto out; + + irq = irq_alloc(real_irq, real_irq); + if (irq == 0) + goto out; + + if (do_ack) + mask |= LEON_DO_ACK_HW; + + irq_set_chip_and_handler_name(irq, &leon_irq, + flow_handler, name); + irq_set_chip_data(irq, (void *)mask); + +out: + return irq; +} + +static unsigned int _leon_build_device_irq(struct platform_device *op, + unsigned int real_irq) +{ + return leon_build_device_irq(real_irq, handle_simple_irq, "edge", 0); } void __init leon_init_timers(irq_handler_t counter_fn) { - int irq; + int irq, eirq; struct device_node *rootnp, *np, *nnp; struct property *pp; int len; - int cpu, icsel; + int icsel; int ampopts; + int err; leondebug_irq_disable = 0; leon_debug_irqout = 0; @@ -173,98 +306,85 @@ void __init leon_init_timers(irq_handler_t counter_fn) leon3_gptimer_irq = *(unsigned int *)pp->value; } while (0); - if (leon3_gptimer_regs && leon3_irqctrl_regs && leon3_gptimer_irq) { - LEON3_BYPASS_STORE_PA( - &leon3_gptimer_regs->e[leon3_gptimer_idx].val, 0); - LEON3_BYPASS_STORE_PA( - &leon3_gptimer_regs->e[leon3_gptimer_idx].rld, - (((1000000 / HZ) - 1))); - LEON3_BYPASS_STORE_PA( + if (!(leon3_gptimer_regs && leon3_irqctrl_regs && leon3_gptimer_irq)) + goto bad; + + LEON3_BYPASS_STORE_PA(&leon3_gptimer_regs->e[leon3_gptimer_idx].val, 0); + LEON3_BYPASS_STORE_PA(&leon3_gptimer_regs->e[leon3_gptimer_idx].rld, + (((1000000 / HZ) - 1))); + LEON3_BYPASS_STORE_PA( &leon3_gptimer_regs->e[leon3_gptimer_idx].ctrl, 0); #ifdef CONFIG_SMP - leon_percpu_timer_dev[0].start = (int)leon3_gptimer_regs; - leon_percpu_timer_dev[0].irq = leon3_gptimer_irq + 1 + - leon3_gptimer_idx; - - if (!(LEON3_BYPASS_LOAD_PA(&leon3_gptimer_regs->config) & - (1<<LEON3_GPTIMER_SEPIRQ))) { - prom_printf("irq timer not configured with separate irqs\n"); - BUG(); - } + leon3_ticker_irq = leon3_gptimer_irq + 1 + leon3_gptimer_idx; - LEON3_BYPASS_STORE_PA( - &leon3_gptimer_regs->e[leon3_gptimer_idx+1].val, 0); - LEON3_BYPASS_STORE_PA( - &leon3_gptimer_regs->e[leon3_gptimer_idx+1].rld, - (((1000000/HZ) - 1))); - LEON3_BYPASS_STORE_PA( - &leon3_gptimer_regs->e[leon3_gptimer_idx+1].ctrl, 0); -# endif - - /* - * The IRQ controller may (if implemented) consist of multiple - * IRQ controllers, each mapped on a 4Kb boundary. - * Each CPU may be routed to different IRQCTRLs, however - * we assume that all CPUs (in SMP system) is routed to the - * same IRQ Controller, and for non-SMP only one IRQCTRL is - * accessed anyway. - * In AMP systems, Linux must run on CPU0 for the time being. - */ - cpu = sparc_leon3_cpuid(); - icsel = LEON3_BYPASS_LOAD_PA(&leon3_irqctrl_regs->icsel[cpu/8]); - icsel = (icsel >> ((7 - (cpu&0x7)) * 4)) & 0xf; - leon3_irqctrl_regs += icsel; - } else { - goto bad; + if (!(LEON3_BYPASS_LOAD_PA(&leon3_gptimer_regs->config) & + (1<<LEON3_GPTIMER_SEPIRQ))) { + printk(KERN_ERR "timer not configured with separate irqs\n"); + BUG(); } - irq = request_irq(leon3_gptimer_irq+leon3_gptimer_idx, - counter_fn, - (IRQF_DISABLED | SA_STATIC_ALLOC), "timer", NULL); + LEON3_BYPASS_STORE_PA(&leon3_gptimer_regs->e[leon3_gptimer_idx+1].val, + 0); + LEON3_BYPASS_STORE_PA(&leon3_gptimer_regs->e[leon3_gptimer_idx+1].rld, + (((1000000/HZ) - 1))); + LEON3_BYPASS_STORE_PA(&leon3_gptimer_regs->e[leon3_gptimer_idx+1].ctrl, + 0); +#endif - if (irq) { - printk(KERN_ERR "leon_time_init: unable to attach IRQ%d\n", - LEON_INTERRUPT_TIMER1); + /* + * The IRQ controller may (if implemented) consist of multiple + * IRQ controllers, each mapped on a 4Kb boundary. + * Each CPU may be routed to different IRQCTRLs, however + * we assume that all CPUs (in SMP system) is routed to the + * same IRQ Controller, and for non-SMP only one IRQCTRL is + * accessed anyway. + * In AMP systems, Linux must run on CPU0 for the time being. + */ + icsel = LEON3_BYPASS_LOAD_PA(&leon3_irqctrl_regs->icsel[boot_cpu_id/8]); + icsel = (icsel >> ((7 - (boot_cpu_id&0x7)) * 4)) & 0xf; + leon3_irqctrl_regs += icsel; + + /* Mask all IRQs on boot-cpu IRQ controller */ + LEON3_BYPASS_STORE_PA(&leon3_irqctrl_regs->mask[boot_cpu_id], 0); + + /* Probe extended IRQ controller */ + eirq = (LEON3_BYPASS_LOAD_PA(&leon3_irqctrl_regs->mpstatus) + >> 16) & 0xf; + if (eirq != 0) + leon_eirq_setup(eirq); + + irq = _leon_build_device_irq(NULL, leon3_gptimer_irq+leon3_gptimer_idx); + err = request_irq(irq, counter_fn, IRQF_TIMER, "timer", NULL); + if (err) { + printk(KERN_ERR "unable to attach timer IRQ%d\n", irq); prom_halt(); } -# ifdef CONFIG_SMP - { - unsigned long flags; - struct tt_entry *trap_table = &sparc_ttable[SP_TRAP_IRQ1 + (leon_percpu_timer_dev[0].irq - 1)]; - - /* For SMP we use the level 14 ticker, however the bootup code - * has copied the firmwares level 14 vector into boot cpu's - * trap table, we must fix this now or we get squashed. - */ - local_irq_save(flags); - - patchme_maybe_smp_msg[0] = 0x01000000; /* NOP out the branch */ - - /* Adjust so that we jump directly to smpleon_ticker */ - trap_table->inst_three += smpleon_ticker - real_irq_entry; + LEON3_BYPASS_STORE_PA(&leon3_gptimer_regs->e[leon3_gptimer_idx].ctrl, + LEON3_GPTIMER_EN | + LEON3_GPTIMER_RL | + LEON3_GPTIMER_LD | + LEON3_GPTIMER_IRQEN); - local_flush_cache_all(); - local_irq_restore(flags); +#ifdef CONFIG_SMP + /* Install per-cpu IRQ handler for broadcasted ticker */ + irq = leon_build_device_irq(leon3_ticker_irq, handle_percpu_irq, + "per-cpu", 0); + err = request_irq(irq, leon_percpu_timer_interrupt, + IRQF_PERCPU | IRQF_TIMER, "ticker", + NULL); + if (err) { + printk(KERN_ERR "unable to attach ticker IRQ%d\n", irq); + prom_halt(); } -# endif - - if (leon3_gptimer_regs) { - LEON3_BYPASS_STORE_PA(&leon3_gptimer_regs->e[leon3_gptimer_idx].ctrl, - LEON3_GPTIMER_EN | - LEON3_GPTIMER_RL | - LEON3_GPTIMER_LD | LEON3_GPTIMER_IRQEN); -#ifdef CONFIG_SMP - LEON3_BYPASS_STORE_PA(&leon3_gptimer_regs->e[leon3_gptimer_idx+1].ctrl, - LEON3_GPTIMER_EN | - LEON3_GPTIMER_RL | - LEON3_GPTIMER_LD | - LEON3_GPTIMER_IRQEN); + LEON3_BYPASS_STORE_PA(&leon3_gptimer_regs->e[leon3_gptimer_idx+1].ctrl, + LEON3_GPTIMER_EN | + LEON3_GPTIMER_RL | + LEON3_GPTIMER_LD | + LEON3_GPTIMER_IRQEN); #endif - - } return; bad: printk(KERN_ERR "No Timer/irqctrl found\n"); @@ -281,9 +401,6 @@ void leon_load_profile_irq(int cpu, unsigned int limit) BUG(); } - - - void __init leon_trans_init(struct device_node *dp) { if (strcmp(dp->type, "cpu") == 0 && strcmp(dp->name, "<NULL>") == 0) { @@ -337,22 +454,18 @@ void leon_enable_irq_cpu(unsigned int irq_nr, unsigned int cpu) { unsigned long mask, flags, *addr; mask = get_irqmask(irq_nr); - local_irq_save(flags); - addr = (unsigned long *)&(leon3_irqctrl_regs->mask[cpu]); - LEON3_BYPASS_STORE_PA(addr, (LEON3_BYPASS_LOAD_PA(addr) | (mask))); - local_irq_restore(flags); + spin_lock_irqsave(&leon_irq_lock, flags); + addr = (unsigned long *)LEON_IMASK(cpu); + LEON3_BYPASS_STORE_PA(addr, (LEON3_BYPASS_LOAD_PA(addr) | mask)); + spin_unlock_irqrestore(&leon_irq_lock, flags); } #endif void __init leon_init_IRQ(void) { - sparc_irq_config.init_timers = leon_init_timers; - - BTFIXUPSET_CALL(enable_irq, leon_enable_irq, BTFIXUPCALL_NORM); - BTFIXUPSET_CALL(disable_irq, leon_disable_irq, BTFIXUPCALL_NORM); - BTFIXUPSET_CALL(enable_pil_irq, leon_enable_irq, BTFIXUPCALL_NORM); - BTFIXUPSET_CALL(disable_pil_irq, leon_disable_irq, BTFIXUPCALL_NORM); + sparc_irq_config.init_timers = leon_init_timers; + sparc_irq_config.build_device_irq = _leon_build_device_irq; BTFIXUPSET_CALL(clear_clock_irq, leon_clear_clock_irq, BTFIXUPCALL_NORM); diff --git a/arch/sparc/kernel/leon_smp.c b/arch/sparc/kernel/leon_smp.c index 8f5de4aa3c0..fe8fb44c609 100644 --- a/arch/sparc/kernel/leon_smp.c +++ b/arch/sparc/kernel/leon_smp.c @@ -14,6 +14,7 @@ #include <linux/smp.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> +#include <linux/of.h> #include <linux/init.h> #include <linux/spinlock.h> #include <linux/mm.h> @@ -29,6 +30,7 @@ #include <asm/ptrace.h> #include <asm/atomic.h> #include <asm/irq_regs.h> +#include <asm/traps.h> #include <asm/delay.h> #include <asm/irq.h> @@ -50,9 +52,12 @@ extern ctxd_t *srmmu_ctx_table_phys; static int smp_processors_ready; extern volatile unsigned long cpu_callin_map[NR_CPUS]; -extern unsigned char boot_cpu_id; extern cpumask_t smp_commenced_mask; void __init leon_configure_cache_smp(void); +static void leon_ipi_init(void); + +/* IRQ number of LEON IPIs */ +int leon_ipi_irq = LEON3_IRQ_IPI_DEFAULT; static inline unsigned long do_swap(volatile unsigned long *ptr, unsigned long val) @@ -94,8 +99,6 @@ void __cpuinit leon_callin(void) local_flush_cache_all(); local_flush_tlb_all(); - cpu_probe(); - /* Fix idle thread fields. */ __asm__ __volatile__("ld [%0], %%g6\n\t" : : "r"(¤t_set[cpuid]) : "memory" /* paranoid */); @@ -104,11 +107,11 @@ void __cpuinit leon_callin(void) atomic_inc(&init_mm.mm_count); current->active_mm = &init_mm; - while (!cpu_isset(cpuid, smp_commenced_mask)) + while (!cpumask_test_cpu(cpuid, &smp_commenced_mask)) mb(); local_irq_enable(); - cpu_set(cpuid, cpu_online_map); + set_cpu_online(cpuid, true); } /* @@ -179,13 +182,16 @@ void __init leon_boot_cpus(void) int nrcpu = leon_smp_nrcpus(); int me = smp_processor_id(); + /* Setup IPI */ + leon_ipi_init(); + printk(KERN_INFO "%d:(%d:%d) cpus mpirq at 0x%x\n", (unsigned int)me, (unsigned int)nrcpu, (unsigned int)NR_CPUS, (unsigned int)&(leon3_irqctrl_regs->mpstatus)); leon_enable_irq_cpu(LEON3_IRQ_CROSS_CALL, me); leon_enable_irq_cpu(LEON3_IRQ_TICKER, me); - leon_enable_irq_cpu(LEON3_IRQ_RESCHEDULE, me); + leon_enable_irq_cpu(leon_ipi_irq, me); leon_smp_setbroadcast(1 << LEON3_IRQ_TICKER); @@ -220,6 +226,10 @@ int __cpuinit leon_boot_one_cpu(int i) (unsigned int)&leon3_irqctrl_regs->mpstatus); local_flush_cache_all(); + /* Make sure all IRQs are of from the start for this new CPU */ + LEON_BYPASS_STORE_PA(&leon3_irqctrl_regs->mask[i], 0); + + /* Wake one CPU */ LEON_BYPASS_STORE_PA(&(leon3_irqctrl_regs->mpstatus), 1 << i); /* wheee... it's going... */ @@ -236,7 +246,7 @@ int __cpuinit leon_boot_one_cpu(int i) } else { leon_enable_irq_cpu(LEON3_IRQ_CROSS_CALL, i); leon_enable_irq_cpu(LEON3_IRQ_TICKER, i); - leon_enable_irq_cpu(LEON3_IRQ_RESCHEDULE, i); + leon_enable_irq_cpu(leon_ipi_irq, i); } local_flush_cache_all(); @@ -262,21 +272,21 @@ void __init leon_smp_done(void) local_flush_cache_all(); /* Free unneeded trap tables */ - if (!cpu_isset(1, cpu_present_map)) { + if (!cpu_present(1)) { ClearPageReserved(virt_to_page(&trapbase_cpu1)); init_page_count(virt_to_page(&trapbase_cpu1)); free_page((unsigned long)&trapbase_cpu1); totalram_pages++; num_physpages++; } - if (!cpu_isset(2, cpu_present_map)) { + if (!cpu_present(2)) { ClearPageReserved(virt_to_page(&trapbase_cpu2)); init_page_count(virt_to_page(&trapbase_cpu2)); free_page((unsigned long)&trapbase_cpu2); totalram_pages++; num_physpages++; } - if (!cpu_isset(3, cpu_present_map)) { + if (!cpu_present(3)) { ClearPageReserved(virt_to_page(&trapbase_cpu3)); init_page_count(virt_to_page(&trapbase_cpu3)); free_page((unsigned long)&trapbase_cpu3); @@ -292,6 +302,99 @@ void leon_irq_rotate(int cpu) { } +struct leon_ipi_work { + int single; + int msk; + int resched; +}; + +static DEFINE_PER_CPU_SHARED_ALIGNED(struct leon_ipi_work, leon_ipi_work); + +/* Initialize IPIs on the LEON, in order to save IRQ resources only one IRQ + * is used for all three types of IPIs. + */ +static void __init leon_ipi_init(void) +{ + int cpu, len; + struct leon_ipi_work *work; + struct property *pp; + struct device_node *rootnp; + struct tt_entry *trap_table; + unsigned long flags; + + /* Find IPI IRQ or stick with default value */ + rootnp = of_find_node_by_path("/ambapp0"); + if (rootnp) { + pp = of_find_property(rootnp, "ipi_num", &len); + if (pp && (*(int *)pp->value)) + leon_ipi_irq = *(int *)pp->value; + } + printk(KERN_INFO "leon: SMP IPIs at IRQ %d\n", leon_ipi_irq); + + /* Adjust so that we jump directly to smpleon_ipi */ + local_irq_save(flags); + trap_table = &sparc_ttable[SP_TRAP_IRQ1 + (leon_ipi_irq - 1)]; + trap_table->inst_three += smpleon_ipi - real_irq_entry; + local_flush_cache_all(); + local_irq_restore(flags); + + for_each_possible_cpu(cpu) { + work = &per_cpu(leon_ipi_work, cpu); + work->single = work->msk = work->resched = 0; + } +} + +static void leon_ipi_single(int cpu) +{ + struct leon_ipi_work *work = &per_cpu(leon_ipi_work, cpu); + + /* Mark work */ + work->single = 1; + + /* Generate IRQ on the CPU */ + set_cpu_int(cpu, leon_ipi_irq); +} + +static void leon_ipi_mask_one(int cpu) +{ + struct leon_ipi_work *work = &per_cpu(leon_ipi_work, cpu); + + /* Mark work */ + work->msk = 1; + + /* Generate IRQ on the CPU */ + set_cpu_int(cpu, leon_ipi_irq); +} + +static void leon_ipi_resched(int cpu) +{ + struct leon_ipi_work *work = &per_cpu(leon_ipi_work, cpu); + + /* Mark work */ + work->resched = 1; + + /* Generate IRQ on the CPU (any IRQ will cause resched) */ + set_cpu_int(cpu, leon_ipi_irq); +} + +void leonsmp_ipi_interrupt(void) +{ + struct leon_ipi_work *work = &__get_cpu_var(leon_ipi_work); + + if (work->single) { + work->single = 0; + smp_call_function_single_interrupt(); + } + if (work->msk) { + work->msk = 0; + smp_call_function_interrupt(); + } + if (work->resched) { + work->resched = 0; + smp_resched_interrupt(); + } +} + static struct smp_funcall { smpfunc_t func; unsigned long arg1; @@ -337,10 +440,10 @@ static void leon_cross_call(smpfunc_t func, cpumask_t mask, unsigned long arg1, { register int i; - cpu_clear(smp_processor_id(), mask); - cpus_and(mask, cpu_online_map, mask); + cpumask_clear_cpu(smp_processor_id(), &mask); + cpumask_and(&mask, cpu_online_mask, &mask); for (i = 0; i <= high; i++) { - if (cpu_isset(i, mask)) { + if (cpumask_test_cpu(i, &mask)) { ccall_info.processors_in[i] = 0; ccall_info.processors_out[i] = 0; set_cpu_int(i, LEON3_IRQ_CROSS_CALL); @@ -354,7 +457,7 @@ static void leon_cross_call(smpfunc_t func, cpumask_t mask, unsigned long arg1, i = 0; do { - if (!cpu_isset(i, mask)) + if (!cpumask_test_cpu(i, &mask)) continue; while (!ccall_info.processors_in[i]) @@ -363,7 +466,7 @@ static void leon_cross_call(smpfunc_t func, cpumask_t mask, unsigned long arg1, i = 0; do { - if (!cpu_isset(i, mask)) + if (!cpumask_test_cpu(i, &mask)) continue; while (!ccall_info.processors_out[i]) @@ -386,27 +489,23 @@ void leon_cross_call_irq(void) ccall_info.processors_out[i] = 1; } -void leon_percpu_timer_interrupt(struct pt_regs *regs) +irqreturn_t leon_percpu_timer_interrupt(int irq, void *unused) { - struct pt_regs *old_regs; int cpu = smp_processor_id(); - old_regs = set_irq_regs(regs); - leon_clear_profile_irq(cpu); profile_tick(CPU_PROFILING); if (!--prof_counter(cpu)) { - int user = user_mode(regs); + int user = user_mode(get_irq_regs()); - irq_enter(); update_process_times(user); - irq_exit(); prof_counter(cpu) = prof_multiplier(cpu); } - set_irq_regs(old_regs); + + return IRQ_HANDLED; } static void __init smp_setup_percpu_timer(void) @@ -449,6 +548,9 @@ void __init leon_init_smp(void) BTFIXUPSET_CALL(smp_cross_call, leon_cross_call, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(__hard_smp_processor_id, __leon_processor_id, BTFIXUPCALL_NORM); + BTFIXUPSET_CALL(smp_ipi_resched, leon_ipi_resched, BTFIXUPCALL_NORM); + BTFIXUPSET_CALL(smp_ipi_single, leon_ipi_single, BTFIXUPCALL_NORM); + BTFIXUPSET_CALL(smp_ipi_mask_one, leon_ipi_mask_one, BTFIXUPCALL_NORM); } #endif /* CONFIG_SPARC_LEON */ diff --git a/arch/sparc/kernel/mdesc.c b/arch/sparc/kernel/mdesc.c index 56db06432ce..42f28c7420e 100644 --- a/arch/sparc/kernel/mdesc.c +++ b/arch/sparc/kernel/mdesc.c @@ -768,7 +768,7 @@ static void * __cpuinit mdesc_iterate_over_cpus(void *(*func)(struct mdesc_handl cpuid, NR_CPUS); continue; } - if (!cpu_isset(cpuid, *mask)) + if (!cpumask_test_cpu(cpuid, mask)) continue; #endif diff --git a/arch/sparc/kernel/of_device_64.c b/arch/sparc/kernel/of_device_64.c index 5c149689bb2..3bb2eace58c 100644 --- a/arch/sparc/kernel/of_device_64.c +++ b/arch/sparc/kernel/of_device_64.c @@ -622,8 +622,9 @@ static unsigned int __init build_one_device_irq(struct platform_device *op, out: nid = of_node_to_nid(dp); if (nid != -1) { - cpumask_t numa_mask = *cpumask_of_node(nid); + cpumask_t numa_mask; + cpumask_copy(&numa_mask, cpumask_of_node(nid)); irq_set_affinity(irq, &numa_mask); } diff --git a/arch/sparc/kernel/pci_msi.c b/arch/sparc/kernel/pci_msi.c index 30982e9ab62..580651af73f 100644 --- a/arch/sparc/kernel/pci_msi.c +++ b/arch/sparc/kernel/pci_msi.c @@ -284,8 +284,9 @@ static int bringup_one_msi_queue(struct pci_pbm_info *pbm, nid = pbm->numa_node; if (nid != -1) { - cpumask_t numa_mask = *cpumask_of_node(nid); + cpumask_t numa_mask; + cpumask_copy(&numa_mask, cpumask_of_node(nid)); irq_set_affinity(irq, &numa_mask); } err = request_irq(irq, sparc64_msiq_interrupt, 0, diff --git a/arch/sparc/kernel/pcic.c b/arch/sparc/kernel/pcic.c index 2cdc131b50a..948601a066f 100644 --- a/arch/sparc/kernel/pcic.c +++ b/arch/sparc/kernel/pcic.c @@ -164,6 +164,9 @@ void __iomem *pcic_regs; volatile int pcic_speculative; volatile int pcic_trapped; +/* forward */ +unsigned int pcic_build_device_irq(struct platform_device *op, + unsigned int real_irq); #define CONFIG_CMD(bus, device_fn, where) (0x80000000 | (((unsigned int)bus) << 16) | (((unsigned int)device_fn) << 8) | (where & ~3)) @@ -523,6 +526,7 @@ static void pcic_fill_irq(struct linux_pcic *pcic, struct pci_dev *dev, int node) { struct pcic_ca2irq *p; + unsigned int real_irq; int i, ivec; char namebuf[64]; @@ -551,26 +555,25 @@ pcic_fill_irq(struct linux_pcic *pcic, struct pci_dev *dev, int node) i = p->pin; if (i >= 0 && i < 4) { ivec = readw(pcic->pcic_regs+PCI_INT_SELECT_LO); - dev->irq = ivec >> (i << 2) & 0xF; + real_irq = ivec >> (i << 2) & 0xF; } else if (i >= 4 && i < 8) { ivec = readw(pcic->pcic_regs+PCI_INT_SELECT_HI); - dev->irq = ivec >> ((i-4) << 2) & 0xF; + real_irq = ivec >> ((i-4) << 2) & 0xF; } else { /* Corrupted map */ printk("PCIC: BAD PIN %d\n", i); for (;;) {} } /* P3 */ /* printk("PCIC: device %s pin %d ivec 0x%x irq %x\n", namebuf, i, ivec, dev->irq); */ - /* - * dev->irq=0 means PROM did not bother to program the upper + /* real_irq means PROM did not bother to program the upper * half of PCIC. This happens on JS-E with PROM 3.11, for instance. */ - if (dev->irq == 0 || p->force) { + if (real_irq == 0 || p->force) { if (p->irq == 0 || p->irq >= 15) { /* Corrupted map */ printk("PCIC: BAD IRQ %d\n", p->irq); for (;;) {} } printk("PCIC: setting irq %d at pin %d for device %02x:%02x\n", p->irq, p->pin, dev->bus->number, dev->devfn); - dev->irq = p->irq; + real_irq = p->irq; i = p->pin; if (i >= 4) { @@ -584,7 +587,8 @@ pcic_fill_irq(struct linux_pcic *pcic, struct pci_dev *dev, int node) ivec |= p->irq << (i << 2); writew(ivec, pcic->pcic_regs+PCI_INT_SELECT_LO); } - } + } + dev->irq = pcic_build_device_irq(NULL, real_irq); } /* @@ -729,6 +733,7 @@ void __init pci_time_init(void) struct linux_pcic *pcic = &pcic0; unsigned long v; int timer_irq, irq; + int err; do_arch_gettimeoffset = pci_gettimeoffset; @@ -740,9 +745,10 @@ void __init pci_time_init(void) timer_irq = PCI_COUNTER_IRQ_SYS(v); writel (PCI_COUNTER_IRQ_SET(timer_irq, 0), pcic->pcic_regs+PCI_COUNTER_IRQ); - irq = request_irq(timer_irq, pcic_timer_handler, - (IRQF_DISABLED | SA_STATIC_ALLOC), "timer", NULL); - if (irq) { + irq = pcic_build_device_irq(NULL, timer_irq); + err = request_irq(irq, pcic_timer_handler, + IRQF_TIMER, "timer", NULL); + if (err) { prom_printf("time_init: unable to attach IRQ%d\n", timer_irq); prom_halt(); } @@ -803,50 +809,73 @@ static inline unsigned long get_irqmask(int irq_nr) return 1 << irq_nr; } -static void pcic_disable_irq(unsigned int irq_nr) +static void pcic_mask_irq(struct irq_data *data) { unsigned long mask, flags; - mask = get_irqmask(irq_nr); + mask = (unsigned long)data->chip_data; local_irq_save(flags); writel(mask, pcic0.pcic_regs+PCI_SYS_INT_TARGET_MASK_SET); local_irq_restore(flags); } -static void pcic_enable_irq(unsigned int irq_nr) +static void pcic_unmask_irq(struct irq_data *data) { unsigned long mask, flags; - mask = get_irqmask(irq_nr); + mask = (unsigned long)data->chip_data; local_irq_save(flags); writel(mask, pcic0.pcic_regs+PCI_SYS_INT_TARGET_MASK_CLEAR); local_irq_restore(flags); } -static void pcic_load_profile_irq(int cpu, unsigned int limit) +static unsigned int pcic_startup_irq(struct irq_data *data) { - printk("PCIC: unimplemented code: FILE=%s LINE=%d", __FILE__, __LINE__); + irq_link(data->irq); + pcic_unmask_irq(data); + return 0; } -/* We assume the caller has disabled local interrupts when these are called, - * or else very bizarre behavior will result. - */ -static void pcic_disable_pil_irq(unsigned int pil) +static struct irq_chip pcic_irq = { + .name = "pcic", + .irq_startup = pcic_startup_irq, + .irq_mask = pcic_mask_irq, + .irq_unmask = pcic_unmask_irq, +}; + +unsigned int pcic_build_device_irq(struct platform_device *op, + unsigned int real_irq) { - writel(get_irqmask(pil), pcic0.pcic_regs+PCI_SYS_INT_TARGET_MASK_SET); + unsigned int irq; + unsigned long mask; + + irq = 0; + mask = get_irqmask(real_irq); + if (mask == 0) + goto out; + + irq = irq_alloc(real_irq, real_irq); + if (irq == 0) + goto out; + + irq_set_chip_and_handler_name(irq, &pcic_irq, + handle_level_irq, "PCIC"); + irq_set_chip_data(irq, (void *)mask); + +out: + return irq; } -static void pcic_enable_pil_irq(unsigned int pil) + +static void pcic_load_profile_irq(int cpu, unsigned int limit) { - writel(get_irqmask(pil), pcic0.pcic_regs+PCI_SYS_INT_TARGET_MASK_CLEAR); + printk("PCIC: unimplemented code: FILE=%s LINE=%d", __FILE__, __LINE__); } void __init sun4m_pci_init_IRQ(void) { - BTFIXUPSET_CALL(enable_irq, pcic_enable_irq, BTFIXUPCALL_NORM); - BTFIXUPSET_CALL(disable_irq, pcic_disable_irq, BTFIXUPCALL_NORM); - BTFIXUPSET_CALL(enable_pil_irq, pcic_enable_pil_irq, BTFIXUPCALL_NORM); - BTFIXUPSET_CALL(disable_pil_irq, pcic_disable_pil_irq, BTFIXUPCALL_NORM); + sparc_irq_config.build_device_irq = pcic_build_device_irq; + BTFIXUPSET_CALL(clear_clock_irq, pcic_clear_clock_irq, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(load_profile_irq, pcic_load_profile_irq, BTFIXUPCALL_NORM); } diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index ee8426ede7c..2cb0e1c001e 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -26,6 +26,7 @@ #include <asm/nmi.h> #include <asm/pcr.h> +#include "kernel.h" #include "kstack.h" /* Sparc64 chips have two performance counters, 32-bits each, with diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index 17529298c50..c8cc461ff75 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -128,8 +128,16 @@ void cpu_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); /* endless idle loop with no priority at all */ while(1) { - while (!need_resched()) - cpu_relax(); +#ifdef CONFIG_SPARC_LEON + if (pm_idle) { + while (!need_resched()) + (*pm_idle)(); + } else +#endif + { + while (!need_resched()) + cpu_relax(); + } preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/sparc/kernel/prom_32.c b/arch/sparc/kernel/prom_32.c index 05fb2533058..5ce3d15a99b 100644 --- a/arch/sparc/kernel/prom_32.c +++ b/arch/sparc/kernel/prom_32.c @@ -326,7 +326,6 @@ void __init of_console_init(void) of_console_options = NULL; } - prom_printf(msg, of_console_path); printk(msg, of_console_path); } diff --git a/arch/sparc/kernel/setup_32.c b/arch/sparc/kernel/setup_32.c index 7b8b76c9557..3609bdee9ed 100644 --- a/arch/sparc/kernel/setup_32.c +++ b/arch/sparc/kernel/setup_32.c @@ -103,16 +103,20 @@ static unsigned int boot_flags __initdata = 0; /* Exported for mm/init.c:paging_init. */ unsigned long cmdline_memory_size __initdata = 0; +/* which CPU booted us (0xff = not set) */ +unsigned char boot_cpu_id = 0xff; /* 0xff will make it into DATA section... */ +unsigned char boot_cpu_id4; /* boot_cpu_id << 2 */ + static void prom_console_write(struct console *con, const char *s, unsigned n) { prom_write(s, n); } -static struct console prom_debug_console = { - .name = "debug", +static struct console prom_early_console = { + .name = "earlyprom", .write = prom_console_write, - .flags = CON_PRINTBUFFER, + .flags = CON_PRINTBUFFER | CON_BOOT, .index = -1, }; @@ -133,8 +137,7 @@ static void __init process_switch(char c) prom_halt(); break; case 'p': - /* Use PROM debug console. */ - register_console(&prom_debug_console); + /* Just ignore, this behavior is now the default. */ break; default: printk("Unknown boot switch (-%c)\n", c); @@ -215,6 +218,10 @@ void __init setup_arch(char **cmdline_p) strcpy(boot_command_line, *cmdline_p); parse_early_param(); + boot_flags_init(*cmdline_p); + + register_console(&prom_early_console); + /* Set sparc_cpu_model */ sparc_cpu_model = sun_unknown; if (!strcmp(&cputypval[0], "sun4 ")) @@ -265,7 +272,6 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_DUMMY_CONSOLE conswitchp = &dummy_con; #endif - boot_flags_init(*cmdline_p); idprom_init(); if (ARCH_SUN4C) @@ -311,75 +317,6 @@ void __init setup_arch(char **cmdline_p) smp_setup_cpu_possible_map(); } -static int ncpus_probed; - -static int show_cpuinfo(struct seq_file *m, void *__unused) -{ - seq_printf(m, - "cpu\t\t: %s\n" - "fpu\t\t: %s\n" - "promlib\t\t: Version %d Revision %d\n" - "prom\t\t: %d.%d\n" - "type\t\t: %s\n" - "ncpus probed\t: %d\n" - "ncpus active\t: %d\n" -#ifndef CONFIG_SMP - "CPU0Bogo\t: %lu.%02lu\n" - "CPU0ClkTck\t: %ld\n" -#endif - , - sparc_cpu_type, - sparc_fpu_type , - romvec->pv_romvers, - prom_rev, - romvec->pv_printrev >> 16, - romvec->pv_printrev & 0xffff, - &cputypval[0], - ncpus_probed, - num_online_cpus() -#ifndef CONFIG_SMP - , cpu_data(0).udelay_val/(500000/HZ), - (cpu_data(0).udelay_val/(5000/HZ)) % 100, - cpu_data(0).clock_tick -#endif - ); - -#ifdef CONFIG_SMP - smp_bogo(m); -#endif - mmu_info(m); -#ifdef CONFIG_SMP - smp_info(m); -#endif - return 0; -} - -static void *c_start(struct seq_file *m, loff_t *pos) -{ - /* The pointer we are returning is arbitrary, - * it just has to be non-NULL and not IS_ERR - * in the success case. - */ - return *pos == 0 ? &c_start : NULL; -} - -static void *c_next(struct seq_file *m, void *v, loff_t *pos) -{ - ++*pos; - return c_start(m, pos); -} - -static void c_stop(struct seq_file *m, void *v) -{ -} - -const struct seq_operations cpuinfo_op = { - .start =c_start, - .next = c_next, - .stop = c_stop, - .show = show_cpuinfo, -}; - extern int stop_a_enabled; void sun_do_break(void) diff --git a/arch/sparc/kernel/setup_64.c b/arch/sparc/kernel/setup_64.c index 29bafe051bb..f3b6850cc8d 100644 --- a/arch/sparc/kernel/setup_64.c +++ b/arch/sparc/kernel/setup_64.c @@ -339,84 +339,6 @@ void __init setup_arch(char **cmdline_p) paging_init(); } -/* BUFFER is PAGE_SIZE bytes long. */ - -extern void smp_info(struct seq_file *); -extern void smp_bogo(struct seq_file *); -extern void mmu_info(struct seq_file *); - -unsigned int dcache_parity_tl1_occurred; -unsigned int icache_parity_tl1_occurred; - -int ncpus_probed; - -static int show_cpuinfo(struct seq_file *m, void *__unused) -{ - seq_printf(m, - "cpu\t\t: %s\n" - "fpu\t\t: %s\n" - "pmu\t\t: %s\n" - "prom\t\t: %s\n" - "type\t\t: %s\n" - "ncpus probed\t: %d\n" - "ncpus active\t: %d\n" - "D$ parity tl1\t: %u\n" - "I$ parity tl1\t: %u\n" -#ifndef CONFIG_SMP - "Cpu0ClkTck\t: %016lx\n" -#endif - , - sparc_cpu_type, - sparc_fpu_type, - sparc_pmu_type, - prom_version, - ((tlb_type == hypervisor) ? - "sun4v" : - "sun4u"), - ncpus_probed, - num_online_cpus(), - dcache_parity_tl1_occurred, - icache_parity_tl1_occurred -#ifndef CONFIG_SMP - , cpu_data(0).clock_tick -#endif - ); -#ifdef CONFIG_SMP - smp_bogo(m); -#endif - mmu_info(m); -#ifdef CONFIG_SMP - smp_info(m); -#endif - return 0; -} - -static void *c_start(struct seq_file *m, loff_t *pos) -{ - /* The pointer we are returning is arbitrary, - * it just has to be non-NULL and not IS_ERR - * in the success case. - */ - return *pos == 0 ? &c_start : NULL; -} - -static void *c_next(struct seq_file *m, void *v, loff_t *pos) -{ - ++*pos; - return c_start(m, pos); -} - -static void c_stop(struct seq_file *m, void *v) -{ -} - -const struct seq_operations cpuinfo_op = { - .start =c_start, - .next = c_next, - .stop = c_stop, - .show = show_cpuinfo, -}; - extern int stop_a_enabled; void sun_do_break(void) diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c index 442286d8343..d5b3958be0b 100644 --- a/arch/sparc/kernel/smp_32.c +++ b/arch/sparc/kernel/smp_32.c @@ -37,8 +37,6 @@ #include "irq.h" volatile unsigned long cpu_callin_map[NR_CPUS] __cpuinitdata = {0,}; -unsigned char boot_cpu_id = 0; -unsigned char boot_cpu_id4 = 0; /* boot_cpu_id << 2 */ cpumask_t smp_commenced_mask = CPU_MASK_NONE; @@ -130,14 +128,57 @@ struct linux_prom_registers smp_penguin_ctable __cpuinitdata = { 0 }; void smp_send_reschedule(int cpu) { /* - * XXX missing reschedule IPI, see scheduler_ipi() + * CPU model dependent way of implementing IPI generation targeting + * a single CPU. The trap handler needs only to do trap entry/return + * to call schedule. */ + BTFIXUP_CALL(smp_ipi_resched)(cpu); } void smp_send_stop(void) { } +void arch_send_call_function_single_ipi(int cpu) +{ + /* trigger one IPI single call on one CPU */ + BTFIXUP_CALL(smp_ipi_single)(cpu); +} + +void arch_send_call_function_ipi_mask(const struct cpumask *mask) +{ + int cpu; + + /* trigger IPI mask call on each CPU */ + for_each_cpu(cpu, mask) + BTFIXUP_CALL(smp_ipi_mask_one)(cpu); +} + +void smp_resched_interrupt(void) +{ + irq_enter(); + scheduler_ipi(); + local_cpu_data().irq_resched_count++; + irq_exit(); + /* re-schedule routine called by interrupt return code. */ +} + +void smp_call_function_single_interrupt(void) +{ + irq_enter(); + generic_smp_call_function_single_interrupt(); + local_cpu_data().irq_call_count++; + irq_exit(); +} + +void smp_call_function_interrupt(void) +{ + irq_enter(); + generic_smp_call_function_interrupt(); + local_cpu_data().irq_call_count++; + irq_exit(); +} + void smp_flush_cache_all(void) { xc0((smpfunc_t) BTFIXUP_CALL(local_flush_cache_all)); @@ -153,9 +194,10 @@ void smp_flush_tlb_all(void) void smp_flush_cache_mm(struct mm_struct *mm) { if(mm->context != NO_CONTEXT) { - cpumask_t cpu_mask = *mm_cpumask(mm); - cpu_clear(smp_processor_id(), cpu_mask); - if (!cpus_empty(cpu_mask)) + cpumask_t cpu_mask; + cpumask_copy(&cpu_mask, mm_cpumask(mm)); + cpumask_clear_cpu(smp_processor_id(), &cpu_mask); + if (!cpumask_empty(&cpu_mask)) xc1((smpfunc_t) BTFIXUP_CALL(local_flush_cache_mm), (unsigned long) mm); local_flush_cache_mm(mm); } @@ -164,9 +206,10 @@ void smp_flush_cache_mm(struct mm_struct *mm) void smp_flush_tlb_mm(struct mm_struct *mm) { if(mm->context != NO_CONTEXT) { - cpumask_t cpu_mask = *mm_cpumask(mm); - cpu_clear(smp_processor_id(), cpu_mask); - if (!cpus_empty(cpu_mask)) { + cpumask_t cpu_mask; + cpumask_copy(&cpu_mask, mm_cpumask(mm)); + cpumask_clear_cpu(smp_processor_id(), &cpu_mask); + if (!cpumask_empty(&cpu_mask)) { xc1((smpfunc_t) BTFIXUP_CALL(local_flush_tlb_mm), (unsigned long) mm); if(atomic_read(&mm->mm_users) == 1 && current->active_mm == mm) cpumask_copy(mm_cpumask(mm), @@ -182,9 +225,10 @@ void smp_flush_cache_range(struct vm_area_struct *vma, unsigned long start, struct mm_struct *mm = vma->vm_mm; if (mm->context != NO_CONTEXT) { - cpumask_t cpu_mask = *mm_cpumask(mm); - cpu_clear(smp_processor_id(), cpu_mask); - if (!cpus_empty(cpu_mask)) + cpumask_t cpu_mask; + cpumask_copy(&cpu_mask, mm_cpumask(mm)); + cpumask_clear_cpu(smp_processor_id(), &cpu_mask); + if (!cpumask_empty(&cpu_mask)) xc3((smpfunc_t) BTFIXUP_CALL(local_flush_cache_range), (unsigned long) vma, start, end); local_flush_cache_range(vma, start, end); } @@ -196,9 +240,10 @@ void smp_flush_tlb_range(struct vm_area_struct *vma, unsigned long start, struct mm_struct *mm = vma->vm_mm; if (mm->context != NO_CONTEXT) { - cpumask_t cpu_mask = *mm_cpumask(mm); - cpu_clear(smp_processor_id(), cpu_mask); - if (!cpus_empty(cpu_mask)) + cpumask_t cpu_mask; + cpumask_copy(&cpu_mask, mm_cpumask(mm)); + cpumask_clear_cpu(smp_processor_id(), &cpu_mask); + if (!cpumask_empty(&cpu_mask)) xc3((smpfunc_t) BTFIXUP_CALL(local_flush_tlb_range), (unsigned long) vma, start, end); local_flush_tlb_range(vma, start, end); } @@ -209,9 +254,10 @@ void smp_flush_cache_page(struct vm_area_struct *vma, unsigned long page) struct mm_struct *mm = vma->vm_mm; if(mm->context != NO_CONTEXT) { - cpumask_t cpu_mask = *mm_cpumask(mm); - cpu_clear(smp_processor_id(), cpu_mask); - if (!cpus_empty(cpu_mask)) + cpumask_t cpu_mask; + cpumask_copy(&cpu_mask, mm_cpumask(mm)); + cpumask_clear_cpu(smp_processor_id(), &cpu_mask); + if (!cpumask_empty(&cpu_mask)) xc2((smpfunc_t) BTFIXUP_CALL(local_flush_cache_page), (unsigned long) vma, page); local_flush_cache_page(vma, page); } @@ -222,19 +268,15 @@ void smp_flush_tlb_page(struct vm_area_struct *vma, unsigned long page) struct mm_struct *mm = vma->vm_mm; if(mm->context != NO_CONTEXT) { - cpumask_t cpu_mask = *mm_cpumask(mm); - cpu_clear(smp_processor_id(), cpu_mask); - if (!cpus_empty(cpu_mask)) + cpumask_t cpu_mask; + cpumask_copy(&cpu_mask, mm_cpumask(mm)); + cpumask_clear_cpu(smp_processor_id(), &cpu_mask); + if (!cpumask_empty(&cpu_mask)) xc2((smpfunc_t) BTFIXUP_CALL(local_flush_tlb_page), (unsigned long) vma, page); local_flush_tlb_page(vma, page); } } -void smp_reschedule_irq(void) -{ - set_need_resched(); -} - void smp_flush_page_to_ram(unsigned long page) { /* Current theory is that those who call this are the one's @@ -251,9 +293,10 @@ void smp_flush_page_to_ram(unsigned long page) void smp_flush_sig_insns(struct mm_struct *mm, unsigned long insn_addr) { - cpumask_t cpu_mask = *mm_cpumask(mm); - cpu_clear(smp_processor_id(), cpu_mask); - if (!cpus_empty(cpu_mask)) + cpumask_t cpu_mask; + cpumask_copy(&cpu_mask, mm_cpumask(mm)); + cpumask_clear_cpu(smp_processor_id(), &cpu_mask); + if (!cpumask_empty(&cpu_mask)) xc2((smpfunc_t) BTFIXUP_CALL(local_flush_sig_insns), (unsigned long) mm, insn_addr); local_flush_sig_insns(mm, insn_addr); } @@ -407,7 +450,7 @@ int __cpuinit __cpu_up(unsigned int cpu) }; if (!ret) { - cpu_set(cpu, smp_commenced_mask); + cpumask_set_cpu(cpu, &smp_commenced_mask); while (!cpu_online(cpu)) mb(); } diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 9478da7fdb3..99cb17251bb 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -121,11 +121,11 @@ void __cpuinit smp_callin(void) /* inform the notifiers about the new cpu */ notify_cpu_starting(cpuid); - while (!cpu_isset(cpuid, smp_commenced_mask)) + while (!cpumask_test_cpu(cpuid, &smp_commenced_mask)) rmb(); ipi_call_lock_irq(); - cpu_set(cpuid, cpu_online_map); + set_cpu_online(cpuid, true); ipi_call_unlock_irq(); /* idle thread is expected to have preempt disabled */ @@ -785,7 +785,7 @@ static void xcall_deliver(u64 data0, u64 data1, u64 data2, const cpumask_t *mask /* Send cross call to all processors mentioned in MASK_P * except self. Really, there are only two cases currently, - * "&cpu_online_map" and "&mm->cpu_vm_mask". + * "cpu_online_mask" and "mm_cpumask(mm)". */ static void smp_cross_call_masked(unsigned long *func, u32 ctx, u64 data1, u64 data2, const cpumask_t *mask) { @@ -797,7 +797,7 @@ static void smp_cross_call_masked(unsigned long *func, u32 ctx, u64 data1, u64 d /* Send cross call to all processors except self. */ static void smp_cross_call(unsigned long *func, u32 ctx, u64 data1, u64 data2) { - smp_cross_call_masked(func, ctx, data1, data2, &cpu_online_map); + smp_cross_call_masked(func, ctx, data1, data2, cpu_online_mask); } extern unsigned long xcall_sync_tick; @@ -805,7 +805,7 @@ extern unsigned long xcall_sync_tick; static void smp_start_sync_tick_client(int cpu) { xcall_deliver((u64) &xcall_sync_tick, 0, 0, - &cpumask_of_cpu(cpu)); + cpumask_of(cpu)); } extern unsigned long xcall_call_function; @@ -820,7 +820,7 @@ extern unsigned long xcall_call_function_single; void arch_send_call_function_single_ipi(int cpu) { xcall_deliver((u64) &xcall_call_function_single, 0, 0, - &cpumask_of_cpu(cpu)); + cpumask_of(cpu)); } void __irq_entry smp_call_function_client(int irq, struct pt_regs *regs) @@ -918,7 +918,7 @@ void smp_flush_dcache_page_impl(struct page *page, int cpu) } if (data0) { xcall_deliver(data0, __pa(pg_addr), - (u64) pg_addr, &cpumask_of_cpu(cpu)); + (u64) pg_addr, cpumask_of(cpu)); #ifdef CONFIG_DEBUG_DCFLUSH atomic_inc(&dcpage_flushes_xcall); #endif @@ -954,7 +954,7 @@ void flush_dcache_page_all(struct mm_struct *mm, struct page *page) } if (data0) { xcall_deliver(data0, __pa(pg_addr), - (u64) pg_addr, &cpu_online_map); + (u64) pg_addr, cpu_online_mask); #ifdef CONFIG_DEBUG_DCFLUSH atomic_inc(&dcpage_flushes_xcall); #endif @@ -1197,32 +1197,32 @@ void __devinit smp_fill_in_sib_core_maps(void) for_each_present_cpu(i) { unsigned int j; - cpus_clear(cpu_core_map[i]); + cpumask_clear(&cpu_core_map[i]); if (cpu_data(i).core_id == 0) { - cpu_set(i, cpu_core_map[i]); + cpumask_set_cpu(i, &cpu_core_map[i]); continue; } for_each_present_cpu(j) { if (cpu_data(i).core_id == cpu_data(j).core_id) - cpu_set(j, cpu_core_map[i]); + cpumask_set_cpu(j, &cpu_core_map[i]); } } for_each_present_cpu(i) { unsigned int j; - cpus_clear(per_cpu(cpu_sibling_map, i)); + cpumask_clear(&per_cpu(cpu_sibling_map, i)); if (cpu_data(i).proc_id == -1) { - cpu_set(i, per_cpu(cpu_sibling_map, i)); + cpumask_set_cpu(i, &per_cpu(cpu_sibling_map, i)); continue; } for_each_present_cpu(j) { if (cpu_data(i).proc_id == cpu_data(j).proc_id) - cpu_set(j, per_cpu(cpu_sibling_map, i)); + cpumask_set_cpu(j, &per_cpu(cpu_sibling_map, i)); } } } @@ -1232,10 +1232,10 @@ int __cpuinit __cpu_up(unsigned int cpu) int ret = smp_boot_one_cpu(cpu); if (!ret) { - cpu_set(cpu, smp_commenced_mask); - while (!cpu_isset(cpu, cpu_online_map)) + cpumask_set_cpu(cpu, &smp_commenced_mask); + while (!cpu_online(cpu)) mb(); - if (!cpu_isset(cpu, cpu_online_map)) { + if (!cpu_online(cpu)) { ret = -ENODEV; } else { /* On SUN4V, writes to %tick and %stick are @@ -1269,7 +1269,7 @@ void cpu_play_dead(void) tb->nonresum_mondo_pa, 0); } - cpu_clear(cpu, smp_commenced_mask); + cpumask_clear_cpu(cpu, &smp_commenced_mask); membar_safe("#Sync"); local_irq_disable(); @@ -1290,13 +1290,13 @@ int __cpu_disable(void) cpuinfo_sparc *c; int i; - for_each_cpu_mask(i, cpu_core_map[cpu]) - cpu_clear(cpu, cpu_core_map[i]); - cpus_clear(cpu_core_map[cpu]); + for_each_cpu(i, &cpu_core_map[cpu]) + cpumask_clear_cpu(cpu, &cpu_core_map[i]); + cpumask_clear(&cpu_core_map[cpu]); - for_each_cpu_mask(i, per_cpu(cpu_sibling_map, cpu)) - cpu_clear(cpu, per_cpu(cpu_sibling_map, i)); - cpus_clear(per_cpu(cpu_sibling_map, cpu)); + for_each_cpu(i, &per_cpu(cpu_sibling_map, cpu)) + cpumask_clear_cpu(cpu, &per_cpu(cpu_sibling_map, i)); + cpumask_clear(&per_cpu(cpu_sibling_map, cpu)); c = &cpu_data(cpu); @@ -1313,7 +1313,7 @@ int __cpu_disable(void) local_irq_disable(); ipi_call_lock(); - cpu_clear(cpu, cpu_online_map); + set_cpu_online(cpu, false); ipi_call_unlock(); cpu_map_rebuild(); @@ -1327,11 +1327,11 @@ void __cpu_die(unsigned int cpu) for (i = 0; i < 100; i++) { smp_rmb(); - if (!cpu_isset(cpu, smp_commenced_mask)) + if (!cpumask_test_cpu(cpu, &smp_commenced_mask)) break; msleep(100); } - if (cpu_isset(cpu, smp_commenced_mask)) { + if (cpumask_test_cpu(cpu, &smp_commenced_mask)) { printk(KERN_ERR "CPU %u didn't die...\n", cpu); } else { #if defined(CONFIG_SUN_LDOMS) @@ -1341,7 +1341,7 @@ void __cpu_die(unsigned int cpu) do { hv_err = sun4v_cpu_stop(cpu); if (hv_err == HV_EOK) { - cpu_clear(cpu, cpu_present_map); + set_cpu_present(cpu, false); break; } } while (--limit > 0); @@ -1362,7 +1362,7 @@ void __init smp_cpus_done(unsigned int max_cpus) void smp_send_reschedule(int cpu) { xcall_deliver((u64) &xcall_receive_signal, 0, 0, - &cpumask_of_cpu(cpu)); + cpumask_of(cpu)); } void __irq_entry smp_receive_signal_client(int irq, struct pt_regs *regs) diff --git a/arch/sparc/kernel/sun4c_irq.c b/arch/sparc/kernel/sun4c_irq.c index 90eea38ad66..f6bf25a2ff8 100644 --- a/arch/sparc/kernel/sun4c_irq.c +++ b/arch/sparc/kernel/sun4c_irq.c @@ -65,62 +65,94 @@ */ unsigned char __iomem *interrupt_enable; -static void sun4c_disable_irq(unsigned int irq_nr) +static void sun4c_mask_irq(struct irq_data *data) { - unsigned long flags; - unsigned char current_mask, new_mask; - - local_irq_save(flags); - irq_nr &= (NR_IRQS - 1); - current_mask = sbus_readb(interrupt_enable); - switch (irq_nr) { - case 1: - new_mask = ((current_mask) & (~(SUN4C_INT_E1))); - break; - case 8: - new_mask = ((current_mask) & (~(SUN4C_INT_E8))); - break; - case 10: - new_mask = ((current_mask) & (~(SUN4C_INT_E10))); - break; - case 14: - new_mask = ((current_mask) & (~(SUN4C_INT_E14))); - break; - default: + unsigned long mask = (unsigned long)data->chip_data; + + if (mask) { + unsigned long flags; + + local_irq_save(flags); + mask = sbus_readb(interrupt_enable) & ~mask; + sbus_writeb(mask, interrupt_enable); local_irq_restore(flags); - return; } - sbus_writeb(new_mask, interrupt_enable); - local_irq_restore(flags); } -static void sun4c_enable_irq(unsigned int irq_nr) +static void sun4c_unmask_irq(struct irq_data *data) { - unsigned long flags; - unsigned char current_mask, new_mask; - - local_irq_save(flags); - irq_nr &= (NR_IRQS - 1); - current_mask = sbus_readb(interrupt_enable); - switch (irq_nr) { - case 1: - new_mask = ((current_mask) | SUN4C_INT_E1); - break; - case 8: - new_mask = ((current_mask) | SUN4C_INT_E8); - break; - case 10: - new_mask = ((current_mask) | SUN4C_INT_E10); - break; - case 14: - new_mask = ((current_mask) | SUN4C_INT_E14); - break; - default: + unsigned long mask = (unsigned long)data->chip_data; + + if (mask) { + unsigned long flags; + + local_irq_save(flags); + mask = sbus_readb(interrupt_enable) | mask; + sbus_writeb(mask, interrupt_enable); local_irq_restore(flags); - return; } - sbus_writeb(new_mask, interrupt_enable); - local_irq_restore(flags); +} + +static unsigned int sun4c_startup_irq(struct irq_data *data) +{ + irq_link(data->irq); + sun4c_unmask_irq(data); + + return 0; +} + +static void sun4c_shutdown_irq(struct irq_data *data) +{ + sun4c_mask_irq(data); + irq_unlink(data->irq); +} + +static struct irq_chip sun4c_irq = { + .name = "sun4c", + .irq_startup = sun4c_startup_irq, + .irq_shutdown = sun4c_shutdown_irq, + .irq_mask = sun4c_mask_irq, + .irq_unmask = sun4c_unmask_irq, +}; + +static unsigned int sun4c_build_device_irq(struct platform_device *op, + unsigned int real_irq) +{ + unsigned int irq; + + if (real_irq >= 16) { + prom_printf("Bogus sun4c IRQ %u\n", real_irq); + prom_halt(); + } + + irq = irq_alloc(real_irq, real_irq); + if (irq) { + unsigned long mask = 0UL; + + switch (real_irq) { + case 1: + mask = SUN4C_INT_E1; + break; + case 8: + mask = SUN4C_INT_E8; + break; + case 10: + mask = SUN4C_INT_E10; + break; + case 14: + mask = SUN4C_INT_E14; + break; + default: + /* All the rest are either always enabled, + * or are for signalling software interrupts. + */ + break; + } + irq_set_chip_and_handler_name(irq, &sun4c_irq, + handle_level_irq, "level"); + irq_set_chip_data(irq, (void *)mask); + } + return irq; } struct sun4c_timer_info { @@ -144,8 +176,9 @@ static void sun4c_load_profile_irq(int cpu, unsigned int limit) static void __init sun4c_init_timers(irq_handler_t counter_fn) { - const struct linux_prom_irqs *irq; + const struct linux_prom_irqs *prom_irqs; struct device_node *dp; + unsigned int irq; const u32 *addr; int err; @@ -163,9 +196,9 @@ static void __init sun4c_init_timers(irq_handler_t counter_fn) sun4c_timers = (void __iomem *) (unsigned long) addr[0]; - irq = of_get_property(dp, "intr", NULL); + prom_irqs = of_get_property(dp, "intr", NULL); of_node_put(dp); - if (!irq) { + if (!prom_irqs) { prom_printf("sun4c_init_timers: No intr property\n"); prom_halt(); } @@ -178,15 +211,15 @@ static void __init sun4c_init_timers(irq_handler_t counter_fn) master_l10_counter = &sun4c_timers->l10_count; - err = request_irq(irq[0].pri, counter_fn, - (IRQF_DISABLED | SA_STATIC_ALLOC), - "timer", NULL); + irq = sun4c_build_device_irq(NULL, prom_irqs[0].pri); + err = request_irq(irq, counter_fn, IRQF_TIMER, "timer", NULL); if (err) { prom_printf("sun4c_init_timers: request_irq() fails with %d\n", err); prom_halt(); } - sun4c_disable_irq(irq[1].pri); + /* disable timer interrupt */ + sun4c_mask_irq(irq_get_irq_data(irq)); } #ifdef CONFIG_SMP @@ -215,14 +248,11 @@ void __init sun4c_init_IRQ(void) interrupt_enable = (void __iomem *) (unsigned long) addr[0]; - BTFIXUPSET_CALL(enable_irq, sun4c_enable_irq, BTFIXUPCALL_NORM); - BTFIXUPSET_CALL(disable_irq, sun4c_disable_irq, BTFIXUPCALL_NORM); - BTFIXUPSET_CALL(enable_pil_irq, sun4c_enable_irq, BTFIXUPCALL_NORM); - BTFIXUPSET_CALL(disable_pil_irq, sun4c_disable_irq, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(clear_clock_irq, sun4c_clear_clock_irq, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(load_profile_irq, sun4c_load_profile_irq, BTFIXUPCALL_NOP); - sparc_irq_config.init_timers = sun4c_init_timers; + sparc_irq_config.init_timers = sun4c_init_timers; + sparc_irq_config.build_device_irq = sun4c_build_device_irq; #ifdef CONFIG_SMP BTFIXUPSET_CALL(set_cpu_int, sun4c_nop, BTFIXUPCALL_NOP); diff --git a/arch/sparc/kernel/sun4d_irq.c b/arch/sparc/kernel/sun4d_irq.c index 77b4a899271..a9ea60eb2c1 100644 --- a/arch/sparc/kernel/sun4d_irq.c +++ b/arch/sparc/kernel/sun4d_irq.c @@ -14,6 +14,7 @@ #include <asm/io.h> #include <asm/sbi.h> #include <asm/cacheflush.h> +#include <asm/setup.h> #include "kernel.h" #include "irq.h" @@ -22,22 +23,20 @@ * cpu local. CPU local interrupts cover the timer interrupts * and whatnot, and we encode those as normal PILs between * 0 and 15. - * - * SBUS interrupts are encoded integers including the board number - * (plus one), the SBUS level, and the SBUS slot number. Sun4D - * IRQ dispatch is done by: - * - * 1) Reading the BW local interrupt table in order to get the bus - * interrupt mask. - * - * This table is indexed by SBUS interrupt level which can be - * derived from the PIL we got interrupted on. - * - * 2) For each bus showing interrupt pending from #1, read the - * SBI interrupt state register. This will indicate which slots - * have interrupts pending for that SBUS interrupt level. + * SBUS interrupts are encodes as a combination of board, level and slot. */ +struct sun4d_handler_data { + unsigned int cpuid; /* target cpu */ + unsigned int real_irq; /* interrupt level */ +}; + + +static unsigned int sun4d_encode_irq(int board, int lvl, int slot) +{ + return (board + 1) << 5 | (lvl << 2) | slot; +} + struct sun4d_timer_regs { u32 l10_timer_limit; u32 l10_cur_countx; @@ -48,17 +47,12 @@ struct sun4d_timer_regs { static struct sun4d_timer_regs __iomem *sun4d_timers; -#define TIMER_IRQ 10 - -#define MAX_STATIC_ALLOC 4 -static unsigned char sbus_tid[32]; - -static struct irqaction *irq_action[NR_IRQS]; +#define SUN4D_TIMER_IRQ 10 -static struct sbus_action { - struct irqaction *action; - /* For SMP this needs to be extended */ -} *sbus_actions; +/* Specify which cpu handle interrupts from which board. + * Index is board - value is cpu. + */ +static unsigned char board_to_cpu[32]; static int pil_to_sbus[] = { 0, @@ -79,152 +73,81 @@ static int pil_to_sbus[] = { 0, }; -static int sbus_to_pil[] = { - 0, - 2, - 3, - 5, - 7, - 9, - 11, - 13, -}; - -static int nsbi; - /* Exported for sun4d_smp.c */ DEFINE_SPINLOCK(sun4d_imsk_lock); -int show_sun4d_interrupts(struct seq_file *p, void *v) +/* SBUS interrupts are encoded integers including the board number + * (plus one), the SBUS level, and the SBUS slot number. Sun4D + * IRQ dispatch is done by: + * + * 1) Reading the BW local interrupt table in order to get the bus + * interrupt mask. + * + * This table is indexed by SBUS interrupt level which can be + * derived from the PIL we got interrupted on. + * + * 2) For each bus showing interrupt pending from #1, read the + * SBI interrupt state register. This will indicate which slots + * have interrupts pending for that SBUS interrupt level. + * + * 3) Call the genreric IRQ support. + */ +static void sun4d_sbus_handler_irq(int sbusl) { - int i = *(loff_t *) v, j = 0, k = 0, sbusl; - struct irqaction *action; - unsigned long flags; -#ifdef CONFIG_SMP - int x; -#endif - - spin_lock_irqsave(&irq_action_lock, flags); - if (i < NR_IRQS) { - sbusl = pil_to_sbus[i]; - if (!sbusl) { - action = *(i + irq_action); - if (!action) - goto out_unlock; - } else { - for (j = 0; j < nsbi; j++) { - for (k = 0; k < 4; k++) - action = sbus_actions[(j << 5) + (sbusl << 2) + k].action; - if (action) - goto found_it; - } - goto out_unlock; - } -found_it: seq_printf(p, "%3d: ", i); -#ifndef CONFIG_SMP - seq_printf(p, "%10u ", kstat_irqs(i)); -#else - for_each_online_cpu(x) - seq_printf(p, "%10u ", - kstat_cpu(cpu_logical_map(x)).irqs[i]); -#endif - seq_printf(p, "%c %s", - (action->flags & IRQF_DISABLED) ? '+' : ' ', - action->name); - action = action->next; - for (;;) { - for (; action; action = action->next) { - seq_printf(p, ",%s %s", - (action->flags & IRQF_DISABLED) ? " +" : "", - action->name); - } - if (!sbusl) - break; - k++; - if (k < 4) { - action = sbus_actions[(j << 5) + (sbusl << 2) + k].action; - } else { - j++; - if (j == nsbi) - break; - k = 0; - action = sbus_actions[(j << 5) + (sbusl << 2)].action; + unsigned int bus_mask; + unsigned int sbino, slot; + unsigned int sbil; + + bus_mask = bw_get_intr_mask(sbusl) & 0x3ffff; + bw_clear_intr_mask(sbusl, bus_mask); + + sbil = (sbusl << 2); + /* Loop for each pending SBI */ + for (sbino = 0; bus_mask; sbino++) { + unsigned int idx, mask; + + bus_mask >>= 1; + if (!(bus_mask & 1)) + continue; + /* XXX This seems to ACK the irq twice. acquire_sbi() + * XXX uses swap, therefore this writes 0xf << sbil, + * XXX then later release_sbi() will write the individual + * XXX bits which were set again. + */ + mask = acquire_sbi(SBI2DEVID(sbino), 0xf << sbil); + mask &= (0xf << sbil); + + /* Loop for each pending SBI slot */ + idx = 0; + slot = (1 << sbil); + while (mask != 0) { + unsigned int pil; + struct irq_bucket *p; + + idx++; + slot <<= 1; + if (!(mask & slot)) + continue; + + mask &= ~slot; + pil = sun4d_encode_irq(sbino, sbil, idx); + + p = irq_map[pil]; + while (p) { + struct irq_bucket *next; + + next = p->next; + generic_handle_irq(p->irq); + p = next; } + release_sbi(SBI2DEVID(sbino), slot); } - seq_putc(p, '\n'); } -out_unlock: - spin_unlock_irqrestore(&irq_action_lock, flags); - return 0; -} - -void sun4d_free_irq(unsigned int irq, void *dev_id) -{ - struct irqaction *action, **actionp; - struct irqaction *tmp = NULL; - unsigned long flags; - - spin_lock_irqsave(&irq_action_lock, flags); - if (irq < 15) - actionp = irq + irq_action; - else - actionp = &(sbus_actions[irq - (1 << 5)].action); - action = *actionp; - if (!action) { - printk(KERN_ERR "Trying to free free IRQ%d\n", irq); - goto out_unlock; - } - if (dev_id) { - for (; action; action = action->next) { - if (action->dev_id == dev_id) - break; - tmp = action; - } - if (!action) { - printk(KERN_ERR "Trying to free free shared IRQ%d\n", - irq); - goto out_unlock; - } - } else if (action->flags & IRQF_SHARED) { - printk(KERN_ERR "Trying to free shared IRQ%d with NULL device ID\n", - irq); - goto out_unlock; - } - if (action->flags & SA_STATIC_ALLOC) { - /* - * This interrupt is marked as specially allocated - * so it is a bad idea to free it. - */ - printk(KERN_ERR "Attempt to free statically allocated IRQ%d (%s)\n", - irq, action->name); - goto out_unlock; - } - - if (tmp) - tmp->next = action->next; - else - *actionp = action->next; - - spin_unlock_irqrestore(&irq_action_lock, flags); - - synchronize_irq(irq); - - spin_lock_irqsave(&irq_action_lock, flags); - - kfree(action); - - if (!(*actionp)) - __disable_irq(irq); - -out_unlock: - spin_unlock_irqrestore(&irq_action_lock, flags); } void sun4d_handler_irq(int pil, struct pt_regs *regs) { struct pt_regs *old_regs; - struct irqaction *action; - int cpu = smp_processor_id(); /* SBUS IRQ level (1 - 7) */ int sbusl = pil_to_sbus[pil]; @@ -233,160 +156,96 @@ void sun4d_handler_irq(int pil, struct pt_regs *regs) cc_set_iclr(1 << pil); +#ifdef CONFIG_SMP + /* + * Check IPI data structures after IRQ has been cleared. Hard and Soft + * IRQ can happen at the same time, so both cases are always handled. + */ + if (pil == SUN4D_IPI_IRQ) + sun4d_ipi_interrupt(); +#endif + old_regs = set_irq_regs(regs); irq_enter(); - kstat_cpu(cpu).irqs[pil]++; - if (!sbusl) { - action = *(pil + irq_action); - if (!action) - unexpected_irq(pil, NULL, regs); - do { - action->handler(pil, action->dev_id); - action = action->next; - } while (action); + if (sbusl == 0) { + /* cpu interrupt */ + struct irq_bucket *p; + + p = irq_map[pil]; + while (p) { + struct irq_bucket *next; + + next = p->next; + generic_handle_irq(p->irq); + p = next; + } } else { - int bus_mask = bw_get_intr_mask(sbusl) & 0x3ffff; - int sbino; - struct sbus_action *actionp; - unsigned mask, slot; - int sbil = (sbusl << 2); - - bw_clear_intr_mask(sbusl, bus_mask); - - /* Loop for each pending SBI */ - for (sbino = 0; bus_mask; sbino++, bus_mask >>= 1) - if (bus_mask & 1) { - mask = acquire_sbi(SBI2DEVID(sbino), 0xf << sbil); - mask &= (0xf << sbil); - actionp = sbus_actions + (sbino << 5) + (sbil); - /* Loop for each pending SBI slot */ - for (slot = (1 << sbil); mask; slot <<= 1, actionp++) - if (mask & slot) { - mask &= ~slot; - action = actionp->action; - - if (!action) - unexpected_irq(pil, NULL, regs); - do { - action->handler(pil, action->dev_id); - action = action->next; - } while (action); - release_sbi(SBI2DEVID(sbino), slot); - } - } + /* SBUS interrupt */ + sun4d_sbus_handler_irq(sbusl); } irq_exit(); set_irq_regs(old_regs); } -int sun4d_request_irq(unsigned int irq, - irq_handler_t handler, - unsigned long irqflags, const char *devname, void *dev_id) + +static void sun4d_mask_irq(struct irq_data *data) { - struct irqaction *action, *tmp = NULL, **actionp; + struct sun4d_handler_data *handler_data = data->handler_data; + unsigned int real_irq; +#ifdef CONFIG_SMP + int cpuid = handler_data->cpuid; unsigned long flags; - int ret; - - if (irq > 14 && irq < (1 << 5)) { - ret = -EINVAL; - goto out; - } - - if (!handler) { - ret = -EINVAL; - goto out; - } - - spin_lock_irqsave(&irq_action_lock, flags); - - if (irq >= (1 << 5)) - actionp = &(sbus_actions[irq - (1 << 5)].action); - else - actionp = irq + irq_action; - action = *actionp; - - if (action) { - if ((action->flags & IRQF_SHARED) && (irqflags & IRQF_SHARED)) { - for (tmp = action; tmp->next; tmp = tmp->next) - /* find last entry - tmp used below */; - } else { - ret = -EBUSY; - goto out_unlock; - } - if ((action->flags & IRQF_DISABLED) ^ (irqflags & IRQF_DISABLED)) { - printk(KERN_ERR "Attempt to mix fast and slow interrupts on IRQ%d denied\n", - irq); - ret = -EBUSY; - goto out_unlock; - } - action = NULL; /* Or else! */ - } - - /* If this is flagged as statically allocated then we use our - * private struct which is never freed. - */ - if (irqflags & SA_STATIC_ALLOC) { - if (static_irq_count < MAX_STATIC_ALLOC) - action = &static_irqaction[static_irq_count++]; - else - printk(KERN_ERR "Request for IRQ%d (%s) SA_STATIC_ALLOC failed using kmalloc\n", - irq, devname); - } - - if (action == NULL) - action = kmalloc(sizeof(struct irqaction), GFP_ATOMIC); - - if (!action) { - ret = -ENOMEM; - goto out_unlock; - } - - action->handler = handler; - action->flags = irqflags; - action->name = devname; - action->next = NULL; - action->dev_id = dev_id; - - if (tmp) - tmp->next = action; - else - *actionp = action; - - __enable_irq(irq); - - ret = 0; -out_unlock: - spin_unlock_irqrestore(&irq_action_lock, flags); -out: - return ret; +#endif + real_irq = handler_data->real_irq; +#ifdef CONFIG_SMP + spin_lock_irqsave(&sun4d_imsk_lock, flags); + cc_set_imsk_other(cpuid, cc_get_imsk_other(cpuid) | (1 << real_irq)); + spin_unlock_irqrestore(&sun4d_imsk_lock, flags); +#else + cc_set_imsk(cc_get_imsk() | (1 << real_irq)); +#endif } -static void sun4d_disable_irq(unsigned int irq) +static void sun4d_unmask_irq(struct irq_data *data) { - int tid = sbus_tid[(irq >> 5) - 1]; + struct sun4d_handler_data *handler_data = data->handler_data; + unsigned int real_irq; +#ifdef CONFIG_SMP + int cpuid = handler_data->cpuid; unsigned long flags; +#endif + real_irq = handler_data->real_irq; - if (irq < NR_IRQS) - return; - +#ifdef CONFIG_SMP spin_lock_irqsave(&sun4d_imsk_lock, flags); - cc_set_imsk_other(tid, cc_get_imsk_other(tid) | (1 << sbus_to_pil[(irq >> 2) & 7])); + cc_set_imsk_other(cpuid, cc_get_imsk_other(cpuid) | ~(1 << real_irq)); spin_unlock_irqrestore(&sun4d_imsk_lock, flags); +#else + cc_set_imsk(cc_get_imsk() | ~(1 << real_irq)); +#endif } -static void sun4d_enable_irq(unsigned int irq) +static unsigned int sun4d_startup_irq(struct irq_data *data) { - int tid = sbus_tid[(irq >> 5) - 1]; - unsigned long flags; - - if (irq < NR_IRQS) - return; + irq_link(data->irq); + sun4d_unmask_irq(data); + return 0; +} - spin_lock_irqsave(&sun4d_imsk_lock, flags); - cc_set_imsk_other(tid, cc_get_imsk_other(tid) & ~(1 << sbus_to_pil[(irq >> 2) & 7])); - spin_unlock_irqrestore(&sun4d_imsk_lock, flags); +static void sun4d_shutdown_irq(struct irq_data *data) +{ + sun4d_mask_irq(data); + irq_unlink(data->irq); } +struct irq_chip sun4d_irq = { + .name = "sun4d", + .irq_startup = sun4d_startup_irq, + .irq_shutdown = sun4d_shutdown_irq, + .irq_unmask = sun4d_unmask_irq, + .irq_mask = sun4d_mask_irq, +}; + #ifdef CONFIG_SMP static void sun4d_set_cpu_int(int cpu, int level) { @@ -413,7 +272,7 @@ void __init sun4d_distribute_irqs(void) for_each_node_by_name(dp, "sbi") { int devid = of_getintprop_default(dp, "device-id", 0); int board = of_getintprop_default(dp, "board#", 0); - sbus_tid[board] = cpuid; + board_to_cpu[board] = cpuid; set_sbi_tid(devid, cpuid << 3); } printk(KERN_ERR "All sbus IRQs directed to CPU%d\n", cpuid); @@ -443,15 +302,16 @@ static void __init sun4d_load_profile_irqs(void) unsigned int sun4d_build_device_irq(struct platform_device *op, unsigned int real_irq) { - static int pil_to_sbus[] = { - 0, 0, 1, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 0, - }; struct device_node *dp = op->dev.of_node; struct device_node *io_unit, *sbi = dp->parent; const struct linux_prom_registers *regs; + struct sun4d_handler_data *handler_data; + unsigned int pil; + unsigned int irq; int board, slot; int sbusl; + irq = 0; while (sbi) { if (!strcmp(sbi->name, "sbi")) break; @@ -484,7 +344,28 @@ unsigned int sun4d_build_device_irq(struct platform_device *op, sbusl = pil_to_sbus[real_irq]; if (sbusl) - return (((board + 1) << 5) + (sbusl << 2) + slot); + pil = sun4d_encode_irq(board, sbusl, slot); + else + pil = real_irq; + + irq = irq_alloc(real_irq, pil); + if (irq == 0) + goto err_out; + + handler_data = irq_get_handler_data(irq); + if (unlikely(handler_data)) + goto err_out; + + handler_data = kzalloc(sizeof(struct sun4d_handler_data), GFP_ATOMIC); + if (unlikely(!handler_data)) { + prom_printf("IRQ: kzalloc(sun4d_handler_data) failed.\n"); + prom_halt(); + } + handler_data->cpuid = board_to_cpu[board]; + handler_data->real_irq = real_irq; + irq_set_chip_and_handler_name(irq, &sun4d_irq, + handle_level_irq, "level"); + irq_set_handler_data(irq, handler_data); err_out: return real_irq; @@ -518,6 +399,7 @@ static void __init sun4d_init_timers(irq_handler_t counter_fn) { struct device_node *dp; struct resource res; + unsigned int irq; const u32 *reg; int err; @@ -552,9 +434,8 @@ static void __init sun4d_init_timers(irq_handler_t counter_fn) master_l10_counter = &sun4d_timers->l10_cur_count; - err = request_irq(TIMER_IRQ, counter_fn, - (IRQF_DISABLED | SA_STATIC_ALLOC), - "timer", NULL); + irq = sun4d_build_device_irq(NULL, SUN4D_TIMER_IRQ); + err = request_irq(irq, counter_fn, IRQF_TIMER, "timer", NULL); if (err) { prom_printf("sun4d_init_timers: request_irq() failed with %d\n", err); @@ -567,27 +448,16 @@ static void __init sun4d_init_timers(irq_handler_t counter_fn) void __init sun4d_init_sbi_irq(void) { struct device_node *dp; - int target_cpu = 0; + int target_cpu; -#ifdef CONFIG_SMP target_cpu = boot_cpu_id; -#endif - - nsbi = 0; - for_each_node_by_name(dp, "sbi") - nsbi++; - sbus_actions = kzalloc(nsbi * 8 * 4 * sizeof(struct sbus_action), GFP_ATOMIC); - if (!sbus_actions) { - prom_printf("SUN4D: Cannot allocate sbus_actions, halting.\n"); - prom_halt(); - } for_each_node_by_name(dp, "sbi") { int devid = of_getintprop_default(dp, "device-id", 0); int board = of_getintprop_default(dp, "board#", 0); unsigned int mask; set_sbi_tid(devid, target_cpu << 3); - sbus_tid[board] = target_cpu; + board_to_cpu[board] = target_cpu; /* Get rid of pending irqs from PROM */ mask = acquire_sbi(devid, 0xffffffff); @@ -603,12 +473,10 @@ void __init sun4d_init_IRQ(void) { local_irq_disable(); - BTFIXUPSET_CALL(enable_irq, sun4d_enable_irq, BTFIXUPCALL_NORM); - BTFIXUPSET_CALL(disable_irq, sun4d_disable_irq, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(clear_clock_irq, sun4d_clear_clock_irq, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(load_profile_irq, sun4d_load_profile_irq, BTFIXUPCALL_NORM); - sparc_irq_config.init_timers = sun4d_init_timers; + sparc_irq_config.init_timers = sun4d_init_timers; sparc_irq_config.build_device_irq = sun4d_build_device_irq; #ifdef CONFIG_SMP diff --git a/arch/sparc/kernel/sun4d_smp.c b/arch/sparc/kernel/sun4d_smp.c index 475d50b96cd..133387980b5 100644 --- a/arch/sparc/kernel/sun4d_smp.c +++ b/arch/sparc/kernel/sun4d_smp.c @@ -32,6 +32,7 @@ static inline unsigned long sun4d_swap(volatile unsigned long *ptr, unsigned lon return val; } +static void smp4d_ipi_init(void); static void smp_setup_percpu_timer(void); static unsigned char cpu_leds[32]; @@ -80,8 +81,6 @@ void __cpuinit smp4d_callin(void) local_flush_cache_all(); local_flush_tlb_all(); - cpu_probe(); - while ((unsigned long)current_set[cpuid] < PAGE_OFFSET) barrier(); @@ -105,7 +104,7 @@ void __cpuinit smp4d_callin(void) local_irq_enable(); /* We don't allow PIL 14 yet */ - while (!cpu_isset(cpuid, smp_commenced_mask)) + while (!cpumask_test_cpu(cpuid, &smp_commenced_mask)) barrier(); spin_lock_irqsave(&sun4d_imsk_lock, flags); @@ -120,6 +119,7 @@ void __cpuinit smp4d_callin(void) */ void __init smp4d_boot_cpus(void) { + smp4d_ipi_init(); if (boot_cpu_id) current_set[0] = NULL; smp_setup_percpu_timer(); @@ -191,6 +191,80 @@ void __init smp4d_smp_done(void) sun4d_distribute_irqs(); } +/* Memory structure giving interrupt handler information about IPI generated */ +struct sun4d_ipi_work { + int single; + int msk; + int resched; +}; + +static DEFINE_PER_CPU_SHARED_ALIGNED(struct sun4d_ipi_work, sun4d_ipi_work); + +/* Initialize IPIs on the SUN4D SMP machine */ +static void __init smp4d_ipi_init(void) +{ + int cpu; + struct sun4d_ipi_work *work; + + printk(KERN_INFO "smp4d: setup IPI at IRQ %d\n", SUN4D_IPI_IRQ); + + for_each_possible_cpu(cpu) { + work = &per_cpu(sun4d_ipi_work, cpu); + work->single = work->msk = work->resched = 0; + } +} + +void sun4d_ipi_interrupt(void) +{ + struct sun4d_ipi_work *work = &__get_cpu_var(sun4d_ipi_work); + + if (work->single) { + work->single = 0; + smp_call_function_single_interrupt(); + } + if (work->msk) { + work->msk = 0; + smp_call_function_interrupt(); + } + if (work->resched) { + work->resched = 0; + smp_resched_interrupt(); + } +} + +static void smp4d_ipi_single(int cpu) +{ + struct sun4d_ipi_work *work = &per_cpu(sun4d_ipi_work, cpu); + + /* Mark work */ + work->single = 1; + + /* Generate IRQ on the CPU */ + sun4d_send_ipi(cpu, SUN4D_IPI_IRQ); +} + +static void smp4d_ipi_mask_one(int cpu) +{ + struct sun4d_ipi_work *work = &per_cpu(sun4d_ipi_work, cpu); + + /* Mark work */ + work->msk = 1; + + /* Generate IRQ on the CPU */ + sun4d_send_ipi(cpu, SUN4D_IPI_IRQ); +} + +static void smp4d_ipi_resched(int cpu) +{ + struct sun4d_ipi_work *work = &per_cpu(sun4d_ipi_work, cpu); + + /* Mark work */ + work->resched = 1; + + /* Generate IRQ on the CPU (any IRQ will cause resched) */ + sun4d_send_ipi(cpu, SUN4D_IPI_IRQ); +} + static struct smp_funcall { smpfunc_t func; unsigned long arg1; @@ -239,10 +313,10 @@ static void smp4d_cross_call(smpfunc_t func, cpumask_t mask, unsigned long arg1, { register int i; - cpu_clear(smp_processor_id(), mask); - cpus_and(mask, cpu_online_map, mask); + cpumask_clear_cpu(smp_processor_id(), &mask); + cpumask_and(&mask, cpu_online_mask, &mask); for (i = 0; i <= high; i++) { - if (cpu_isset(i, mask)) { + if (cpumask_test_cpu(i, &mask)) { ccall_info.processors_in[i] = 0; ccall_info.processors_out[i] = 0; sun4d_send_ipi(i, IRQ_CROSS_CALL); @@ -255,7 +329,7 @@ static void smp4d_cross_call(smpfunc_t func, cpumask_t mask, unsigned long arg1, i = 0; do { - if (!cpu_isset(i, mask)) + if (!cpumask_test_cpu(i, &mask)) continue; while (!ccall_info.processors_in[i]) barrier(); @@ -263,7 +337,7 @@ static void smp4d_cross_call(smpfunc_t func, cpumask_t mask, unsigned long arg1, i = 0; do { - if (!cpu_isset(i, mask)) + if (!cpumask_test_cpu(i, &mask)) continue; while (!ccall_info.processors_out[i]) barrier(); @@ -356,6 +430,9 @@ void __init sun4d_init_smp(void) BTFIXUPSET_BLACKBOX(load_current, smp4d_blackbox_current); BTFIXUPSET_CALL(smp_cross_call, smp4d_cross_call, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(__hard_smp_processor_id, __smp4d_processor_id, BTFIXUPCALL_NORM); + BTFIXUPSET_CALL(smp_ipi_resched, smp4d_ipi_resched, BTFIXUPCALL_NORM); + BTFIXUPSET_CALL(smp_ipi_single, smp4d_ipi_single, BTFIXUPCALL_NORM); + BTFIXUPSET_CALL(smp_ipi_mask_one, smp4d_ipi_mask_one, BTFIXUPCALL_NORM); for (i = 0; i < NR_CPUS; i++) { ccall_info.processors_in[i] = 1; diff --git a/arch/sparc/kernel/sun4m_irq.c b/arch/sparc/kernel/sun4m_irq.c index 69df6257a32..422c16dad1f 100644 --- a/arch/sparc/kernel/sun4m_irq.c +++ b/arch/sparc/kernel/sun4m_irq.c @@ -100,6 +100,11 @@ struct sun4m_irq_percpu __iomem *sun4m_irq_percpu[SUN4M_NCPUS]; struct sun4m_irq_global __iomem *sun4m_irq_global; +struct sun4m_handler_data { + bool percpu; + long mask; +}; + /* Dave Redman (djhr@tadpole.co.uk) * The sun4m interrupt registers. */ @@ -142,9 +147,9 @@ struct sun4m_irq_global __iomem *sun4m_irq_global; #define OBP_INT_LEVEL_VME 0x40 #define SUN4M_TIMER_IRQ (OBP_INT_LEVEL_ONBOARD | 10) -#define SUM4M_PROFILE_IRQ (OBP_INT_LEVEL_ONBOARD | 14) +#define SUN4M_PROFILE_IRQ (OBP_INT_LEVEL_ONBOARD | 14) -static unsigned long irq_mask[0x50] = { +static unsigned long sun4m_imask[0x50] = { /* 0x00 - SMP */ 0, SUN4M_SOFT_INT(1), SUN4M_SOFT_INT(2), SUN4M_SOFT_INT(3), @@ -169,7 +174,7 @@ static unsigned long irq_mask[0x50] = { SUN4M_INT_VIDEO, SUN4M_INT_MODULE, SUN4M_INT_REALTIME, SUN4M_INT_FLOPPY, (SUN4M_INT_SERIAL | SUN4M_INT_KBDMS), - SUN4M_INT_AUDIO, 0, SUN4M_INT_MODULE_ERR, + SUN4M_INT_AUDIO, SUN4M_INT_E14, SUN4M_INT_MODULE_ERR, /* 0x30 - sbus */ 0, 0, SUN4M_INT_SBUS(0), SUN4M_INT_SBUS(1), 0, SUN4M_INT_SBUS(2), 0, SUN4M_INT_SBUS(3), @@ -182,105 +187,110 @@ static unsigned long irq_mask[0x50] = { 0, SUN4M_INT_VME(6), 0, 0 }; -static unsigned long sun4m_get_irqmask(unsigned int irq) +static void sun4m_mask_irq(struct irq_data *data) { - unsigned long mask; - - if (irq < 0x50) - mask = irq_mask[irq]; - else - mask = 0; + struct sun4m_handler_data *handler_data = data->handler_data; + int cpu = smp_processor_id(); - if (!mask) - printk(KERN_ERR "sun4m_get_irqmask: IRQ%d has no valid mask!\n", - irq); + if (handler_data->mask) { + unsigned long flags; - return mask; + local_irq_save(flags); + if (handler_data->percpu) { + sbus_writel(handler_data->mask, &sun4m_irq_percpu[cpu]->set); + } else { + sbus_writel(handler_data->mask, &sun4m_irq_global->mask_set); + } + local_irq_restore(flags); + } } -static void sun4m_disable_irq(unsigned int irq_nr) +static void sun4m_unmask_irq(struct irq_data *data) { - unsigned long mask, flags; + struct sun4m_handler_data *handler_data = data->handler_data; int cpu = smp_processor_id(); - mask = sun4m_get_irqmask(irq_nr); - local_irq_save(flags); - if (irq_nr > 15) - sbus_writel(mask, &sun4m_irq_global->mask_set); - else - sbus_writel(mask, &sun4m_irq_percpu[cpu]->set); - local_irq_restore(flags); -} - -static void sun4m_enable_irq(unsigned int irq_nr) -{ - unsigned long mask, flags; - int cpu = smp_processor_id(); + if (handler_data->mask) { + unsigned long flags; - /* Dreadful floppy hack. When we use 0x2b instead of - * 0x0b the system blows (it starts to whistle!). - * So we continue to use 0x0b. Fixme ASAP. --P3 - */ - if (irq_nr != 0x0b) { - mask = sun4m_get_irqmask(irq_nr); - local_irq_save(flags); - if (irq_nr > 15) - sbus_writel(mask, &sun4m_irq_global->mask_clear); - else - sbus_writel(mask, &sun4m_irq_percpu[cpu]->clear); - local_irq_restore(flags); - } else { local_irq_save(flags); - sbus_writel(SUN4M_INT_FLOPPY, &sun4m_irq_global->mask_clear); + if (handler_data->percpu) { + sbus_writel(handler_data->mask, &sun4m_irq_percpu[cpu]->clear); + } else { + sbus_writel(handler_data->mask, &sun4m_irq_global->mask_clear); + } local_irq_restore(flags); } } -static unsigned long cpu_pil_to_imask[16] = { -/*0*/ 0x00000000, -/*1*/ 0x00000000, -/*2*/ SUN4M_INT_SBUS(0) | SUN4M_INT_VME(0), -/*3*/ SUN4M_INT_SBUS(1) | SUN4M_INT_VME(1), -/*4*/ SUN4M_INT_SCSI, -/*5*/ SUN4M_INT_SBUS(2) | SUN4M_INT_VME(2), -/*6*/ SUN4M_INT_ETHERNET, -/*7*/ SUN4M_INT_SBUS(3) | SUN4M_INT_VME(3), -/*8*/ SUN4M_INT_VIDEO, -/*9*/ SUN4M_INT_SBUS(4) | SUN4M_INT_VME(4) | SUN4M_INT_MODULE_ERR, -/*10*/ SUN4M_INT_REALTIME, -/*11*/ SUN4M_INT_SBUS(5) | SUN4M_INT_VME(5) | SUN4M_INT_FLOPPY, -/*12*/ SUN4M_INT_SERIAL | SUN4M_INT_KBDMS, -/*13*/ SUN4M_INT_SBUS(6) | SUN4M_INT_VME(6) | SUN4M_INT_AUDIO, -/*14*/ SUN4M_INT_E14, -/*15*/ SUN4M_INT_ERROR, -}; +static unsigned int sun4m_startup_irq(struct irq_data *data) +{ + irq_link(data->irq); + sun4m_unmask_irq(data); + return 0; +} -/* We assume the caller has disabled local interrupts when these are called, - * or else very bizarre behavior will result. - */ -static void sun4m_disable_pil_irq(unsigned int pil) +static void sun4m_shutdown_irq(struct irq_data *data) { - sbus_writel(cpu_pil_to_imask[pil], &sun4m_irq_global->mask_set); + sun4m_mask_irq(data); + irq_unlink(data->irq); } -static void sun4m_enable_pil_irq(unsigned int pil) +static struct irq_chip sun4m_irq = { + .name = "sun4m", + .irq_startup = sun4m_startup_irq, + .irq_shutdown = sun4m_shutdown_irq, + .irq_mask = sun4m_mask_irq, + .irq_unmask = sun4m_unmask_irq, +}; + + +static unsigned int sun4m_build_device_irq(struct platform_device *op, + unsigned int real_irq) { - sbus_writel(cpu_pil_to_imask[pil], &sun4m_irq_global->mask_clear); + struct sun4m_handler_data *handler_data; + unsigned int irq; + unsigned int pil; + + if (real_irq >= OBP_INT_LEVEL_VME) { + prom_printf("Bogus sun4m IRQ %u\n", real_irq); + prom_halt(); + } + pil = (real_irq & 0xf); + irq = irq_alloc(real_irq, pil); + + if (irq == 0) + goto out; + + handler_data = irq_get_handler_data(irq); + if (unlikely(handler_data)) + goto out; + + handler_data = kzalloc(sizeof(struct sun4m_handler_data), GFP_ATOMIC); + if (unlikely(!handler_data)) { + prom_printf("IRQ: kzalloc(sun4m_handler_data) failed.\n"); + prom_halt(); + } + + handler_data->mask = sun4m_imask[real_irq]; + handler_data->percpu = real_irq < OBP_INT_LEVEL_ONBOARD; + irq_set_chip_and_handler_name(irq, &sun4m_irq, + handle_level_irq, "level"); + irq_set_handler_data(irq, handler_data); + +out: + return irq; } #ifdef CONFIG_SMP static void sun4m_send_ipi(int cpu, int level) { - unsigned long mask = sun4m_get_irqmask(level); - - sbus_writel(mask, &sun4m_irq_percpu[cpu]->set); + sbus_writel(SUN4M_SOFT_INT(level), &sun4m_irq_percpu[cpu]->set); } static void sun4m_clear_ipi(int cpu, int level) { - unsigned long mask = sun4m_get_irqmask(level); - - sbus_writel(mask, &sun4m_irq_percpu[cpu]->clear); + sbus_writel(SUN4M_SOFT_INT(level), &sun4m_irq_percpu[cpu]->clear); } static void sun4m_set_udt(int cpu) @@ -343,7 +353,15 @@ void sun4m_nmi(struct pt_regs *regs) prom_halt(); } -/* Exported for sun4m_smp.c */ +void sun4m_unmask_profile_irq(void) +{ + unsigned long flags; + + local_irq_save(flags); + sbus_writel(sun4m_imask[SUN4M_PROFILE_IRQ], &sun4m_irq_global->mask_clear); + local_irq_restore(flags); +} + void sun4m_clear_profile_irq(int cpu) { sbus_readl(&timers_percpu[cpu]->l14_limit); @@ -358,6 +376,7 @@ static void __init sun4m_init_timers(irq_handler_t counter_fn) { struct device_node *dp = of_find_node_by_name(NULL, "counter"); int i, err, len, num_cpu_timers; + unsigned int irq; const u32 *addr; if (!dp) { @@ -384,8 +403,9 @@ static void __init sun4m_init_timers(irq_handler_t counter_fn) master_l10_counter = &timers_global->l10_count; - err = request_irq(SUN4M_TIMER_IRQ, counter_fn, - (IRQF_DISABLED | SA_STATIC_ALLOC), "timer", NULL); + irq = sun4m_build_device_irq(NULL, SUN4M_TIMER_IRQ); + + err = request_irq(irq, counter_fn, IRQF_TIMER, "timer", NULL); if (err) { printk(KERN_ERR "sun4m_init_timers: Register IRQ error %d.\n", err); @@ -452,14 +472,11 @@ void __init sun4m_init_IRQ(void) if (num_cpu_iregs == 4) sbus_writel(0, &sun4m_irq_global->interrupt_target); - BTFIXUPSET_CALL(enable_irq, sun4m_enable_irq, BTFIXUPCALL_NORM); - BTFIXUPSET_CALL(disable_irq, sun4m_disable_irq, BTFIXUPCALL_NORM); - BTFIXUPSET_CALL(enable_pil_irq, sun4m_enable_pil_irq, BTFIXUPCALL_NORM); - BTFIXUPSET_CALL(disable_pil_irq, sun4m_disable_pil_irq, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(clear_clock_irq, sun4m_clear_clock_irq, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(load_profile_irq, sun4m_load_profile_irq, BTFIXUPCALL_NORM); sparc_irq_config.init_timers = sun4m_init_timers; + sparc_irq_config.build_device_irq = sun4m_build_device_irq; #ifdef CONFIG_SMP BTFIXUPSET_CALL(set_cpu_int, sun4m_send_ipi, BTFIXUPCALL_NORM); diff --git a/arch/sparc/kernel/sun4m_smp.c b/arch/sparc/kernel/sun4m_smp.c index 5cc7dc51de3..59476868652 100644 --- a/arch/sparc/kernel/sun4m_smp.c +++ b/arch/sparc/kernel/sun4m_smp.c @@ -15,6 +15,9 @@ #include "irq.h" #include "kernel.h" +#define IRQ_IPI_SINGLE 12 +#define IRQ_IPI_MASK 13 +#define IRQ_IPI_RESCHED 14 #define IRQ_CROSS_CALL 15 static inline unsigned long @@ -26,6 +29,7 @@ swap_ulong(volatile unsigned long *ptr, unsigned long val) return val; } +static void smp4m_ipi_init(void); static void smp_setup_percpu_timer(void); void __cpuinit smp4m_callin(void) @@ -59,8 +63,6 @@ void __cpuinit smp4m_callin(void) local_flush_cache_all(); local_flush_tlb_all(); - cpu_probe(); - /* Fix idle thread fields. */ __asm__ __volatile__("ld [%0], %%g6\n\t" : : "r" (¤t_set[cpuid]) @@ -70,7 +72,7 @@ void __cpuinit smp4m_callin(void) atomic_inc(&init_mm.mm_count); current->active_mm = &init_mm; - while (!cpu_isset(cpuid, smp_commenced_mask)) + while (!cpumask_test_cpu(cpuid, &smp_commenced_mask)) mb(); local_irq_enable(); @@ -83,6 +85,7 @@ void __cpuinit smp4m_callin(void) */ void __init smp4m_boot_cpus(void) { + smp4m_ipi_init(); smp_setup_percpu_timer(); local_flush_cache_all(); } @@ -150,18 +153,25 @@ void __init smp4m_smp_done(void) /* Ok, they are spinning and ready to go. */ } -/* At each hardware IRQ, we get this called to forward IRQ reception - * to the next processor. The caller must disable the IRQ level being - * serviced globally so that there are no double interrupts received. - * - * XXX See sparc64 irq.c. - */ -void smp4m_irq_rotate(int cpu) + +/* Initialize IPIs on the SUN4M SMP machine */ +static void __init smp4m_ipi_init(void) +{ +} + +static void smp4m_ipi_resched(int cpu) +{ + set_cpu_int(cpu, IRQ_IPI_RESCHED); +} + +static void smp4m_ipi_single(int cpu) { - int next = cpu_data(cpu).next; + set_cpu_int(cpu, IRQ_IPI_SINGLE); +} - if (next != cpu) - set_irq_udt(next); +static void smp4m_ipi_mask_one(int cpu) +{ + set_cpu_int(cpu, IRQ_IPI_MASK); } static struct smp_funcall { @@ -199,10 +209,10 @@ static void smp4m_cross_call(smpfunc_t func, cpumask_t mask, unsigned long arg1, { register int i; - cpu_clear(smp_processor_id(), mask); - cpus_and(mask, cpu_online_map, mask); + cpumask_clear_cpu(smp_processor_id(), &mask); + cpumask_and(&mask, cpu_online_mask, &mask); for (i = 0; i < ncpus; i++) { - if (cpu_isset(i, mask)) { + if (cpumask_test_cpu(i, &mask)) { ccall_info.processors_in[i] = 0; ccall_info.processors_out[i] = 0; set_cpu_int(i, IRQ_CROSS_CALL); @@ -218,7 +228,7 @@ static void smp4m_cross_call(smpfunc_t func, cpumask_t mask, unsigned long arg1, i = 0; do { - if (!cpu_isset(i, mask)) + if (!cpumask_test_cpu(i, &mask)) continue; while (!ccall_info.processors_in[i]) barrier(); @@ -226,7 +236,7 @@ static void smp4m_cross_call(smpfunc_t func, cpumask_t mask, unsigned long arg1, i = 0; do { - if (!cpu_isset(i, mask)) + if (!cpumask_test_cpu(i, &mask)) continue; while (!ccall_info.processors_out[i]) barrier(); @@ -277,7 +287,7 @@ static void __cpuinit smp_setup_percpu_timer(void) load_profile_irq(cpu, lvl14_resolution); if (cpu == boot_cpu_id) - enable_pil_irq(14); + sun4m_unmask_profile_irq(); } static void __init smp4m_blackbox_id(unsigned *addr) @@ -306,4 +316,7 @@ void __init sun4m_init_smp(void) BTFIXUPSET_BLACKBOX(load_current, smp4m_blackbox_current); BTFIXUPSET_CALL(smp_cross_call, smp4m_cross_call, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(__hard_smp_processor_id, __smp4m_processor_id, BTFIXUPCALL_NORM); + BTFIXUPSET_CALL(smp_ipi_resched, smp4m_ipi_resched, BTFIXUPCALL_NORM); + BTFIXUPSET_CALL(smp_ipi_single, smp4m_ipi_single, BTFIXUPCALL_NORM); + BTFIXUPSET_CALL(smp_ipi_mask_one, smp4m_ipi_mask_one, BTFIXUPCALL_NORM); } diff --git a/arch/sparc/kernel/sysfs.c b/arch/sparc/kernel/sysfs.c index 1eb8b00aed7..7408201d7ef 100644 --- a/arch/sparc/kernel/sysfs.c +++ b/arch/sparc/kernel/sysfs.c @@ -103,9 +103,10 @@ static unsigned long run_on_cpu(unsigned long cpu, unsigned long (*func)(unsigned long), unsigned long arg) { - cpumask_t old_affinity = current->cpus_allowed; + cpumask_t old_affinity; unsigned long ret; + cpumask_copy(&old_affinity, tsk_cpus_allowed(current)); /* should return -EINVAL to userspace */ if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) return 0; diff --git a/arch/sparc/kernel/time_32.c b/arch/sparc/kernel/time_32.c index 96046a4024c..1060e0672a4 100644 --- a/arch/sparc/kernel/time_32.c +++ b/arch/sparc/kernel/time_32.c @@ -228,14 +228,10 @@ static void __init sbus_time_init(void) void __init time_init(void) { -#ifdef CONFIG_PCI - extern void pci_time_init(void); - if (pcic_present()) { + if (pcic_present()) pci_time_init(); - return; - } -#endif - sbus_time_init(); + else + sbus_time_init(); } diff --git a/arch/sparc/kernel/us2e_cpufreq.c b/arch/sparc/kernel/us2e_cpufreq.c index 8f982b76c71..531d54fc982 100644 --- a/arch/sparc/kernel/us2e_cpufreq.c +++ b/arch/sparc/kernel/us2e_cpufreq.c @@ -237,7 +237,7 @@ static unsigned int us2e_freq_get(unsigned int cpu) if (!cpu_online(cpu)) return 0; - cpus_allowed = current->cpus_allowed; + cpumask_copy(&cpus_allowed, tsk_cpus_allowed(current)); set_cpus_allowed_ptr(current, cpumask_of(cpu)); clock_tick = sparc64_get_clock_tick(cpu) / 1000; @@ -258,7 +258,7 @@ static void us2e_set_cpu_divider_index(unsigned int cpu, unsigned int index) if (!cpu_online(cpu)) return; - cpus_allowed = current->cpus_allowed; + cpumask_copy(&cpus_allowed, tsk_cpus_allowed(current)); set_cpus_allowed_ptr(current, cpumask_of(cpu)); new_freq = clock_tick = sparc64_get_clock_tick(cpu) / 1000; diff --git a/arch/sparc/kernel/us3_cpufreq.c b/arch/sparc/kernel/us3_cpufreq.c index f35d1e79454..9a8ceb70083 100644 --- a/arch/sparc/kernel/us3_cpufreq.c +++ b/arch/sparc/kernel/us3_cpufreq.c @@ -85,7 +85,7 @@ static unsigned int us3_freq_get(unsigned int cpu) if (!cpu_online(cpu)) return 0; - cpus_allowed = current->cpus_allowed; + cpumask_copy(&cpus_allowed, tsk_cpus_allowed(current)); set_cpus_allowed_ptr(current, cpumask_of(cpu)); reg = read_safari_cfg(); @@ -105,7 +105,7 @@ static void us3_set_cpu_divider_index(unsigned int cpu, unsigned int index) if (!cpu_online(cpu)) return; - cpus_allowed = current->cpus_allowed; + cpumask_copy(&cpus_allowed, tsk_cpus_allowed(current)); set_cpus_allowed_ptr(current, cpumask_of(cpu)); new_freq = sparc64_get_clock_tick(cpu) / 1000; diff --git a/arch/sparc/lib/Makefile b/arch/sparc/lib/Makefile index 846d1c4374e..7f01b8fce8b 100644 --- a/arch/sparc/lib/Makefile +++ b/arch/sparc/lib/Makefile @@ -15,7 +15,6 @@ lib-$(CONFIG_SPARC32) += divdi3.o udivdi3.o lib-$(CONFIG_SPARC32) += copy_user.o locks.o lib-y += atomic_$(BITS).o lib-$(CONFIG_SPARC32) += lshrdi3.o ashldi3.o -lib-$(CONFIG_SPARC32) += rwsem_32.o lib-$(CONFIG_SPARC32) += muldi3.o bitext.o cmpdi2.o lib-$(CONFIG_SPARC64) += copy_page.o clear_page.o bzero.o diff --git a/arch/sparc/lib/rwsem_32.S b/arch/sparc/lib/rwsem_32.S deleted file mode 100644 index 9675268e7fd..00000000000 --- a/arch/sparc/lib/rwsem_32.S +++ /dev/null @@ -1,204 +0,0 @@ -/* - * Assembly part of rw semaphores. - * - * Copyright (C) 1999 Jakub Jelinek (jakub@redhat.com) - */ - -#include <asm/ptrace.h> -#include <asm/psr.h> - - .section .sched.text, "ax" - .align 4 - - .globl ___down_read -___down_read: - rd %psr, %g3 - nop - nop - nop - or %g3, PSR_PIL, %g7 - wr %g7, 0, %psr - nop - nop - nop -#ifdef CONFIG_SMP -1: ldstub [%g1 + 4], %g7 - tst %g7 - bne 1b - ld [%g1], %g7 - sub %g7, 1, %g7 - st %g7, [%g1] - stb %g0, [%g1 + 4] -#else - ld [%g1], %g7 - sub %g7, 1, %g7 - st %g7, [%g1] -#endif - wr %g3, 0, %psr - add %g7, 1, %g7 - nop - nop - subcc %g7, 1, %g7 - bneg 3f - nop -2: jmpl %o7, %g0 - mov %g4, %o7 -3: save %sp, -64, %sp - mov %g1, %l1 - mov %g4, %l4 - bcs 4f - mov %g5, %l5 - call down_read_failed - mov %l1, %o0 - mov %l1, %g1 - mov %l4, %g4 - ba ___down_read - restore %l5, %g0, %g5 -4: call down_read_failed_biased - mov %l1, %o0 - mov %l1, %g1 - mov %l4, %g4 - ba 2b - restore %l5, %g0, %g5 - - .globl ___down_write -___down_write: - rd %psr, %g3 - nop - nop - nop - or %g3, PSR_PIL, %g7 - wr %g7, 0, %psr - sethi %hi(0x01000000), %g2 - nop - nop -#ifdef CONFIG_SMP -1: ldstub [%g1 + 4], %g7 - tst %g7 - bne 1b - ld [%g1], %g7 - sub %g7, %g2, %g7 - st %g7, [%g1] - stb %g0, [%g1 + 4] -#else - ld [%g1], %g7 - sub %g7, %g2, %g7 - st %g7, [%g1] -#endif - wr %g3, 0, %psr - add %g7, %g2, %g7 - nop - nop - subcc %g7, %g2, %g7 - bne 3f - nop -2: jmpl %o7, %g0 - mov %g4, %o7 -3: save %sp, -64, %sp - mov %g1, %l1 - mov %g4, %l4 - bcs 4f - mov %g5, %l5 - call down_write_failed - mov %l1, %o0 - mov %l1, %g1 - mov %l4, %g4 - ba ___down_write - restore %l5, %g0, %g5 -4: call down_write_failed_biased - mov %l1, %o0 - mov %l1, %g1 - mov %l4, %g4 - ba 2b - restore %l5, %g0, %g5 - - .text - .globl ___up_read -___up_read: - rd %psr, %g3 - nop - nop - nop - or %g3, PSR_PIL, %g7 - wr %g7, 0, %psr - nop - nop - nop -#ifdef CONFIG_SMP -1: ldstub [%g1 + 4], %g7 - tst %g7 - bne 1b - ld [%g1], %g7 - add %g7, 1, %g7 - st %g7, [%g1] - stb %g0, [%g1 + 4] -#else - ld [%g1], %g7 - add %g7, 1, %g7 - st %g7, [%g1] -#endif - wr %g3, 0, %psr - nop - nop - nop - cmp %g7, 0 - be 3f - nop -2: jmpl %o7, %g0 - mov %g4, %o7 -3: save %sp, -64, %sp - mov %g1, %l1 - mov %g4, %l4 - mov %g5, %l5 - clr %o1 - call __rwsem_wake - mov %l1, %o0 - mov %l1, %g1 - mov %l4, %g4 - ba 2b - restore %l5, %g0, %g5 - - .globl ___up_write -___up_write: - rd %psr, %g3 - nop - nop - nop - or %g3, PSR_PIL, %g7 - wr %g7, 0, %psr - sethi %hi(0x01000000), %g2 - nop - nop -#ifdef CONFIG_SMP -1: ldstub [%g1 + 4], %g7 - tst %g7 - bne 1b - ld [%g1], %g7 - add %g7, %g2, %g7 - st %g7, [%g1] - stb %g0, [%g1 + 4] -#else - ld [%g1], %g7 - add %g7, %g2, %g7 - st %g7, [%g1] -#endif - wr %g3, 0, %psr - sub %g7, %g2, %g7 - nop - nop - addcc %g7, %g2, %g7 - bcs 3f - nop -2: jmpl %o7, %g0 - mov %g4, %o7 -3: save %sp, -64, %sp - mov %g1, %l1 - mov %g4, %l4 - mov %g5, %l5 - mov %g7, %o1 - call __rwsem_wake - mov %l1, %o0 - mov %l1, %g1 - mov %l4, %g4 - ba 2b - restore %l5, %g0, %g5 diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 2f6ae1d1fb6..e10cd03fab8 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -862,7 +862,7 @@ static void init_node_masks_nonnuma(void) for (i = 0; i < NR_CPUS; i++) numa_cpu_lookup_table[i] = 0; - numa_cpumask_lookup_table[0] = CPU_MASK_ALL; + cpumask_setall(&numa_cpumask_lookup_table[0]); } #ifdef CONFIG_NEED_MULTIPLE_NODES @@ -1080,7 +1080,7 @@ static void __init numa_parse_mdesc_group_cpus(struct mdesc_handle *md, { u64 arc; - cpus_clear(*mask); + cpumask_clear(mask); mdesc_for_each_arc(arc, md, grp, MDESC_ARC_TYPE_BACK) { u64 target = mdesc_arc_target(md, arc); @@ -1091,7 +1091,7 @@ static void __init numa_parse_mdesc_group_cpus(struct mdesc_handle *md, continue; id = mdesc_get_property(md, target, "id", NULL); if (*id < nr_cpu_ids) - cpu_set(*id, *mask); + cpumask_set_cpu(*id, mask); } } @@ -1153,13 +1153,13 @@ static int __init numa_parse_mdesc_group(struct mdesc_handle *md, u64 grp, numa_parse_mdesc_group_cpus(md, grp, &mask); - for_each_cpu_mask(cpu, mask) + for_each_cpu(cpu, &mask) numa_cpu_lookup_table[cpu] = index; - numa_cpumask_lookup_table[index] = mask; + cpumask_copy(&numa_cpumask_lookup_table[index], &mask); if (numa_debug) { printk(KERN_INFO "NUMA GROUP[%d]: cpus [ ", index); - for_each_cpu_mask(cpu, mask) + for_each_cpu(cpu, &mask) printk("%d ", cpu); printk("]\n"); } @@ -1218,7 +1218,7 @@ static int __init numa_parse_jbus(void) index = 0; for_each_present_cpu(cpu) { numa_cpu_lookup_table[cpu] = index; - numa_cpumask_lookup_table[index] = cpumask_of_cpu(cpu); + cpumask_copy(&numa_cpumask_lookup_table[index], cpumask_of(cpu)); node_masks[index].mask = ~((1UL << 36UL) - 1UL); node_masks[index].val = cpu << 36UL; diff --git a/arch/um/drivers/mmapper_kern.c b/arch/um/drivers/mmapper_kern.c index 7e0619c2c2c..c0ef803c7c7 100644 --- a/arch/um/drivers/mmapper_kern.c +++ b/arch/um/drivers/mmapper_kern.c @@ -116,7 +116,7 @@ static int __init mmapper_init(void) if (err) { printk(KERN_ERR "mmapper - misc_register failed, err = %d\n", err); - return err;; + return err; } return 0; } diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index a0c46f06121..4a0b7c7e2cc 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -381,6 +381,26 @@ struct apic { extern struct apic *apic; /* + * APIC drivers are probed based on how they are listed in the .apicdrivers + * section. So the order is important and enforced by the ordering + * of different apic driver files in the Makefile. + * + * For the files having two apic drivers, we use apic_drivers() + * to enforce the order with in them. + */ +#define apic_driver(sym) \ + static struct apic *__apicdrivers_##sym __used \ + __aligned(sizeof(struct apic *)) \ + __section(.apicdrivers) = { &sym } + +#define apic_drivers(sym1, sym2) \ + static struct apic *__apicdrivers_##sym1##sym2[2] __used \ + __aligned(sizeof(struct apic *)) \ + __section(.apicdrivers) = { &sym1, &sym2 } + +extern struct apic *__apicdrivers[], *__apicdrivers_end[]; + +/* * APIC functionality to boot other CPUs - only used on SMP: */ #ifdef CONFIG_SMP @@ -458,15 +478,10 @@ static inline unsigned default_get_apic_id(unsigned long x) #define DEFAULT_TRAMPOLINE_PHYS_HIGH 0x469 #ifdef CONFIG_X86_64 -extern struct apic apic_flat; -extern struct apic apic_physflat; -extern struct apic apic_x2apic_cluster; -extern struct apic apic_x2apic_phys; extern int default_acpi_madt_oem_check(char *, char *); extern void apic_send_IPI_self(int vector); -extern struct apic apic_x2apic_uv_x; DECLARE_PER_CPU(int, x2apic_extra_bits); extern int default_cpu_present_to_apicid(int mps_cpu); @@ -480,7 +495,7 @@ static inline void default_wait_for_init_deassert(atomic_t *deassert) return; } -extern void generic_bigsmp_probe(void); +extern struct apic *generic_bigsmp_probe(void); #ifdef CONFIG_X86_LOCAL_APIC @@ -516,8 +531,6 @@ extern struct apic apic_noop; #ifdef CONFIG_X86_32 -extern struct apic apic_default; - static inline int noop_x86_32_early_logical_apicid(int cpu) { return BAD_APICID; diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index a97a240f67f..690d1cc9a87 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -105,12 +105,12 @@ struct IR_IO_APIC_route_entry { * # of IO-APICs and # of IRQ routing registers */ extern int nr_ioapics; -extern int nr_ioapic_registers[MAX_IO_APICS]; -#define MP_MAX_IOAPIC_PIN 127 +extern int mpc_ioapic_id(int ioapic); +extern unsigned int mpc_ioapic_addr(int ioapic); +extern struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic); -/* I/O APIC entries */ -extern struct mpc_ioapic mp_ioapics[MAX_IO_APICS]; +#define MP_MAX_IOAPIC_PIN 127 /* # of MP IRQ source entries */ extern int mp_irq_entries; @@ -152,11 +152,9 @@ extern void ioapic_insert_resources(void); int io_apic_setup_irq_pin_once(unsigned int irq, int node, struct io_apic_irq_attr *attr); -extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); -extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries); -extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); -extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); -extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); +extern int save_ioapic_entries(void); +extern void mask_ioapic_entries(void); +extern int restore_ioapic_entries(void); extern int get_nr_irqs_gsi(void); @@ -192,19 +190,13 @@ struct io_apic_irq_attr; static inline int io_apic_set_pci_routing(struct device *dev, int irq, struct io_apic_irq_attr *irq_attr) { return 0; } -static inline struct IO_APIC_route_entry **alloc_ioapic_entries(void) -{ - return NULL; -} - -static inline void free_ioapic_entries(struct IO_APIC_route_entry **ent) { } -static inline int save_IO_APIC_setup(struct IO_APIC_route_entry **ent) +static inline int save_ioapic_entries(void) { return -ENOMEM; } -static inline void mask_IO_APIC_setup(struct IO_APIC_route_entry **ent) { } -static inline int restore_IO_APIC_setup(struct IO_APIC_route_entry **ent) +static inline void mask_ioapic_entries(void) { } +static inline int restore_ioapic_entries(void) { return -ENOMEM; } diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 0f521356432..0049211959c 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -14,6 +14,8 @@ #include <asm/desc_defs.h> struct x86_emulate_ctxt; +enum x86_intercept; +enum x86_intercept_stage; struct x86_exception { u8 vector; @@ -24,6 +26,24 @@ struct x86_exception { }; /* + * This struct is used to carry enough information from the instruction + * decoder to main KVM so that a decision can be made whether the + * instruction needs to be intercepted or not. + */ +struct x86_instruction_info { + u8 intercept; /* which intercept */ + u8 rep_prefix; /* rep prefix? */ + u8 modrm_mod; /* mod part of modrm */ + u8 modrm_reg; /* index of register used */ + u8 modrm_rm; /* rm part of modrm */ + u64 src_val; /* value of source operand */ + u8 src_bytes; /* size of source operand */ + u8 dst_bytes; /* size of destination operand */ + u8 ad_bytes; /* size of src/dst address */ + u64 next_rip; /* rip following the instruction */ +}; + +/* * x86_emulate_ops: * * These operations represent the instruction emulator's interface to memory. @@ -62,6 +82,7 @@ struct x86_exception { #define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */ #define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */ #define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */ +#define X86EMUL_INTERCEPTED 6 /* Intercepted by nested VMCB/VMCS */ struct x86_emulate_ops { /* @@ -71,8 +92,9 @@ struct x86_emulate_ops { * @val: [OUT] Value read from memory, zero-extended to 'u_long'. * @bytes: [IN ] Number of bytes to read from memory. */ - int (*read_std)(unsigned long addr, void *val, - unsigned int bytes, struct kvm_vcpu *vcpu, + int (*read_std)(struct x86_emulate_ctxt *ctxt, + unsigned long addr, void *val, + unsigned int bytes, struct x86_exception *fault); /* @@ -82,8 +104,8 @@ struct x86_emulate_ops { * @val: [OUT] Value write to memory, zero-extended to 'u_long'. * @bytes: [IN ] Number of bytes to write to memory. */ - int (*write_std)(unsigned long addr, void *val, - unsigned int bytes, struct kvm_vcpu *vcpu, + int (*write_std)(struct x86_emulate_ctxt *ctxt, + unsigned long addr, void *val, unsigned int bytes, struct x86_exception *fault); /* * fetch: Read bytes of standard (non-emulated/special) memory. @@ -92,8 +114,8 @@ struct x86_emulate_ops { * @val: [OUT] Value read from memory, zero-extended to 'u_long'. * @bytes: [IN ] Number of bytes to read from memory. */ - int (*fetch)(unsigned long addr, void *val, - unsigned int bytes, struct kvm_vcpu *vcpu, + int (*fetch)(struct x86_emulate_ctxt *ctxt, + unsigned long addr, void *val, unsigned int bytes, struct x86_exception *fault); /* @@ -102,11 +124,9 @@ struct x86_emulate_ops { * @val: [OUT] Value read from memory, zero-extended to 'u_long'. * @bytes: [IN ] Number of bytes to read from memory. */ - int (*read_emulated)(unsigned long addr, - void *val, - unsigned int bytes, - struct x86_exception *fault, - struct kvm_vcpu *vcpu); + int (*read_emulated)(struct x86_emulate_ctxt *ctxt, + unsigned long addr, void *val, unsigned int bytes, + struct x86_exception *fault); /* * write_emulated: Write bytes to emulated/special memory area. @@ -115,11 +135,10 @@ struct x86_emulate_ops { * required). * @bytes: [IN ] Number of bytes to write to memory. */ - int (*write_emulated)(unsigned long addr, - const void *val, + int (*write_emulated)(struct x86_emulate_ctxt *ctxt, + unsigned long addr, const void *val, unsigned int bytes, - struct x86_exception *fault, - struct kvm_vcpu *vcpu); + struct x86_exception *fault); /* * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an @@ -129,40 +148,54 @@ struct x86_emulate_ops { * @new: [IN ] Value to write to @addr. * @bytes: [IN ] Number of bytes to access using CMPXCHG. */ - int (*cmpxchg_emulated)(unsigned long addr, + int (*cmpxchg_emulated)(struct x86_emulate_ctxt *ctxt, + unsigned long addr, const void *old, const void *new, unsigned int bytes, - struct x86_exception *fault, - struct kvm_vcpu *vcpu); - - int (*pio_in_emulated)(int size, unsigned short port, void *val, - unsigned int count, struct kvm_vcpu *vcpu); - - int (*pio_out_emulated)(int size, unsigned short port, const void *val, - unsigned int count, struct kvm_vcpu *vcpu); - - bool (*get_cached_descriptor)(struct desc_struct *desc, u32 *base3, - int seg, struct kvm_vcpu *vcpu); - void (*set_cached_descriptor)(struct desc_struct *desc, u32 base3, - int seg, struct kvm_vcpu *vcpu); - u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu); - void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); - unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu); - void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); - void (*get_idt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); - ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); - int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); - int (*cpl)(struct kvm_vcpu *vcpu); - int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu); - int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu); - int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); - int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); + struct x86_exception *fault); + void (*invlpg)(struct x86_emulate_ctxt *ctxt, ulong addr); + + int (*pio_in_emulated)(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, void *val, + unsigned int count); + + int (*pio_out_emulated)(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, const void *val, + unsigned int count); + + bool (*get_segment)(struct x86_emulate_ctxt *ctxt, u16 *selector, + struct desc_struct *desc, u32 *base3, int seg); + void (*set_segment)(struct x86_emulate_ctxt *ctxt, u16 selector, + struct desc_struct *desc, u32 base3, int seg); + unsigned long (*get_cached_segment_base)(struct x86_emulate_ctxt *ctxt, + int seg); + void (*get_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); + void (*get_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); + void (*set_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); + void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); + ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr); + int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val); + int (*cpl)(struct x86_emulate_ctxt *ctxt); + int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest); + int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); + int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data); + int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); + void (*halt)(struct x86_emulate_ctxt *ctxt); + void (*wbinvd)(struct x86_emulate_ctxt *ctxt); + int (*fix_hypercall)(struct x86_emulate_ctxt *ctxt); + void (*get_fpu)(struct x86_emulate_ctxt *ctxt); /* disables preempt */ + void (*put_fpu)(struct x86_emulate_ctxt *ctxt); /* reenables preempt */ + int (*intercept)(struct x86_emulate_ctxt *ctxt, + struct x86_instruction_info *info, + enum x86_intercept_stage stage); }; +typedef u32 __attribute__((vector_size(16))) sse128_t; + /* Type, address-of, and value of an instruction's operand. */ struct operand { - enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; + enum { OP_REG, OP_MEM, OP_IMM, OP_XMM, OP_NONE } type; unsigned int bytes; union { unsigned long orig_val; @@ -174,11 +207,13 @@ struct operand { ulong ea; unsigned seg; } mem; + unsigned xmm; } addr; union { unsigned long val; u64 val64; char valptr[sizeof(unsigned long) + 2]; + sse128_t vec_val; }; }; @@ -197,6 +232,7 @@ struct read_cache { struct decode_cache { u8 twobyte; u8 b; + u8 intercept; u8 lock_prefix; u8 rep_prefix; u8 op_bytes; @@ -209,6 +245,7 @@ struct decode_cache { u8 seg_override; unsigned int d; int (*execute)(struct x86_emulate_ctxt *ctxt); + int (*check_perm)(struct x86_emulate_ctxt *ctxt); unsigned long regs[NR_VCPU_REGS]; unsigned long eip; /* modrm */ @@ -227,17 +264,15 @@ struct x86_emulate_ctxt { struct x86_emulate_ops *ops; /* Register state before/after emulation. */ - struct kvm_vcpu *vcpu; - unsigned long eflags; unsigned long eip; /* eip before instruction emulation */ /* Emulated execution mode, represented by an X86EMUL_MODE value. */ int mode; - u32 cs_base; /* interruptibility state, as a result of execution of STI or MOV SS */ int interruptibility; + bool guest_mode; /* guest running a nested guest */ bool perm_ok; /* do not check permissions if true */ bool only_vendor_specific_insn; @@ -249,8 +284,8 @@ struct x86_emulate_ctxt { }; /* Repeat String Operation Prefix */ -#define REPE_PREFIX 1 -#define REPNE_PREFIX 2 +#define REPE_PREFIX 0xf3 +#define REPNE_PREFIX 0xf2 /* Execution mode, passed to the emulator. */ #define X86EMUL_MODE_REAL 0 /* Real mode. */ @@ -259,6 +294,69 @@ struct x86_emulate_ctxt { #define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ #define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ +/* any protected mode */ +#define X86EMUL_MODE_PROT (X86EMUL_MODE_PROT16|X86EMUL_MODE_PROT32| \ + X86EMUL_MODE_PROT64) + +enum x86_intercept_stage { + X86_ICTP_NONE = 0, /* Allow zero-init to not match anything */ + X86_ICPT_PRE_EXCEPT, + X86_ICPT_POST_EXCEPT, + X86_ICPT_POST_MEMACCESS, +}; + +enum x86_intercept { + x86_intercept_none, + x86_intercept_cr_read, + x86_intercept_cr_write, + x86_intercept_clts, + x86_intercept_lmsw, + x86_intercept_smsw, + x86_intercept_dr_read, + x86_intercept_dr_write, + x86_intercept_lidt, + x86_intercept_sidt, + x86_intercept_lgdt, + x86_intercept_sgdt, + x86_intercept_lldt, + x86_intercept_sldt, + x86_intercept_ltr, + x86_intercept_str, + x86_intercept_rdtsc, + x86_intercept_rdpmc, + x86_intercept_pushf, + x86_intercept_popf, + x86_intercept_cpuid, + x86_intercept_rsm, + x86_intercept_iret, + x86_intercept_intn, + x86_intercept_invd, + x86_intercept_pause, + x86_intercept_hlt, + x86_intercept_invlpg, + x86_intercept_invlpga, + x86_intercept_vmrun, + x86_intercept_vmload, + x86_intercept_vmsave, + x86_intercept_vmmcall, + x86_intercept_stgi, + x86_intercept_clgi, + x86_intercept_skinit, + x86_intercept_rdtscp, + x86_intercept_icebp, + x86_intercept_wbinvd, + x86_intercept_monitor, + x86_intercept_mwait, + x86_intercept_rdmsr, + x86_intercept_wrmsr, + x86_intercept_in, + x86_intercept_ins, + x86_intercept_out, + x86_intercept_outs, + + nr_x86_intercepts +}; + /* Host execution mode. */ #if defined(CONFIG_X86_32) #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32 @@ -270,6 +368,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len); #define EMULATION_FAILED -1 #define EMULATION_OK 0 #define EMULATION_RESTART 1 +#define EMULATION_INTERCEPTED 2 int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); int emulator_task_switch(struct x86_emulate_ctxt *ctxt, u16 tss_selector, int reason, diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c8af0991fdf..d2ac8e2ee89 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -30,14 +30,30 @@ #define KVM_MEMORY_SLOTS 32 /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 4 +#define KVM_MMIO_SIZE 16 #define KVM_PIO_PAGE_OFFSET 1 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2 +#define CR0_RESERVED_BITS \ + (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ + | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ + | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) + #define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) #define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ 0xFFFFFF0000000000ULL) +#define CR4_RESERVED_BITS \ + (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ + | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ + | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXSAVE \ + | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) + +#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) + + #define INVALID_PAGE (~(hpa_t)0) #define VALID_PAGE(x) ((x) != INVALID_PAGE) @@ -118,6 +134,9 @@ enum kvm_reg { enum kvm_reg_ex { VCPU_EXREG_PDPTR = NR_VCPU_REGS, VCPU_EXREG_CR3, + VCPU_EXREG_RFLAGS, + VCPU_EXREG_CPL, + VCPU_EXREG_SEGMENTS, }; enum { @@ -256,7 +275,7 @@ struct kvm_mmu { struct kvm_mmu_page *sp); void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *spte, const void *pte, unsigned long mmu_seq); + u64 *spte, const void *pte); hpa_t root_hpa; int root_level; int shadow_root_level; @@ -340,7 +359,6 @@ struct kvm_vcpu_arch { struct fpu guest_fpu; u64 xcr0; - gva_t mmio_fault_cr2; struct kvm_pio_request pio; void *pio_data; @@ -367,18 +385,22 @@ struct kvm_vcpu_arch { /* emulate context */ struct x86_emulate_ctxt emulate_ctxt; + bool emulate_regs_need_sync_to_vcpu; + bool emulate_regs_need_sync_from_vcpu; gpa_t time; struct pvclock_vcpu_time_info hv_clock; unsigned int hw_tsc_khz; unsigned int time_offset; struct page *time_page; - u64 last_host_tsc; u64 last_guest_tsc; u64 last_kernel_ns; u64 last_tsc_nsec; u64 last_tsc_write; + u32 virtual_tsc_khz; bool tsc_catchup; + u32 tsc_catchup_mult; + s8 tsc_catchup_shift; bool nmi_pending; bool nmi_injected; @@ -448,9 +470,6 @@ struct kvm_arch { u64 last_tsc_nsec; u64 last_tsc_offset; u64 last_tsc_write; - u32 virtual_tsc_khz; - u32 virtual_tsc_mult; - s8 virtual_tsc_shift; struct kvm_xen_hvm_config xen_hvm_config; @@ -502,6 +521,8 @@ struct kvm_vcpu_stat { u32 nmi_injections; }; +struct x86_instruction_info; + struct kvm_x86_ops { int (*cpu_has_kvm_support)(void); /* __init */ int (*disabled_by_bios)(void); /* __init */ @@ -586,9 +607,17 @@ struct kvm_x86_ops { bool (*has_wbinvd_exit)(void); + void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz); void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); + u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc); + void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); + + int (*check_intercept)(struct kvm_vcpu *vcpu, + struct x86_instruction_info *info, + enum x86_intercept_stage stage); + const struct trace_print_flags *exit_reasons_str; }; @@ -627,6 +656,13 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); extern bool tdp_enabled; +/* control of guest tsc rate supported? */ +extern bool kvm_has_tsc_control; +/* minimum supported tsc_khz for guests */ +extern u32 kvm_min_guest_tsc_khz; +/* maximum supported tsc_khz for guests */ +extern u32 kvm_max_guest_tsc_khz; + enum emulation_result { EMULATE_DONE, /* no further processing */ EMULATE_DO_MMIO, /* kvm_run filled with mmio request */ @@ -645,9 +681,6 @@ static inline int emulate_instruction(struct kvm_vcpu *vcpu, return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0); } -void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); -void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); - void kvm_enable_efer_bits(u64); int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); @@ -657,8 +690,6 @@ struct x86_emulate_ctxt; int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port); void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int kvm_emulate_halt(struct kvm_vcpu *vcpu); -int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); -int emulate_clts(struct kvm_vcpu *vcpu); int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -721,8 +752,6 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); -int kvm_fix_hypercall(struct kvm_vcpu *vcpu); - int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, void *insn, int insn_len); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 3cce71413d0..485b4f1f079 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -118,6 +118,7 @@ complete list. */ #define MSR_AMD64_PATCH_LEVEL 0x0000008b +#define MSR_AMD64_TSC_RATIO 0xc0000104 #define MSR_AMD64_NB_CFG 0xc001001f #define MSR_AMD64_PATCH_LOADER 0xc0010020 #define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 diff --git a/arch/x86/include/asm/x2apic.h b/arch/x86/include/asm/x2apic.h new file mode 100644 index 00000000000..6bf5b8e478c --- /dev/null +++ b/arch/x86/include/asm/x2apic.h @@ -0,0 +1,62 @@ +/* + * Common bits for X2APIC cluster/physical modes. + */ + +#ifndef _ASM_X86_X2APIC_H +#define _ASM_X86_X2APIC_H + +#include <asm/apic.h> +#include <asm/ipi.h> +#include <linux/cpumask.h> + +/* + * Need to use more than cpu 0, because we need more vectors + * when MSI-X are used. + */ +static const struct cpumask *x2apic_target_cpus(void) +{ + return cpu_online_mask; +} + +static int x2apic_apic_id_registered(void) +{ + return 1; +} + +/* + * For now each logical cpu is in its own vector allocation domain. + */ +static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) +{ + cpumask_clear(retmask); + cpumask_set_cpu(cpu, retmask); +} + +static void +__x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest) +{ + unsigned long cfg = __prepare_ICR(0, vector, dest); + native_x2apic_icr_write(cfg, apicid); +} + +static unsigned int x2apic_get_apic_id(unsigned long id) +{ + return id; +} + +static unsigned long x2apic_set_apic_id(unsigned int id) +{ + return id; +} + +static int x2apic_phys_pkg_id(int initial_apicid, int index_msb) +{ + return initial_apicid >> index_msb; +} + +static void x2apic_send_IPI_self(int vector) +{ + apic_write(APIC_SELF_IPI, vector); +} + +#endif /* _ASM_X86_X2APIC_H */ diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 9a966c579af..4558f0d0822 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -970,7 +970,7 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) mp_irq.irqflag = (trigger << 2) | polarity; mp_irq.srcbus = MP_ISA_BUS; mp_irq.srcbusirq = bus_irq; /* IRQ */ - mp_irq.dstapic = mp_ioapics[ioapic].apicid; /* APIC ID */ + mp_irq.dstapic = mpc_ioapic_id(ioapic); /* APIC ID */ mp_irq.dstirq = pin; /* INTIN# */ mp_save_irq(&mp_irq); @@ -1021,7 +1021,7 @@ void __init mp_config_acpi_legacy_irqs(void) if (ioapic < 0) continue; pin = mp_find_ioapic_pin(ioapic, gsi); - dstapic = mp_ioapics[ioapic].apicid; + dstapic = mpc_ioapic_id(ioapic); for (idx = 0; idx < mp_irq_entries; idx++) { struct mpc_intsrc *irq = mp_irqs + idx; @@ -1082,7 +1082,7 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, mp_irq.srcbus = number; mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); ioapic = mp_find_ioapic(gsi); - mp_irq.dstapic = mp_ioapics[ioapic].apicid; + mp_irq.dstapic = mpc_ioapic_id(ioapic); mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); mp_save_irq(&mp_irq); @@ -1113,7 +1113,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) if (ioapic_pin > MP_MAX_IOAPIC_PIN) { printk(KERN_ERR "Invalid reference to IOAPIC pin " - "%d-%d\n", mp_ioapics[ioapic].apicid, + "%d-%d\n", mpc_ioapic_id(ioapic), ioapic_pin); return gsi; } diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 873e7e1ead7..cd8cbeb5fa3 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1538,13 +1538,11 @@ static void do_detach(struct device *dev) { struct iommu_dev_data *dev_data; struct amd_iommu *iommu; - struct pci_dev *pdev; u16 devid; devid = get_device_id(dev); iommu = amd_iommu_rlookup_table[devid]; dev_data = get_dev_data(dev); - pdev = to_pci_dev(dev); /* decrease reference counters */ dev_data->domain->dev_iommu[iommu->index] -= 1; @@ -1703,10 +1701,9 @@ static struct protection_domain *domain_for_device(struct device *dev) struct protection_domain *dom; struct iommu_dev_data *dev_data, *alias_data; unsigned long flags; - u16 devid, alias; + u16 devid; devid = get_device_id(dev); - alias = amd_iommu_alias_table[devid]; dev_data = get_dev_data(dev); alias_data = get_dev_data(dev_data->alias); if (!alias_data) diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 3966b564ea4..767fd04f284 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -2,20 +2,25 @@ # Makefile for local APIC drivers and for the IO-APIC code # -obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o +obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o ipi.o obj-y += hw_nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SMP) += ipi.o ifeq ($(CONFIG_X86_64),y) -obj-y += apic_flat_64.o -obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o -obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o +# APIC probe will depend on the listing order here obj-$(CONFIG_X86_UV) += x2apic_uv_x.o +obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o +obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o +obj-y += apic_flat_64.o endif -obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o +# APIC probe will depend on the listing order here obj-$(CONFIG_X86_NUMAQ) += numaq_32.o -obj-$(CONFIG_X86_ES7000) += es7000_32.o obj-$(CONFIG_X86_SUMMIT) += summit_32.o +obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o +obj-$(CONFIG_X86_ES7000) += es7000_32.o + +# For 32bit, probe_32 need to be listed last +obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index f92a8e5d1e2..b961af86bfe 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1461,7 +1461,6 @@ int __init enable_IR(void) void __init enable_IR_x2apic(void) { unsigned long flags; - struct IO_APIC_route_entry **ioapic_entries; int ret, x2apic_enabled = 0; int dmar_table_init_ret; @@ -1469,13 +1468,7 @@ void __init enable_IR_x2apic(void) if (dmar_table_init_ret && !x2apic_supported()) return; - ioapic_entries = alloc_ioapic_entries(); - if (!ioapic_entries) { - pr_err("Allocate ioapic_entries failed\n"); - goto out; - } - - ret = save_IO_APIC_setup(ioapic_entries); + ret = save_ioapic_entries(); if (ret) { pr_info("Saving IO-APIC state failed: %d\n", ret); goto out; @@ -1483,7 +1476,7 @@ void __init enable_IR_x2apic(void) local_irq_save(flags); legacy_pic->mask_all(); - mask_IO_APIC_setup(ioapic_entries); + mask_ioapic_entries(); if (dmar_table_init_ret) ret = 0; @@ -1514,14 +1507,11 @@ void __init enable_IR_x2apic(void) nox2apic: if (!ret) /* IR enabling failed */ - restore_IO_APIC_setup(ioapic_entries); + restore_ioapic_entries(); legacy_pic->restore_mask(); local_irq_restore(flags); out: - if (ioapic_entries) - free_ioapic_entries(ioapic_entries); - if (x2apic_enabled) return; @@ -2095,28 +2085,20 @@ static void lapic_resume(void) { unsigned int l, h; unsigned long flags; - int maxlvt, ret; - struct IO_APIC_route_entry **ioapic_entries = NULL; + int maxlvt; if (!apic_pm_state.active) return; local_irq_save(flags); if (intr_remapping_enabled) { - ioapic_entries = alloc_ioapic_entries(); - if (!ioapic_entries) { - WARN(1, "Alloc ioapic_entries in lapic resume failed."); - goto restore; - } - - ret = save_IO_APIC_setup(ioapic_entries); - if (ret) { - WARN(1, "Saving IO-APIC state failed: %d\n", ret); - free_ioapic_entries(ioapic_entries); - goto restore; - } - - mask_IO_APIC_setup(ioapic_entries); + /* + * IO-APIC and PIC have their own resume routines. + * We just mask them here to make sure the interrupt + * subsystem is completely quiet while we enable x2apic + * and interrupt-remapping. + */ + mask_ioapic_entries(); legacy_pic->mask_all(); } @@ -2159,13 +2141,9 @@ static void lapic_resume(void) apic_write(APIC_ESR, 0); apic_read(APIC_ESR); - if (intr_remapping_enabled) { + if (intr_remapping_enabled) reenable_intr_remapping(x2apic_mode); - legacy_pic->restore_mask(); - restore_IO_APIC_setup(ioapic_entries); - free_ioapic_entries(ioapic_entries); - } -restore: + local_irq_restore(flags); } diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 5652d31fe10..f7a41e4cae4 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -16,6 +16,7 @@ #include <linux/ctype.h> #include <linux/init.h> #include <linux/hardirq.h> +#include <linux/module.h> #include <asm/smp.h> #include <asm/apic.h> #include <asm/ipi.h> @@ -24,6 +25,12 @@ #include <acpi/acpi_bus.h> #endif +static struct apic apic_physflat; +static struct apic apic_flat; + +struct apic __read_mostly *apic = &apic_flat; +EXPORT_SYMBOL_GPL(apic); + static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return 1; @@ -164,7 +171,7 @@ static int flat_phys_pkg_id(int initial_apic_id, int index_msb) return initial_apic_id >> index_msb; } -struct apic apic_flat = { +static struct apic apic_flat = { .name = "flat", .probe = NULL, .acpi_madt_oem_check = flat_acpi_madt_oem_check, @@ -312,10 +319,18 @@ physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask, return per_cpu(x86_cpu_to_apicid, cpu); } -struct apic apic_physflat = { +static int physflat_probe(void) +{ + if (apic == &apic_physflat || num_possible_cpus() > 8) + return 1; + + return 0; +} + +static struct apic apic_physflat = { .name = "physical flat", - .probe = NULL, + .probe = physflat_probe, .acpi_madt_oem_check = physflat_acpi_madt_oem_check, .apic_id_registered = flat_apic_id_registered, @@ -369,3 +384,8 @@ struct apic apic_physflat = { .wait_icr_idle = native_apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, }; + +/* + * We need to check for physflat first, so this order is important. + */ +apic_drivers(apic_physflat, apic_flat); diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index d84ac5a584b..efd737e827f 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -193,7 +193,7 @@ static int probe_bigsmp(void) return dmi_bigsmp; } -struct apic apic_bigsmp = { +static struct apic apic_bigsmp = { .name = "bigsmp", .probe = probe_bigsmp, @@ -254,3 +254,13 @@ struct apic apic_bigsmp = { .x86_32_early_logical_apicid = bigsmp_early_logical_apicid, }; + +struct apic * __init generic_bigsmp_probe(void) +{ + if (probe_bigsmp()) + return &apic_bigsmp; + + return NULL; +} + +apic_driver(apic_bigsmp); diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 70533de5bd2..9536b3fe43f 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -620,7 +620,7 @@ static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem, } /* We've been warned by a false positive warning.Use __refdata to keep calm. */ -struct apic __refdata apic_es7000_cluster = { +static struct apic __refdata apic_es7000_cluster = { .name = "es7000", .probe = probe_es7000, @@ -685,7 +685,7 @@ struct apic __refdata apic_es7000_cluster = { .x86_32_early_logical_apicid = es7000_early_logical_apicid, }; -struct apic __refdata apic_es7000 = { +static struct apic __refdata apic_es7000 = { .name = "es7000", .probe = probe_es7000, @@ -747,3 +747,9 @@ struct apic __refdata apic_es7000 = { .x86_32_early_logical_apicid = es7000_early_logical_apicid, }; + +/* + * Need to check for es7000 followed by es7000_cluster, so this order + * in apic_drivers is important. + */ +apic_drivers(apic_es7000, apic_es7000_cluster); diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 5260fe91bcb..d5e57db0f7b 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -19,9 +19,9 @@ #include <linux/delay.h> #ifdef CONFIG_HARDLOCKUP_DETECTOR -u64 hw_nmi_get_sample_period(void) +u64 hw_nmi_get_sample_period(int watchdog_thresh) { - return (u64)(cpu_khz) * 1000 * 60; + return (u64)(cpu_khz) * 1000 * watchdog_thresh; } #endif diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 45fd33d1fd3..9488dcff7ae 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -76,17 +76,40 @@ int sis_apic_bug = -1; static DEFINE_RAW_SPINLOCK(ioapic_lock); static DEFINE_RAW_SPINLOCK(vector_lock); -/* - * # of IRQ routing registers - */ -int nr_ioapic_registers[MAX_IO_APICS]; +static struct ioapic { + /* + * # of IRQ routing registers + */ + int nr_registers; + /* + * Saved state during suspend/resume, or while enabling intr-remap. + */ + struct IO_APIC_route_entry *saved_registers; + /* I/O APIC config */ + struct mpc_ioapic mp_config; + /* IO APIC gsi routing info */ + struct mp_ioapic_gsi gsi_config; + DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); +} ioapics[MAX_IO_APICS]; -/* I/O APIC entries */ -struct mpc_ioapic mp_ioapics[MAX_IO_APICS]; -int nr_ioapics; +#define mpc_ioapic_ver(id) ioapics[id].mp_config.apicver + +int mpc_ioapic_id(int id) +{ + return ioapics[id].mp_config.apicid; +} -/* IO APIC gsi routing info */ -struct mp_ioapic_gsi mp_gsi_routing[MAX_IO_APICS]; +unsigned int mpc_ioapic_addr(int id) +{ + return ioapics[id].mp_config.apicaddr; +} + +struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int id) +{ + return &ioapics[id].gsi_config; +} + +int nr_ioapics; /* The one past the highest gsi number used */ u32 gsi_top; @@ -179,6 +202,14 @@ int __init arch_early_irq_init(void) io_apic_irqs = ~0UL; } + for (i = 0; i < nr_ioapics; i++) { + ioapics[i].saved_registers = + kzalloc(sizeof(struct IO_APIC_route_entry) * + ioapics[i].nr_registers, GFP_KERNEL); + if (!ioapics[i].saved_registers) + pr_err("IOAPIC %d: suspend/resume impossible!\n", i); + } + cfg = irq_cfgx; count = ARRAY_SIZE(irq_cfgx); node = cpu_to_node(0); @@ -297,7 +328,7 @@ struct io_apic { static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) { return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) - + (mp_ioapics[idx].apicaddr & ~PAGE_MASK); + + (mpc_ioapic_addr(idx) & ~PAGE_MASK); } static inline void io_apic_eoi(unsigned int apic, unsigned int vector) @@ -573,7 +604,7 @@ static void clear_IO_APIC (void) int apic, pin; for (apic = 0; apic < nr_ioapics; apic++) - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) clear_IO_APIC_pin(apic, pin); } @@ -615,74 +646,43 @@ static int __init ioapic_pirq_setup(char *str) __setup("pirq=", ioapic_pirq_setup); #endif /* CONFIG_X86_32 */ -struct IO_APIC_route_entry **alloc_ioapic_entries(void) -{ - int apic; - struct IO_APIC_route_entry **ioapic_entries; - - ioapic_entries = kzalloc(sizeof(*ioapic_entries) * nr_ioapics, - GFP_KERNEL); - if (!ioapic_entries) - return 0; - - for (apic = 0; apic < nr_ioapics; apic++) { - ioapic_entries[apic] = - kzalloc(sizeof(struct IO_APIC_route_entry) * - nr_ioapic_registers[apic], GFP_KERNEL); - if (!ioapic_entries[apic]) - goto nomem; - } - - return ioapic_entries; - -nomem: - while (--apic >= 0) - kfree(ioapic_entries[apic]); - kfree(ioapic_entries); - - return 0; -} - /* * Saves all the IO-APIC RTE's */ -int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) +int save_ioapic_entries(void) { int apic, pin; - - if (!ioapic_entries) - return -ENOMEM; + int err = 0; for (apic = 0; apic < nr_ioapics; apic++) { - if (!ioapic_entries[apic]) - return -ENOMEM; + if (!ioapics[apic].saved_registers) { + err = -ENOMEM; + continue; + } - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) - ioapic_entries[apic][pin] = + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) + ioapics[apic].saved_registers[pin] = ioapic_read_entry(apic, pin); } - return 0; + return err; } /* * Mask all IO APIC entries. */ -void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) +void mask_ioapic_entries(void) { int apic, pin; - if (!ioapic_entries) - return; - for (apic = 0; apic < nr_ioapics; apic++) { - if (!ioapic_entries[apic]) - break; + if (ioapics[apic].saved_registers) + continue; - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) { struct IO_APIC_route_entry entry; - entry = ioapic_entries[apic][pin]; + entry = ioapics[apic].saved_registers[pin]; if (!entry.mask) { entry.mask = 1; ioapic_write_entry(apic, pin, entry); @@ -692,36 +692,23 @@ void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) } /* - * Restore IO APIC entries which was saved in ioapic_entries. + * Restore IO APIC entries which was saved in the ioapic structure. */ -int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) +int restore_ioapic_entries(void) { int apic, pin; - if (!ioapic_entries) - return -ENOMEM; - for (apic = 0; apic < nr_ioapics; apic++) { - if (!ioapic_entries[apic]) - return -ENOMEM; + if (ioapics[apic].saved_registers) + continue; - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) ioapic_write_entry(apic, pin, - ioapic_entries[apic][pin]); + ioapics[apic].saved_registers[pin]); } return 0; } -void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries) -{ - int apic; - - for (apic = 0; apic < nr_ioapics; apic++) - kfree(ioapic_entries[apic]); - - kfree(ioapic_entries); -} - /* * Find the IRQ entry number of a certain pin. */ @@ -731,7 +718,7 @@ static int find_irq_entry(int apic, int pin, int type) for (i = 0; i < mp_irq_entries; i++) if (mp_irqs[i].irqtype == type && - (mp_irqs[i].dstapic == mp_ioapics[apic].apicid || + (mp_irqs[i].dstapic == mpc_ioapic_id(apic) || mp_irqs[i].dstapic == MP_APIC_ALL) && mp_irqs[i].dstirq == pin) return i; @@ -773,7 +760,7 @@ static int __init find_isa_irq_apic(int irq, int type) if (i < mp_irq_entries) { int apic; for(apic = 0; apic < nr_ioapics; apic++) { - if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic) + if (mpc_ioapic_id(apic) == mp_irqs[i].dstapic) return apic; } } @@ -942,6 +929,7 @@ static int pin_2_irq(int idx, int apic, int pin) { int irq; int bus = mp_irqs[idx].srcbus; + struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(apic); /* * Debugging check, we are in big trouble if this message pops up! @@ -952,7 +940,7 @@ static int pin_2_irq(int idx, int apic, int pin) if (test_bit(bus, mp_bus_not_pci)) { irq = mp_irqs[idx].srcbusirq; } else { - u32 gsi = mp_gsi_routing[apic].gsi_base + pin; + u32 gsi = gsi_cfg->gsi_base + pin; if (gsi >= NR_IRQS_LEGACY) irq = gsi; @@ -1003,7 +991,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin, int lbus = mp_irqs[i].srcbus; for (apic = 0; apic < nr_ioapics; apic++) - if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic || + if (mpc_ioapic_id(apic) == mp_irqs[i].dstapic || mp_irqs[i].dstapic == MP_APIC_ALL) break; @@ -1222,7 +1210,7 @@ static inline int IO_APIC_irq_trigger(int irq) int apic, idx, pin; for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) { idx = find_irq_entry(apic, pin, mp_INT); if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) return irq_trigger(idx); @@ -1350,14 +1338,14 @@ static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq, apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " "IRQ %d Mode:%i Active:%i)\n", - apic_id, mp_ioapics[apic_id].apicid, pin, cfg->vector, + apic_id, mpc_ioapic_id(apic_id), pin, cfg->vector, irq, trigger, polarity); - if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry, + if (setup_ioapic_entry(mpc_ioapic_id(apic_id), irq, &entry, dest, trigger, polarity, cfg->vector, pin)) { printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", - mp_ioapics[apic_id].apicid, pin); + mpc_ioapic_id(apic_id), pin); __clear_irq_vector(irq, cfg); return; } @@ -1369,17 +1357,13 @@ static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq, ioapic_write_entry(apic_id, pin, entry); } -static struct { - DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); -} mp_ioapic_routing[MAX_IO_APICS]; - static bool __init io_apic_pin_not_connected(int idx, int apic_id, int pin) { if (idx != -1) return false; apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n", - mp_ioapics[apic_id].apicid, pin); + mpc_ioapic_id(apic_id), pin); return true; } @@ -1389,7 +1373,7 @@ static void __init __io_apic_setup_irqs(unsigned int apic_id) struct io_apic_irq_attr attr; unsigned int pin, irq; - for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { + for (pin = 0; pin < ioapics[apic_id].nr_registers; pin++) { idx = find_irq_entry(apic_id, pin, mp_INT); if (io_apic_pin_not_connected(idx, apic_id, pin)) continue; @@ -1511,7 +1495,7 @@ __apicdebuginit(void) print_IO_APIC(void) printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); for (i = 0; i < nr_ioapics; i++) printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", - mp_ioapics[i].apicid, nr_ioapic_registers[i]); + mpc_ioapic_id(i), ioapics[i].nr_registers); /* * We are a bit conservative about what we expect. We have to @@ -1531,7 +1515,7 @@ __apicdebuginit(void) print_IO_APIC(void) raw_spin_unlock_irqrestore(&ioapic_lock, flags); printk("\n"); - printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid); + printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(apic)); printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); @@ -1825,7 +1809,7 @@ void __init enable_IO_APIC(void) for(apic = 0; apic < nr_ioapics; apic++) { int pin; /* See if any of the pins is in ExtINT mode */ - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) { struct IO_APIC_route_entry entry; entry = ioapic_read_entry(apic, pin); @@ -1949,14 +1933,14 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void) reg_00.raw = io_apic_read(apic_id, 0); raw_spin_unlock_irqrestore(&ioapic_lock, flags); - old_id = mp_ioapics[apic_id].apicid; + old_id = mpc_ioapic_id(apic_id); - if (mp_ioapics[apic_id].apicid >= get_physical_broadcast()) { + if (mpc_ioapic_id(apic_id) >= get_physical_broadcast()) { printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", - apic_id, mp_ioapics[apic_id].apicid); + apic_id, mpc_ioapic_id(apic_id)); printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", reg_00.bits.ID); - mp_ioapics[apic_id].apicid = reg_00.bits.ID; + ioapics[apic_id].mp_config.apicid = reg_00.bits.ID; } /* @@ -1965,9 +1949,9 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void) * 'stuck on smp_invalidate_needed IPI wait' messages. */ if (apic->check_apicid_used(&phys_id_present_map, - mp_ioapics[apic_id].apicid)) { + mpc_ioapic_id(apic_id))) { printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", - apic_id, mp_ioapics[apic_id].apicid); + apic_id, mpc_ioapic_id(apic_id)); for (i = 0; i < get_physical_broadcast(); i++) if (!physid_isset(i, phys_id_present_map)) break; @@ -1976,13 +1960,14 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void) printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", i); physid_set(i, phys_id_present_map); - mp_ioapics[apic_id].apicid = i; + ioapics[apic_id].mp_config.apicid = i; } else { physid_mask_t tmp; - apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid, &tmp); + apic->apicid_to_cpu_present(mpc_ioapic_id(apic_id), + &tmp); apic_printk(APIC_VERBOSE, "Setting %d in the " "phys_id_present_map\n", - mp_ioapics[apic_id].apicid); + mpc_ioapic_id(apic_id)); physids_or(phys_id_present_map, phys_id_present_map, tmp); } @@ -1990,24 +1975,24 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void) * We need to adjust the IRQ routing table * if the ID changed. */ - if (old_id != mp_ioapics[apic_id].apicid) + if (old_id != mpc_ioapic_id(apic_id)) for (i = 0; i < mp_irq_entries; i++) if (mp_irqs[i].dstapic == old_id) mp_irqs[i].dstapic - = mp_ioapics[apic_id].apicid; + = mpc_ioapic_id(apic_id); /* * Update the ID register according to the right value * from the MPC table if they are different. */ - if (mp_ioapics[apic_id].apicid == reg_00.bits.ID) + if (mpc_ioapic_id(apic_id) == reg_00.bits.ID) continue; apic_printk(APIC_VERBOSE, KERN_INFO "...changing IO-APIC physical APIC ID to %d ...", - mp_ioapics[apic_id].apicid); + mpc_ioapic_id(apic_id)); - reg_00.bits.ID = mp_ioapics[apic_id].apicid; + reg_00.bits.ID = mpc_ioapic_id(apic_id); raw_spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic_id, 0, reg_00.raw); raw_spin_unlock_irqrestore(&ioapic_lock, flags); @@ -2018,7 +2003,7 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void) raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(apic_id, 0); raw_spin_unlock_irqrestore(&ioapic_lock, flags); - if (reg_00.bits.ID != mp_ioapics[apic_id].apicid) + if (reg_00.bits.ID != mpc_ioapic_id(apic_id)) printk("could not set ID!\n"); else apic_printk(APIC_VERBOSE, " ok.\n"); @@ -2404,7 +2389,7 @@ static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) raw_spin_lock_irqsave(&ioapic_lock, flags); for_each_irq_pin(entry, cfg->irq_2_pin) { - if (mp_ioapics[entry->apic].apicver >= 0x20) { + if (mpc_ioapic_ver(entry->apic) >= 0x20) { /* * Intr-remapping uses pin number as the virtual vector * in the RTE. Actual vector is programmed in @@ -2918,49 +2903,19 @@ static int __init io_apic_bug_finalize(void) late_initcall(io_apic_bug_finalize); -static struct IO_APIC_route_entry *ioapic_saved_data[MAX_IO_APICS]; - -static void suspend_ioapic(int ioapic_id) +static void resume_ioapic_id(int ioapic_id) { - struct IO_APIC_route_entry *saved_data = ioapic_saved_data[ioapic_id]; - int i; - - if (!saved_data) - return; - - for (i = 0; i < nr_ioapic_registers[ioapic_id]; i++) - saved_data[i] = ioapic_read_entry(ioapic_id, i); -} - -static int ioapic_suspend(void) -{ - int ioapic_id; - - for (ioapic_id = 0; ioapic_id < nr_ioapics; ioapic_id++) - suspend_ioapic(ioapic_id); - - return 0; -} - -static void resume_ioapic(int ioapic_id) -{ - struct IO_APIC_route_entry *saved_data = ioapic_saved_data[ioapic_id]; unsigned long flags; union IO_APIC_reg_00 reg_00; - int i; - if (!saved_data) - return; raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(ioapic_id, 0); - if (reg_00.bits.ID != mp_ioapics[ioapic_id].apicid) { - reg_00.bits.ID = mp_ioapics[ioapic_id].apicid; + if (reg_00.bits.ID != mpc_ioapic_id(ioapic_id)) { + reg_00.bits.ID = mpc_ioapic_id(ioapic_id); io_apic_write(ioapic_id, 0, reg_00.raw); } raw_spin_unlock_irqrestore(&ioapic_lock, flags); - for (i = 0; i < nr_ioapic_registers[ioapic_id]; i++) - ioapic_write_entry(ioapic_id, i, saved_data[i]); } static void ioapic_resume(void) @@ -2968,28 +2923,18 @@ static void ioapic_resume(void) int ioapic_id; for (ioapic_id = nr_ioapics - 1; ioapic_id >= 0; ioapic_id--) - resume_ioapic(ioapic_id); + resume_ioapic_id(ioapic_id); + + restore_ioapic_entries(); } static struct syscore_ops ioapic_syscore_ops = { - .suspend = ioapic_suspend, + .suspend = save_ioapic_entries, .resume = ioapic_resume, }; static int __init ioapic_init_ops(void) { - int i; - - for (i = 0; i < nr_ioapics; i++) { - unsigned int size; - - size = nr_ioapic_registers[i] - * sizeof(struct IO_APIC_route_entry); - ioapic_saved_data[i] = kzalloc(size, GFP_KERNEL); - if (!ioapic_saved_data[i]) - pr_err("IOAPIC %d: suspend/resume impossible!\n", i); - } - register_syscore_ops(&ioapic_syscore_ops); return 0; @@ -3592,14 +3537,14 @@ int io_apic_setup_irq_pin_once(unsigned int irq, int node, int ret; /* Avoid redundant programming */ - if (test_bit(pin, mp_ioapic_routing[id].pin_programmed)) { + if (test_bit(pin, ioapics[id].pin_programmed)) { pr_debug("Pin %d-%d already programmed\n", - mp_ioapics[id].apicid, pin); + mpc_ioapic_id(id), pin); return 0; } ret = io_apic_setup_irq_pin(irq, node, attr); if (!ret) - set_bit(pin, mp_ioapic_routing[id].pin_programmed); + set_bit(pin, ioapics[id].pin_programmed); return ret; } @@ -3764,8 +3709,7 @@ static u8 __init io_apic_unique_id(u8 id) bitmap_zero(used, 256); for (i = 0; i < nr_ioapics; i++) { - struct mpc_ioapic *ia = &mp_ioapics[i]; - __set_bit(ia->apicid, used); + __set_bit(mpc_ioapic_id(i), used); } if (!test_bit(id, used)) return id; @@ -3825,7 +3769,7 @@ void __init setup_ioapic_dest(void) return; for (ioapic = 0; ioapic < nr_ioapics; ioapic++) - for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { + for (pin = 0; pin < ioapics[ioapic].nr_registers; pin++) { irq_entry = find_irq_entry(ioapic, pin, mp_INT); if (irq_entry == -1) continue; @@ -3896,7 +3840,7 @@ void __init ioapic_and_gsi_init(void) ioapic_res = ioapic_setup_resources(nr_ioapics); for (i = 0; i < nr_ioapics; i++) { if (smp_found_config) { - ioapic_phys = mp_ioapics[i].apicaddr; + ioapic_phys = mpc_ioapic_addr(i); #ifdef CONFIG_X86_32 if (!ioapic_phys) { printk(KERN_ERR @@ -3956,8 +3900,9 @@ int mp_find_ioapic(u32 gsi) /* Find the IOAPIC that manages this GSI. */ for (i = 0; i < nr_ioapics; i++) { - if ((gsi >= mp_gsi_routing[i].gsi_base) - && (gsi <= mp_gsi_routing[i].gsi_end)) + struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(i); + if ((gsi >= gsi_cfg->gsi_base) + && (gsi <= gsi_cfg->gsi_end)) return i; } @@ -3967,12 +3912,16 @@ int mp_find_ioapic(u32 gsi) int mp_find_ioapic_pin(int ioapic, u32 gsi) { + struct mp_ioapic_gsi *gsi_cfg; + if (WARN_ON(ioapic == -1)) return -1; - if (WARN_ON(gsi > mp_gsi_routing[ioapic].gsi_end)) + + gsi_cfg = mp_ioapic_gsi_routing(ioapic); + if (WARN_ON(gsi > gsi_cfg->gsi_end)) return -1; - return gsi - mp_gsi_routing[ioapic].gsi_base; + return gsi - gsi_cfg->gsi_base; } static __init int bad_ioapic(unsigned long address) @@ -3994,40 +3943,42 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) { int idx = 0; int entries; + struct mp_ioapic_gsi *gsi_cfg; if (bad_ioapic(address)) return; idx = nr_ioapics; - mp_ioapics[idx].type = MP_IOAPIC; - mp_ioapics[idx].flags = MPC_APIC_USABLE; - mp_ioapics[idx].apicaddr = address; + ioapics[idx].mp_config.type = MP_IOAPIC; + ioapics[idx].mp_config.flags = MPC_APIC_USABLE; + ioapics[idx].mp_config.apicaddr = address; set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); - mp_ioapics[idx].apicid = io_apic_unique_id(id); - mp_ioapics[idx].apicver = io_apic_get_version(idx); + ioapics[idx].mp_config.apicid = io_apic_unique_id(id); + ioapics[idx].mp_config.apicver = io_apic_get_version(idx); /* * Build basic GSI lookup table to facilitate gsi->io_apic lookups * and to prevent reprogramming of IOAPIC pins (PCI GSIs). */ entries = io_apic_get_redir_entries(idx); - mp_gsi_routing[idx].gsi_base = gsi_base; - mp_gsi_routing[idx].gsi_end = gsi_base + entries - 1; + gsi_cfg = mp_ioapic_gsi_routing(idx); + gsi_cfg->gsi_base = gsi_base; + gsi_cfg->gsi_end = gsi_base + entries - 1; /* * The number of IO-APIC IRQ registers (== #pins): */ - nr_ioapic_registers[idx] = entries; + ioapics[idx].nr_registers = entries; - if (mp_gsi_routing[idx].gsi_end >= gsi_top) - gsi_top = mp_gsi_routing[idx].gsi_end + 1; + if (gsi_cfg->gsi_end >= gsi_top) + gsi_top = gsi_cfg->gsi_end + 1; printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " - "GSI %d-%d\n", idx, mp_ioapics[idx].apicid, - mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr, - mp_gsi_routing[idx].gsi_base, mp_gsi_routing[idx].gsi_end); + "GSI %d-%d\n", idx, mpc_ioapic_id(idx), + mpc_ioapic_ver(idx), mpc_ioapic_addr(idx), + gsi_cfg->gsi_base, gsi_cfg->gsi_end); nr_ioapics++; } diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 30f13319e24..c4a61ca1349 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -472,8 +472,8 @@ static void numaq_setup_portio_remap(void) (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD); } -/* Use __refdata to keep false positive warning calm. */ -struct apic __refdata apic_numaq = { +/* Use __refdata to keep false positive warning calm. */ +static struct apic __refdata apic_numaq = { .name = "NUMAQ", .probe = probe_numaq, @@ -537,3 +537,5 @@ struct apic __refdata apic_numaq = { .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, .x86_32_numa_cpu_node = numaq_numa_cpu_node, }; + +apic_driver(apic_numaq); diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 6541e471fd9..b5254ad044a 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -52,31 +52,6 @@ static int __init print_ipi_mode(void) } late_initcall(print_ipi_mode); -void __init default_setup_apic_routing(void) -{ - int version = apic_version[boot_cpu_physical_apicid]; - - if (num_possible_cpus() > 8) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - if (!APIC_XAPIC(version)) { - def_to_bigsmp = 0; - break; - } - /* If P4 and above fall through */ - case X86_VENDOR_AMD: - def_to_bigsmp = 1; - } - } - -#ifdef CONFIG_X86_BIGSMP - generic_bigsmp_probe(); -#endif - - if (apic->setup_apic_routing) - apic->setup_apic_routing(); -} - static int default_x86_32_early_logical_apicid(int cpu) { return 1 << cpu; @@ -112,7 +87,7 @@ static int probe_default(void) return 1; } -struct apic apic_default = { +static struct apic apic_default = { .name = "default", .probe = probe_default, @@ -174,44 +149,22 @@ struct apic apic_default = { .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid, }; -extern struct apic apic_numaq; -extern struct apic apic_summit; -extern struct apic apic_bigsmp; -extern struct apic apic_es7000; -extern struct apic apic_es7000_cluster; +apic_driver(apic_default); struct apic *apic = &apic_default; EXPORT_SYMBOL_GPL(apic); -static struct apic *apic_probe[] __initdata = { -#ifdef CONFIG_X86_NUMAQ - &apic_numaq, -#endif -#ifdef CONFIG_X86_SUMMIT - &apic_summit, -#endif -#ifdef CONFIG_X86_BIGSMP - &apic_bigsmp, -#endif -#ifdef CONFIG_X86_ES7000 - &apic_es7000, - &apic_es7000_cluster, -#endif - &apic_default, /* must be last */ - NULL, -}; - static int cmdline_apic __initdata; static int __init parse_apic(char *arg) { - int i; + struct apic **drv; if (!arg) return -EINVAL; - for (i = 0; apic_probe[i]; i++) { - if (!strcmp(apic_probe[i]->name, arg)) { - apic = apic_probe[i]; + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + if (!strcmp((*drv)->name, arg)) { + apic = *drv; cmdline_apic = 1; return 0; } @@ -222,38 +175,58 @@ static int __init parse_apic(char *arg) } early_param("apic", parse_apic); -void __init generic_bigsmp_probe(void) +void __init default_setup_apic_routing(void) { + int version = apic_version[boot_cpu_physical_apicid]; + + if (num_possible_cpus() > 8) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + if (!APIC_XAPIC(version)) { + def_to_bigsmp = 0; + break; + } + /* If P4 and above fall through */ + case X86_VENDOR_AMD: + def_to_bigsmp = 1; + } + } + #ifdef CONFIG_X86_BIGSMP /* - * This routine is used to switch to bigsmp mode when + * This is used to switch to bigsmp mode when * - There is no apic= option specified by the user * - generic_apic_probe() has chosen apic_default as the sub_arch * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support */ if (!cmdline_apic && apic == &apic_default) { - if (apic_bigsmp.probe()) { - apic = &apic_bigsmp; + struct apic *bigsmp = generic_bigsmp_probe(); + if (bigsmp) { + apic = bigsmp; printk(KERN_INFO "Overriding APIC driver with %s\n", apic->name); } } #endif + + if (apic->setup_apic_routing) + apic->setup_apic_routing(); } void __init generic_apic_probe(void) { if (!cmdline_apic) { - int i; - for (i = 0; apic_probe[i]; i++) { - if (apic_probe[i]->probe()) { - apic = apic_probe[i]; + struct apic **drv; + + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + if ((*drv)->probe()) { + apic = *drv; break; } } /* Not visible without early console */ - if (!apic_probe[i]) + if (drv == __apicdrivers_end) panic("Didn't find an APIC driver"); } printk(KERN_INFO "Using APIC driver %s\n", apic->name); @@ -264,16 +237,16 @@ void __init generic_apic_probe(void) int __init generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid) { - int i; + struct apic **drv; - for (i = 0; apic_probe[i]; ++i) { - if (!apic_probe[i]->mps_oem_check) + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + if (!((*drv)->mps_oem_check)) continue; - if (!apic_probe[i]->mps_oem_check(mpc, oem, productid)) + if (!(*drv)->mps_oem_check(mpc, oem, productid)) continue; if (!cmdline_apic) { - apic = apic_probe[i]; + apic = *drv; printk(KERN_INFO "Switched to APIC driver `%s'.\n", apic->name); } @@ -284,16 +257,16 @@ generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid) int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { - int i; + struct apic **drv; - for (i = 0; apic_probe[i]; ++i) { - if (!apic_probe[i]->acpi_madt_oem_check) + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + if (!(*drv)->acpi_madt_oem_check) continue; - if (!apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) + if (!(*drv)->acpi_madt_oem_check(oem_id, oem_table_id)) continue; if (!cmdline_apic) { - apic = apic_probe[i]; + apic = *drv; printk(KERN_INFO "Switched to APIC driver `%s'.\n", apic->name); } diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index d8c4a6feb28..3fe98669892 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -23,27 +23,6 @@ #include <asm/ipi.h> #include <asm/setup.h> -extern struct apic apic_flat; -extern struct apic apic_physflat; -extern struct apic apic_x2xpic_uv_x; -extern struct apic apic_x2apic_phys; -extern struct apic apic_x2apic_cluster; - -struct apic __read_mostly *apic = &apic_flat; -EXPORT_SYMBOL_GPL(apic); - -static struct apic *apic_probe[] __initdata = { -#ifdef CONFIG_X86_UV - &apic_x2apic_uv_x, -#endif -#ifdef CONFIG_X86_X2APIC - &apic_x2apic_phys, - &apic_x2apic_cluster, -#endif - &apic_physflat, - NULL, -}; - static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) { return hard_smp_processor_id() >> index_msb; @@ -54,26 +33,20 @@ static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) */ void __init default_setup_apic_routing(void) { + struct apic **drv; enable_IR_x2apic(); -#ifdef CONFIG_X86_X2APIC - if (x2apic_mode -#ifdef CONFIG_X86_UV - && apic != &apic_x2apic_uv_x -#endif - ) { - if (x2apic_phys) - apic = &apic_x2apic_phys; - else - apic = &apic_x2apic_cluster; + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + if ((*drv)->probe && (*drv)->probe()) { + if (apic != *drv) { + apic = *drv; + pr_info("Switched APIC routing to %s.\n", + apic->name); + } + break; + } } -#endif - - if (apic == &apic_flat && num_possible_cpus() > 8) - apic = &apic_physflat; - - printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); if (is_vsmp_box()) { /* need to update phys_pkg_id */ @@ -90,13 +63,15 @@ void apic_send_IPI_self(int vector) int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { - int i; + struct apic **drv; - for (i = 0; apic_probe[i]; ++i) { - if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) { - apic = apic_probe[i]; - printk(KERN_INFO "Setting APIC routing to %s.\n", - apic->name); + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + if ((*drv)->acpi_madt_oem_check(oem_id, oem_table_id)) { + if (apic != *drv) { + apic = *drv; + pr_info("Setting APIC routing to %s.\n", + apic->name); + } return 1; } } diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 35bcd7d995a..19114423c58 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -491,7 +491,7 @@ void setup_summit(void) } #endif -struct apic apic_summit = { +static struct apic apic_summit = { .name = "summit", .probe = probe_summit, @@ -552,3 +552,5 @@ struct apic apic_summit = { .x86_32_early_logical_apicid = summit_early_logical_apicid, }; + +apic_driver(apic_summit); diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 90949bbd566..50079587582 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -5,118 +5,95 @@ #include <linux/ctype.h> #include <linux/init.h> #include <linux/dmar.h> +#include <linux/cpu.h> #include <asm/smp.h> -#include <asm/apic.h> -#include <asm/ipi.h> +#include <asm/x2apic.h> static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); +static DEFINE_PER_CPU(cpumask_var_t, cpus_in_cluster); +static DEFINE_PER_CPU(cpumask_var_t, ipi_mask); static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return x2apic_enabled(); } -/* - * need to use more than cpu 0, because we need more vectors when - * MSI-X are used. - */ -static const struct cpumask *x2apic_target_cpus(void) +static inline u32 x2apic_cluster(int cpu) { - return cpu_online_mask; -} - -/* - * for now each logical cpu is in its own vector allocation domain. - */ -static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - cpumask_clear(retmask); - cpumask_set_cpu(cpu, retmask); + return per_cpu(x86_cpu_to_logical_apicid, cpu) >> 16; } static void - __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest) +__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) { - unsigned long cfg; + struct cpumask *cpus_in_cluster_ptr; + struct cpumask *ipi_mask_ptr; + unsigned int cpu, this_cpu; + unsigned long flags; + u32 dest; + + x2apic_wrmsr_fence(); + + local_irq_save(flags); - cfg = __prepare_ICR(0, vector, dest); + this_cpu = smp_processor_id(); /* - * send the IPI. + * We are to modify mask, so we need an own copy + * and be sure it's manipulated with irq off. */ - native_x2apic_icr_write(cfg, apicid); -} + ipi_mask_ptr = __raw_get_cpu_var(ipi_mask); + cpumask_copy(ipi_mask_ptr, mask); -/* - * for now, we send the IPI's one by one in the cpumask. - * TBD: Based on the cpu mask, we can send the IPI's to the cluster group - * at once. We have 16 cpu's in a cluster. This will minimize IPI register - * writes. - */ -static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) -{ - unsigned long query_cpu; - unsigned long flags; + /* + * The idea is to send one IPI per cluster. + */ + for_each_cpu(cpu, ipi_mask_ptr) { + unsigned long i; - x2apic_wrmsr_fence(); + cpus_in_cluster_ptr = per_cpu(cpus_in_cluster, cpu); + dest = 0; - local_irq_save(flags); - for_each_cpu(query_cpu, mask) { - __x2apic_send_IPI_dest( - per_cpu(x86_cpu_to_logical_apicid, query_cpu), - vector, apic->dest_logical); + /* Collect cpus in cluster. */ + for_each_cpu_and(i, ipi_mask_ptr, cpus_in_cluster_ptr) { + if (apic_dest == APIC_DEST_ALLINC || i != this_cpu) + dest |= per_cpu(x86_cpu_to_logical_apicid, i); + } + + if (!dest) + continue; + + __x2apic_send_IPI_dest(dest, vector, apic->dest_logical); + /* + * Cluster sibling cpus should be discared now so + * we would not send IPI them second time. + */ + cpumask_andnot(ipi_mask_ptr, ipi_mask_ptr, cpus_in_cluster_ptr); } + local_irq_restore(flags); } +static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) +{ + __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC); +} + static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) { - unsigned long this_cpu = smp_processor_id(); - unsigned long query_cpu; - unsigned long flags; - - x2apic_wrmsr_fence(); - - local_irq_save(flags); - for_each_cpu(query_cpu, mask) { - if (query_cpu == this_cpu) - continue; - __x2apic_send_IPI_dest( - per_cpu(x86_cpu_to_logical_apicid, query_cpu), - vector, apic->dest_logical); - } - local_irq_restore(flags); + __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT); } static void x2apic_send_IPI_allbutself(int vector) { - unsigned long this_cpu = smp_processor_id(); - unsigned long query_cpu; - unsigned long flags; - - x2apic_wrmsr_fence(); - - local_irq_save(flags); - for_each_online_cpu(query_cpu) { - if (query_cpu == this_cpu) - continue; - __x2apic_send_IPI_dest( - per_cpu(x86_cpu_to_logical_apicid, query_cpu), - vector, apic->dest_logical); - } - local_irq_restore(flags); + __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT); } static void x2apic_send_IPI_all(int vector) { - x2apic_send_IPI_mask(cpu_online_mask, vector); -} - -static int x2apic_apic_id_registered(void) -{ - return 1; + __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC); } static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) @@ -151,43 +128,90 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, return per_cpu(x86_cpu_to_logical_apicid, cpu); } -static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x) +static void init_x2apic_ldr(void) { - unsigned int id; + unsigned int this_cpu = smp_processor_id(); + unsigned int cpu; - id = x; - return id; + per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR); + + __cpu_set(this_cpu, per_cpu(cpus_in_cluster, this_cpu)); + for_each_online_cpu(cpu) { + if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu)) + continue; + __cpu_set(this_cpu, per_cpu(cpus_in_cluster, cpu)); + __cpu_set(cpu, per_cpu(cpus_in_cluster, this_cpu)); + } } -static unsigned long set_apic_id(unsigned int id) + /* + * At CPU state changes, update the x2apic cluster sibling info. + */ +static int __cpuinit +update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu) { - unsigned long x; + unsigned int this_cpu = (unsigned long)hcpu; + unsigned int cpu; + int err = 0; + + switch (action) { + case CPU_UP_PREPARE: + if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, this_cpu), + GFP_KERNEL)) { + err = -ENOMEM; + } else if (!zalloc_cpumask_var(&per_cpu(ipi_mask, this_cpu), + GFP_KERNEL)) { + free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu)); + err = -ENOMEM; + } + break; + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + case CPU_DEAD: + for_each_online_cpu(cpu) { + if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu)) + continue; + __cpu_clear(this_cpu, per_cpu(cpus_in_cluster, cpu)); + __cpu_clear(cpu, per_cpu(cpus_in_cluster, this_cpu)); + } + free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu)); + free_cpumask_var(per_cpu(ipi_mask, this_cpu)); + break; + } - x = id; - return x; + return notifier_from_errno(err); } -static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb) -{ - return initial_apicid >> index_msb; -} +static struct notifier_block __refdata x2apic_cpu_notifier = { + .notifier_call = update_clusterinfo, +}; -static void x2apic_send_IPI_self(int vector) +static int x2apic_init_cpu_notifier(void) { - apic_write(APIC_SELF_IPI, vector); + int cpu = smp_processor_id(); + + zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL); + + BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu)); + + __cpu_set(cpu, per_cpu(cpus_in_cluster, cpu)); + register_hotcpu_notifier(&x2apic_cpu_notifier); + return 1; } -static void init_x2apic_ldr(void) +static int x2apic_cluster_probe(void) { - int cpu = smp_processor_id(); - - per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR); + if (x2apic_mode) + return x2apic_init_cpu_notifier(); + else + return 0; } -struct apic apic_x2apic_cluster = { +static struct apic apic_x2apic_cluster = { .name = "cluster x2apic", - .probe = NULL, + .probe = x2apic_cluster_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, .apic_id_registered = x2apic_apic_id_registered, @@ -211,11 +235,11 @@ struct apic apic_x2apic_cluster = { .setup_portio_remap = NULL, .check_phys_apicid_present = default_check_phys_apicid_present, .enable_apic_mode = NULL, - .phys_pkg_id = x2apic_cluster_phys_pkg_id, + .phys_pkg_id = x2apic_phys_pkg_id, .mps_oem_check = NULL, - .get_apic_id = x2apic_cluster_phys_get_apic_id, - .set_apic_id = set_apic_id, + .get_apic_id = x2apic_get_apic_id, + .set_apic_id = x2apic_set_apic_id, .apic_id_mask = 0xFFFFFFFFu, .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, @@ -240,3 +264,5 @@ struct apic apic_x2apic_cluster = { .wait_icr_idle = native_x2apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, }; + +apic_driver(apic_x2apic_cluster); diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index c7e6d6645bf..f5373dfde21 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -7,11 +7,12 @@ #include <linux/dmar.h> #include <asm/smp.h> -#include <asm/apic.h> -#include <asm/ipi.h> +#include <asm/x2apic.h> int x2apic_phys; +static struct apic apic_x2apic_phys; + static int set_x2apic_phys_mode(char *arg) { x2apic_phys = 1; @@ -27,94 +28,46 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } -/* - * need to use more than cpu 0, because we need more vectors when - * MSI-X are used. - */ -static const struct cpumask *x2apic_target_cpus(void) -{ - return cpu_online_mask; -} - -static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - cpumask_clear(retmask); - cpumask_set_cpu(cpu, retmask); -} - -static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, - unsigned int dest) -{ - unsigned long cfg; - - cfg = __prepare_ICR(0, vector, dest); - - /* - * send the IPI. - */ - native_x2apic_icr_write(cfg, apicid); -} - -static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) +static void +__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) { unsigned long query_cpu; + unsigned long this_cpu; unsigned long flags; x2apic_wrmsr_fence(); local_irq_save(flags); + + this_cpu = smp_processor_id(); for_each_cpu(query_cpu, mask) { + if (apic_dest == APIC_DEST_ALLBUT && this_cpu == query_cpu) + continue; __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), vector, APIC_DEST_PHYSICAL); } local_irq_restore(flags); } +static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) +{ + __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC); +} + static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) { - unsigned long this_cpu = smp_processor_id(); - unsigned long query_cpu; - unsigned long flags; - - x2apic_wrmsr_fence(); - - local_irq_save(flags); - for_each_cpu(query_cpu, mask) { - if (query_cpu != this_cpu) - __x2apic_send_IPI_dest( - per_cpu(x86_cpu_to_apicid, query_cpu), - vector, APIC_DEST_PHYSICAL); - } - local_irq_restore(flags); + __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT); } static void x2apic_send_IPI_allbutself(int vector) { - unsigned long this_cpu = smp_processor_id(); - unsigned long query_cpu; - unsigned long flags; - - x2apic_wrmsr_fence(); - - local_irq_save(flags); - for_each_online_cpu(query_cpu) { - if (query_cpu == this_cpu) - continue; - __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), - vector, APIC_DEST_PHYSICAL); - } - local_irq_restore(flags); + __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT); } static void x2apic_send_IPI_all(int vector) { - x2apic_send_IPI_mask(cpu_online_mask, vector); -} - -static int x2apic_apic_id_registered(void) -{ - return 1; + __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC); } static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) @@ -149,34 +102,22 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, return per_cpu(x86_cpu_to_apicid, cpu); } -static unsigned int x2apic_phys_get_apic_id(unsigned long x) -{ - return x; -} - -static unsigned long set_apic_id(unsigned int id) -{ - return id; -} - -static int x2apic_phys_pkg_id(int initial_apicid, int index_msb) +static void init_x2apic_ldr(void) { - return initial_apicid >> index_msb; } -static void x2apic_send_IPI_self(int vector) +static int x2apic_phys_probe(void) { - apic_write(APIC_SELF_IPI, vector); -} + if (x2apic_mode && x2apic_phys) + return 1; -static void init_x2apic_ldr(void) -{ + return apic == &apic_x2apic_phys; } -struct apic apic_x2apic_phys = { +static struct apic apic_x2apic_phys = { .name = "physical x2apic", - .probe = NULL, + .probe = x2apic_phys_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, .apic_id_registered = x2apic_apic_id_registered, @@ -203,8 +144,8 @@ struct apic apic_x2apic_phys = { .phys_pkg_id = x2apic_phys_pkg_id, .mps_oem_check = NULL, - .get_apic_id = x2apic_phys_get_apic_id, - .set_apic_id = set_apic_id, + .get_apic_id = x2apic_get_apic_id, + .set_apic_id = x2apic_set_apic_id, .apic_id_mask = 0xFFFFFFFFu, .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, @@ -229,3 +170,5 @@ struct apic apic_x2apic_phys = { .wait_icr_idle = native_x2apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, }; + +apic_driver(apic_x2apic_phys); diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 7acd2d2ac96..f450b683dfc 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -58,6 +58,8 @@ unsigned int uv_apicid_hibits; EXPORT_SYMBOL_GPL(uv_apicid_hibits); static DEFINE_SPINLOCK(uv_nmi_lock); +static struct apic apic_x2apic_uv_x; + static unsigned long __init uv_early_read_mmr(unsigned long addr) { unsigned long val, *mmr; @@ -326,10 +328,15 @@ static void uv_send_IPI_self(int vector) apic_write(APIC_SELF_IPI, vector); } -struct apic __refdata apic_x2apic_uv_x = { +static int uv_probe(void) +{ + return apic == &apic_x2apic_uv_x; +} + +static struct apic __refdata apic_x2apic_uv_x = { .name = "UV large system", - .probe = NULL, + .probe = uv_probe, .acpi_madt_oem_check = uv_acpi_madt_oem_check, .apic_id_registered = uv_apic_id_registered, @@ -859,3 +866,5 @@ void __init uv_system_init(void) if (is_kdump_kernel()) reboot_type = BOOT_ACPI; } + +apic_driver(apic_x2apic_uv_x); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 6f9d1f6063e..8f5cabb3c5b 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -629,10 +629,13 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012 */ u64 mask; + int err; - rdmsrl(MSR_AMD64_MCx_MASK(4), mask); - mask |= (1 << 10); - wrmsrl(MSR_AMD64_MCx_MASK(4), mask); + err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask); + if (err == 0) { + mask |= (1 << 10); + checking_wrmsrl(MSR_AMD64_MCx_MASK(4), mask); + } } } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cbc70a27430..c8b41623377 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -254,7 +254,7 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) } #endif -static int disable_smep __initdata; +static int disable_smep __cpuinitdata; static __init int setup_disable_smep(char *arg) { disable_smep = 1; @@ -262,7 +262,7 @@ static __init int setup_disable_smep(char *arg) } __setup("nosmep", setup_disable_smep); -static __init void setup_smep(struct cpuinfo_x86 *c) +static __cpuinit void setup_smep(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_SMEP)) { if (unlikely(disable_smep)) { diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index e90f08458e6..690bc846183 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -369,6 +369,7 @@ static struct of_ioapic_type of_ioapic_type[] = static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize, u32 *out_hwirq, u32 *out_type) { + struct mp_ioapic_gsi *gsi_cfg; struct io_apic_irq_attr attr; struct of_ioapic_type *it; u32 line, idx, type; @@ -378,7 +379,8 @@ static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize, line = *intspec; idx = (u32) id->priv; - *out_hwirq = line + mp_gsi_routing[idx].gsi_base; + gsi_cfg = mp_ioapic_gsi_routing(idx); + *out_hwirq = line + gsi_cfg->gsi_base; intspec++; type = *intspec; @@ -407,7 +409,7 @@ static void __init ioapic_add_ofnode(struct device_node *np) } for (i = 0; i < nr_ioapics; i++) { - if (r.start == mp_ioapics[i].apicaddr) { + if (r.start == mpc_ioapic_addr(i)) { struct irq_domain *id; id = kzalloc(sizeof(*id), GFP_KERNEL); diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 6f9bfffb272..9103b89c145 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -285,7 +285,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type) intsrc.type = MP_INTSRC; intsrc.irqflag = 0; /* conforming */ intsrc.srcbus = 0; - intsrc.dstapic = mp_ioapics[0].apicid; + intsrc.dstapic = mpc_ioapic_id(0); intsrc.irqtype = mp_INT; diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c index 787a5e499dd..3f92ce07e52 100644 --- a/arch/x86/kernel/test_nx.c +++ b/arch/x86/kernel/test_nx.c @@ -161,7 +161,7 @@ static int test_NX(void) } #endif - return 0; + return ret; } static void test_exit(void) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 624a2016198..49927a863cc 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -306,6 +306,13 @@ SECTIONS } . = ALIGN(8); + .apicdrivers : AT(ADDR(.apicdrivers) - LOAD_OFFSET) { + __apicdrivers = .; + *(.apicdrivers); + __apicdrivers_end = .; + } + + . = ALIGN(8); /* * .exit.text is discard at runtime, not link time, to deal with * references from .altinstructions and .eh_frame diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 0ad47b819a8..d6e2477feb1 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -73,9 +73,14 @@ #define MemAbs (1<<11) /* Memory operand is absolute displacement */ #define String (1<<12) /* String instruction (rep capable) */ #define Stack (1<<13) /* Stack instruction (push/pop) */ +#define GroupMask (7<<14) /* Opcode uses one of the group mechanisms */ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ -#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ +#define GroupDual (2<<14) /* Alternate decoding of mod == 3 */ +#define Prefix (3<<14) /* Instruction varies with 66/f2/f3 prefix */ +#define RMExt (4<<14) /* Opcode extension in ModRM r/m if mod == 3 */ +#define Sse (1<<17) /* SSE Vector instruction */ /* Misc flags */ +#define Prot (1<<21) /* instruction generates #UD if not in prot-mode */ #define VendorSpecific (1<<22) /* Vendor specific instruction */ #define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ #define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */ @@ -102,11 +107,14 @@ struct opcode { u32 flags; + u8 intercept; union { int (*execute)(struct x86_emulate_ctxt *ctxt); struct opcode *group; struct group_dual *gdual; + struct gprefix *gprefix; } u; + int (*check_perm)(struct x86_emulate_ctxt *ctxt); }; struct group_dual { @@ -114,6 +122,13 @@ struct group_dual { struct opcode mod3[8]; }; +struct gprefix { + struct opcode pfx_no; + struct opcode pfx_66; + struct opcode pfx_f2; + struct opcode pfx_f3; +}; + /* EFLAGS bit definitions. */ #define EFLG_ID (1<<21) #define EFLG_VIP (1<<20) @@ -248,42 +263,42 @@ struct group_dual { "w", "r", _LO32, "r", "", "r") /* Instruction has three operands and one operand is stored in ECX register */ -#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \ - do { \ - unsigned long _tmp; \ - _type _clv = (_cl).val; \ - _type _srcv = (_src).val; \ - _type _dstv = (_dst).val; \ - \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "5", "2") \ - _op _suffix " %4,%1 \n" \ - _POST_EFLAGS("0", "5", "2") \ - : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \ - : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \ - ); \ - \ - (_cl).val = (unsigned long) _clv; \ - (_src).val = (unsigned long) _srcv; \ - (_dst).val = (unsigned long) _dstv; \ +#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \ + do { \ + unsigned long _tmp; \ + _type _clv = (_cl).val; \ + _type _srcv = (_src).val; \ + _type _dstv = (_dst).val; \ + \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "5", "2") \ + _op _suffix " %4,%1 \n" \ + _POST_EFLAGS("0", "5", "2") \ + : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \ + : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \ + ); \ + \ + (_cl).val = (unsigned long) _clv; \ + (_src).val = (unsigned long) _srcv; \ + (_dst).val = (unsigned long) _dstv; \ } while (0) -#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \ - do { \ - switch ((_dst).bytes) { \ - case 2: \ - __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ - "w", unsigned short); \ - break; \ - case 4: \ - __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ - "l", unsigned int); \ - break; \ - case 8: \ - ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ - "q", unsigned long)); \ - break; \ - } \ +#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \ + do { \ + switch ((_dst).bytes) { \ + case 2: \ + __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ + "w", unsigned short); \ + break; \ + case 4: \ + __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ + "l", unsigned int); \ + break; \ + case 8: \ + ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ + "q", unsigned long)); \ + break; \ + } \ } while (0) #define __emulate_1op(_op, _dst, _eflags, _suffix) \ @@ -346,13 +361,25 @@ struct group_dual { } while (0) /* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */ -#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \ - do { \ - switch((_src).bytes) { \ - case 1: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "b"); break; \ - case 2: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "w"); break; \ - case 4: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "l"); break; \ - case 8: ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "q")); break; \ +#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \ + do { \ + switch((_src).bytes) { \ + case 1: \ + __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ + _eflags, "b"); \ + break; \ + case 2: \ + __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ + _eflags, "w"); \ + break; \ + case 4: \ + __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ + _eflags, "l"); \ + break; \ + case 8: \ + ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ + _eflags, "q")); \ + break; \ } \ } while (0) @@ -388,13 +415,33 @@ struct group_dual { (_type)_x; \ }) -#define insn_fetch_arr(_arr, _size, _eip) \ +#define insn_fetch_arr(_arr, _size, _eip) \ ({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \ if (rc != X86EMUL_CONTINUE) \ goto done; \ (_eip) += (_size); \ }) +static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, + enum x86_intercept intercept, + enum x86_intercept_stage stage) +{ + struct x86_instruction_info info = { + .intercept = intercept, + .rep_prefix = ctxt->decode.rep_prefix, + .modrm_mod = ctxt->decode.modrm_mod, + .modrm_reg = ctxt->decode.modrm_reg, + .modrm_rm = ctxt->decode.modrm_rm, + .src_val = ctxt->decode.src.val64, + .src_bytes = ctxt->decode.src.bytes, + .dst_bytes = ctxt->decode.dst.bytes, + .ad_bytes = ctxt->decode.ad_bytes, + .next_rip = ctxt->eip, + }; + + return ctxt->ops->intercept(ctxt, &info, stage); +} + static inline unsigned long ad_mask(struct decode_cache *c) { return (1UL << (c->ad_bytes << 3)) - 1; @@ -430,6 +477,13 @@ static inline void jmp_rel(struct decode_cache *c, int rel) register_address_increment(c, &c->eip, rel); } +static u32 desc_limit_scaled(struct desc_struct *desc) +{ + u32 limit = get_desc_limit(desc); + + return desc->g ? (limit << 12) | 0xfff : limit; +} + static void set_seg_override(struct decode_cache *c, int seg) { c->has_seg_override = true; @@ -442,11 +496,10 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) return 0; - return ops->get_cached_segment_base(seg, ctxt->vcpu); + return ops->get_cached_segment_base(ctxt, seg); } static unsigned seg_override(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, struct decode_cache *c) { if (!c->has_seg_override) @@ -455,18 +508,6 @@ static unsigned seg_override(struct x86_emulate_ctxt *ctxt, return c->seg_override; } -static ulong linear(struct x86_emulate_ctxt *ctxt, - struct segmented_address addr) -{ - struct decode_cache *c = &ctxt->decode; - ulong la; - - la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea; - if (c->ad_bytes != 8) - la &= (u32)-1; - return la; -} - static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, u32 error, bool valid) { @@ -476,11 +517,21 @@ static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, return X86EMUL_PROPAGATE_FAULT; } +static int emulate_db(struct x86_emulate_ctxt *ctxt) +{ + return emulate_exception(ctxt, DB_VECTOR, 0, false); +} + static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err) { return emulate_exception(ctxt, GP_VECTOR, err, true); } +static int emulate_ss(struct x86_emulate_ctxt *ctxt, int err) +{ + return emulate_exception(ctxt, SS_VECTOR, err, true); +} + static int emulate_ud(struct x86_emulate_ctxt *ctxt) { return emulate_exception(ctxt, UD_VECTOR, 0, false); @@ -496,6 +547,128 @@ static int emulate_de(struct x86_emulate_ctxt *ctxt) return emulate_exception(ctxt, DE_VECTOR, 0, false); } +static int emulate_nm(struct x86_emulate_ctxt *ctxt) +{ + return emulate_exception(ctxt, NM_VECTOR, 0, false); +} + +static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg) +{ + u16 selector; + struct desc_struct desc; + + ctxt->ops->get_segment(ctxt, &selector, &desc, NULL, seg); + return selector; +} + +static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector, + unsigned seg) +{ + u16 dummy; + u32 base3; + struct desc_struct desc; + + ctxt->ops->get_segment(ctxt, &dummy, &desc, &base3, seg); + ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg); +} + +static int __linearize(struct x86_emulate_ctxt *ctxt, + struct segmented_address addr, + unsigned size, bool write, bool fetch, + ulong *linear) +{ + struct decode_cache *c = &ctxt->decode; + struct desc_struct desc; + bool usable; + ulong la; + u32 lim; + u16 sel; + unsigned cpl, rpl; + + la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea; + switch (ctxt->mode) { + case X86EMUL_MODE_REAL: + break; + case X86EMUL_MODE_PROT64: + if (((signed long)la << 16) >> 16 != la) + return emulate_gp(ctxt, 0); + break; + default: + usable = ctxt->ops->get_segment(ctxt, &sel, &desc, NULL, + addr.seg); + if (!usable) + goto bad; + /* code segment or read-only data segment */ + if (((desc.type & 8) || !(desc.type & 2)) && write) + goto bad; + /* unreadable code segment */ + if (!fetch && (desc.type & 8) && !(desc.type & 2)) + goto bad; + lim = desc_limit_scaled(&desc); + if ((desc.type & 8) || !(desc.type & 4)) { + /* expand-up segment */ + if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim) + goto bad; + } else { + /* exapand-down segment */ + if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim) + goto bad; + lim = desc.d ? 0xffffffff : 0xffff; + if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim) + goto bad; + } + cpl = ctxt->ops->cpl(ctxt); + rpl = sel & 3; + cpl = max(cpl, rpl); + if (!(desc.type & 8)) { + /* data segment */ + if (cpl > desc.dpl) + goto bad; + } else if ((desc.type & 8) && !(desc.type & 4)) { + /* nonconforming code segment */ + if (cpl != desc.dpl) + goto bad; + } else if ((desc.type & 8) && (desc.type & 4)) { + /* conforming code segment */ + if (cpl < desc.dpl) + goto bad; + } + break; + } + if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : c->ad_bytes != 8) + la &= (u32)-1; + *linear = la; + return X86EMUL_CONTINUE; +bad: + if (addr.seg == VCPU_SREG_SS) + return emulate_ss(ctxt, addr.seg); + else + return emulate_gp(ctxt, addr.seg); +} + +static int linearize(struct x86_emulate_ctxt *ctxt, + struct segmented_address addr, + unsigned size, bool write, + ulong *linear) +{ + return __linearize(ctxt, addr, size, write, false, linear); +} + + +static int segmented_read_std(struct x86_emulate_ctxt *ctxt, + struct segmented_address addr, + void *data, + unsigned size) +{ + int rc; + ulong linear; + + rc = linearize(ctxt, addr, size, false, &linear); + if (rc != X86EMUL_CONTINUE) + return rc; + return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception); +} + static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, unsigned long eip, u8 *dest) @@ -505,10 +678,15 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, int size, cur_size; if (eip == fc->end) { + unsigned long linear; + struct segmented_address addr = { .seg=VCPU_SREG_CS, .ea=eip}; cur_size = fc->end - fc->start; size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip)); - rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size, - size, ctxt->vcpu, &ctxt->exception); + rc = __linearize(ctxt, addr, size, false, true, &linear); + if (rc != X86EMUL_CONTINUE) + return rc; + rc = ops->fetch(ctxt, linear, fc->data + cur_size, + size, &ctxt->exception); if (rc != X86EMUL_CONTINUE) return rc; fc->end += size; @@ -551,7 +729,6 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs, } static int read_descriptor(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, struct segmented_address addr, u16 *size, unsigned long *address, int op_bytes) { @@ -560,13 +737,11 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt, if (op_bytes == 2) op_bytes = 3; *address = 0; - rc = ops->read_std(linear(ctxt, addr), (unsigned long *)size, 2, - ctxt->vcpu, &ctxt->exception); + rc = segmented_read_std(ctxt, addr, size, 2); if (rc != X86EMUL_CONTINUE) return rc; addr.ea += 2; - rc = ops->read_std(linear(ctxt, addr), address, op_bytes, - ctxt->vcpu, &ctxt->exception); + rc = segmented_read_std(ctxt, addr, address, op_bytes); return rc; } @@ -623,7 +798,63 @@ static void fetch_register_operand(struct operand *op) } } -static void decode_register_operand(struct operand *op, +static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg) +{ + ctxt->ops->get_fpu(ctxt); + switch (reg) { + case 0: asm("movdqu %%xmm0, %0" : "=m"(*data)); break; + case 1: asm("movdqu %%xmm1, %0" : "=m"(*data)); break; + case 2: asm("movdqu %%xmm2, %0" : "=m"(*data)); break; + case 3: asm("movdqu %%xmm3, %0" : "=m"(*data)); break; + case 4: asm("movdqu %%xmm4, %0" : "=m"(*data)); break; + case 5: asm("movdqu %%xmm5, %0" : "=m"(*data)); break; + case 6: asm("movdqu %%xmm6, %0" : "=m"(*data)); break; + case 7: asm("movdqu %%xmm7, %0" : "=m"(*data)); break; +#ifdef CONFIG_X86_64 + case 8: asm("movdqu %%xmm8, %0" : "=m"(*data)); break; + case 9: asm("movdqu %%xmm9, %0" : "=m"(*data)); break; + case 10: asm("movdqu %%xmm10, %0" : "=m"(*data)); break; + case 11: asm("movdqu %%xmm11, %0" : "=m"(*data)); break; + case 12: asm("movdqu %%xmm12, %0" : "=m"(*data)); break; + case 13: asm("movdqu %%xmm13, %0" : "=m"(*data)); break; + case 14: asm("movdqu %%xmm14, %0" : "=m"(*data)); break; + case 15: asm("movdqu %%xmm15, %0" : "=m"(*data)); break; +#endif + default: BUG(); + } + ctxt->ops->put_fpu(ctxt); +} + +static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, + int reg) +{ + ctxt->ops->get_fpu(ctxt); + switch (reg) { + case 0: asm("movdqu %0, %%xmm0" : : "m"(*data)); break; + case 1: asm("movdqu %0, %%xmm1" : : "m"(*data)); break; + case 2: asm("movdqu %0, %%xmm2" : : "m"(*data)); break; + case 3: asm("movdqu %0, %%xmm3" : : "m"(*data)); break; + case 4: asm("movdqu %0, %%xmm4" : : "m"(*data)); break; + case 5: asm("movdqu %0, %%xmm5" : : "m"(*data)); break; + case 6: asm("movdqu %0, %%xmm6" : : "m"(*data)); break; + case 7: asm("movdqu %0, %%xmm7" : : "m"(*data)); break; +#ifdef CONFIG_X86_64 + case 8: asm("movdqu %0, %%xmm8" : : "m"(*data)); break; + case 9: asm("movdqu %0, %%xmm9" : : "m"(*data)); break; + case 10: asm("movdqu %0, %%xmm10" : : "m"(*data)); break; + case 11: asm("movdqu %0, %%xmm11" : : "m"(*data)); break; + case 12: asm("movdqu %0, %%xmm12" : : "m"(*data)); break; + case 13: asm("movdqu %0, %%xmm13" : : "m"(*data)); break; + case 14: asm("movdqu %0, %%xmm14" : : "m"(*data)); break; + case 15: asm("movdqu %0, %%xmm15" : : "m"(*data)); break; +#endif + default: BUG(); + } + ctxt->ops->put_fpu(ctxt); +} + +static void decode_register_operand(struct x86_emulate_ctxt *ctxt, + struct operand *op, struct decode_cache *c, int inhibit_bytereg) { @@ -632,6 +863,15 @@ static void decode_register_operand(struct operand *op, if (!(c->d & ModRM)) reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); + + if (c->d & Sse) { + op->type = OP_XMM; + op->bytes = 16; + op->addr.xmm = reg; + read_sse_reg(ctxt, &op->vec_val, reg); + return; + } + op->type = OP_REG; if ((c->d & ByteOp) && !inhibit_bytereg) { op->addr.reg = decode_register(reg, c->regs, highbyte_regs); @@ -671,6 +911,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes; op->addr.reg = decode_register(c->modrm_rm, c->regs, c->d & ByteOp); + if (c->d & Sse) { + op->type = OP_XMM; + op->bytes = 16; + op->addr.xmm = c->modrm_rm; + read_sse_reg(ctxt, &op->vec_val, c->modrm_rm); + return rc; + } fetch_register_operand(op); return rc; } @@ -819,8 +1066,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, if (mc->pos < mc->end) goto read_cached; - rc = ops->read_emulated(addr, mc->data + mc->end, n, - &ctxt->exception, ctxt->vcpu); + rc = ops->read_emulated(ctxt, addr, mc->data + mc->end, n, + &ctxt->exception); if (rc != X86EMUL_CONTINUE) return rc; mc->end += n; @@ -834,6 +1081,50 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; } +static int segmented_read(struct x86_emulate_ctxt *ctxt, + struct segmented_address addr, + void *data, + unsigned size) +{ + int rc; + ulong linear; + + rc = linearize(ctxt, addr, size, false, &linear); + if (rc != X86EMUL_CONTINUE) + return rc; + return read_emulated(ctxt, ctxt->ops, linear, data, size); +} + +static int segmented_write(struct x86_emulate_ctxt *ctxt, + struct segmented_address addr, + const void *data, + unsigned size) +{ + int rc; + ulong linear; + + rc = linearize(ctxt, addr, size, true, &linear); + if (rc != X86EMUL_CONTINUE) + return rc; + return ctxt->ops->write_emulated(ctxt, linear, data, size, + &ctxt->exception); +} + +static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt, + struct segmented_address addr, + const void *orig_data, const void *data, + unsigned size) +{ + int rc; + ulong linear; + + rc = linearize(ctxt, addr, size, true, &linear); + if (rc != X86EMUL_CONTINUE) + return rc; + return ctxt->ops->cmpxchg_emulated(ctxt, linear, orig_data, data, + size, &ctxt->exception); +} + static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, unsigned int size, unsigned short port, @@ -854,7 +1145,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, if (n == 0) n = 1; rc->pos = rc->end = 0; - if (!ops->pio_in_emulated(size, port, rc->data, n, ctxt->vcpu)) + if (!ops->pio_in_emulated(ctxt, size, port, rc->data, n)) return 0; rc->end = n * size; } @@ -864,28 +1155,22 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, return 1; } -static u32 desc_limit_scaled(struct desc_struct *desc) -{ - u32 limit = get_desc_limit(desc); - - return desc->g ? (limit << 12) | 0xfff : limit; -} - static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, u16 selector, struct desc_ptr *dt) { if (selector & 1 << 2) { struct desc_struct desc; + u16 sel; + memset (dt, 0, sizeof *dt); - if (!ops->get_cached_descriptor(&desc, NULL, VCPU_SREG_LDTR, - ctxt->vcpu)) + if (!ops->get_segment(ctxt, &sel, &desc, NULL, VCPU_SREG_LDTR)) return; dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */ dt->address = get_desc_base(&desc); } else - ops->get_gdt(dt, ctxt->vcpu); + ops->get_gdt(ctxt, dt); } /* allowed just for 8 bytes segments */ @@ -903,8 +1188,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (dt.size < index * 8 + 7) return emulate_gp(ctxt, selector & 0xfffc); addr = dt.address + index * 8; - ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, - &ctxt->exception); + ret = ops->read_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception); return ret; } @@ -925,8 +1209,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, return emulate_gp(ctxt, selector & 0xfffc); addr = dt.address + index * 8; - ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, - &ctxt->exception); + ret = ops->write_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception); return ret; } @@ -986,7 +1269,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, rpl = selector & 3; dpl = seg_desc.dpl; - cpl = ops->cpl(ctxt->vcpu); + cpl = ops->cpl(ctxt); switch (seg) { case VCPU_SREG_SS: @@ -1042,8 +1325,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, return ret; } load: - ops->set_segment_selector(selector, seg, ctxt->vcpu); - ops->set_cached_descriptor(&seg_desc, 0, seg, ctxt->vcpu); + ops->set_segment(ctxt, selector, &seg_desc, 0, seg); return X86EMUL_CONTINUE; exception: emulate_exception(ctxt, err_vec, err_code, true); @@ -1069,8 +1351,7 @@ static void write_register_operand(struct operand *op) } } -static inline int writeback(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int writeback(struct x86_emulate_ctxt *ctxt) { int rc; struct decode_cache *c = &ctxt->decode; @@ -1081,23 +1362,22 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, break; case OP_MEM: if (c->lock_prefix) - rc = ops->cmpxchg_emulated( - linear(ctxt, c->dst.addr.mem), - &c->dst.orig_val, - &c->dst.val, - c->dst.bytes, - &ctxt->exception, - ctxt->vcpu); + rc = segmented_cmpxchg(ctxt, + c->dst.addr.mem, + &c->dst.orig_val, + &c->dst.val, + c->dst.bytes); else - rc = ops->write_emulated( - linear(ctxt, c->dst.addr.mem), - &c->dst.val, - c->dst.bytes, - &ctxt->exception, - ctxt->vcpu); + rc = segmented_write(ctxt, + c->dst.addr.mem, + &c->dst.val, + c->dst.bytes); if (rc != X86EMUL_CONTINUE) return rc; break; + case OP_XMM: + write_sse_reg(ctxt, &c->dst.vec_val, c->dst.addr.xmm); + break; case OP_NONE: /* no writeback */ break; @@ -1107,21 +1387,21 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; } -static inline void emulate_push(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int em_push(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; + struct segmented_address addr; - c->dst.type = OP_MEM; - c->dst.bytes = c->op_bytes; - c->dst.val = c->src.val; register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); - c->dst.addr.mem.ea = register_address(c, c->regs[VCPU_REGS_RSP]); - c->dst.addr.mem.seg = VCPU_SREG_SS; + addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]); + addr.seg = VCPU_SREG_SS; + + /* Disable writeback. */ + c->dst.type = OP_NONE; + return segmented_write(ctxt, addr, &c->src.val, c->op_bytes); } static int emulate_pop(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, void *dest, int len) { struct decode_cache *c = &ctxt->decode; @@ -1130,7 +1410,7 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]); addr.seg = VCPU_SREG_SS; - rc = read_emulated(ctxt, ops, linear(ctxt, addr), dest, len); + rc = segmented_read(ctxt, addr, dest, len); if (rc != X86EMUL_CONTINUE) return rc; @@ -1138,6 +1418,13 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, return rc; } +static int em_pop(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + return emulate_pop(ctxt, &c->dst.val, c->op_bytes); +} + static int emulate_popf(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, void *dest, int len) @@ -1145,9 +1432,9 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, int rc; unsigned long val, change_mask; int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; - int cpl = ops->cpl(ctxt->vcpu); + int cpl = ops->cpl(ctxt); - rc = emulate_pop(ctxt, ops, &val, len); + rc = emulate_pop(ctxt, &val, len); if (rc != X86EMUL_CONTINUE) return rc; @@ -1179,14 +1466,24 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, return rc; } -static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, int seg) +static int em_popf(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; - c->src.val = ops->get_segment_selector(seg, ctxt->vcpu); + c->dst.type = OP_REG; + c->dst.addr.reg = &ctxt->eflags; + c->dst.bytes = c->op_bytes; + return emulate_popf(ctxt, ctxt->ops, &c->dst.val, c->op_bytes); +} - emulate_push(ctxt, ops); +static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int seg) +{ + struct decode_cache *c = &ctxt->decode; + + c->src.val = get_segment_selector(ctxt, seg); + + return em_push(ctxt); } static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, @@ -1196,7 +1493,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, unsigned long selector; int rc; - rc = emulate_pop(ctxt, ops, &selector, c->op_bytes); + rc = emulate_pop(ctxt, &selector, c->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; @@ -1204,8 +1501,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, return rc; } -static int emulate_pusha(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int em_pusha(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; unsigned long old_esp = c->regs[VCPU_REGS_RSP]; @@ -1216,23 +1512,25 @@ static int emulate_pusha(struct x86_emulate_ctxt *ctxt, (reg == VCPU_REGS_RSP) ? (c->src.val = old_esp) : (c->src.val = c->regs[reg]); - emulate_push(ctxt, ops); - - rc = writeback(ctxt, ops); + rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) return rc; ++reg; } - /* Disable writeback. */ - c->dst.type = OP_NONE; - return rc; } -static int emulate_popa(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int em_pushf(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + c->src.val = (unsigned long)ctxt->eflags; + return em_push(ctxt); +} + +static int em_popa(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; @@ -1245,7 +1543,7 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt, --reg; } - rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes); + rc = emulate_pop(ctxt, &c->regs[reg], c->op_bytes); if (rc != X86EMUL_CONTINUE) break; --reg; @@ -1265,37 +1563,32 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, /* TODO: Add limit checks */ c->src.val = ctxt->eflags; - emulate_push(ctxt, ops); - rc = writeback(ctxt, ops); + rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) return rc; ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC); - c->src.val = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); - emulate_push(ctxt, ops); - rc = writeback(ctxt, ops); + c->src.val = get_segment_selector(ctxt, VCPU_SREG_CS); + rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) return rc; c->src.val = c->eip; - emulate_push(ctxt, ops); - rc = writeback(ctxt, ops); + rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) return rc; - c->dst.type = OP_NONE; - - ops->get_idt(&dt, ctxt->vcpu); + ops->get_idt(ctxt, &dt); eip_addr = dt.address + (irq << 2); cs_addr = dt.address + (irq << 2) + 2; - rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &ctxt->exception); + rc = ops->read_std(ctxt, cs_addr, &cs, 2, &ctxt->exception); if (rc != X86EMUL_CONTINUE) return rc; - rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &ctxt->exception); + rc = ops->read_std(ctxt, eip_addr, &eip, 2, &ctxt->exception); if (rc != X86EMUL_CONTINUE) return rc; @@ -1339,7 +1632,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt, /* TODO: Add stack limit check */ - rc = emulate_pop(ctxt, ops, &temp_eip, c->op_bytes); + rc = emulate_pop(ctxt, &temp_eip, c->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; @@ -1347,12 +1640,12 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt, if (temp_eip & ~0xffff) return emulate_gp(ctxt, 0); - rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); + rc = emulate_pop(ctxt, &cs, c->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; - rc = emulate_pop(ctxt, ops, &temp_eflags, c->op_bytes); + rc = emulate_pop(ctxt, &temp_eflags, c->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; @@ -1394,15 +1687,31 @@ static inline int emulate_iret(struct x86_emulate_ctxt *ctxt, } } -static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int em_jmp_far(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + unsigned short sel; + + memcpy(&sel, c->src.valptr + c->op_bytes, 2); + + rc = load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS); + if (rc != X86EMUL_CONTINUE) + return rc; + + c->eip = 0; + memcpy(&c->eip, c->src.valptr, c->op_bytes); + return X86EMUL_CONTINUE; +} + +static int em_grp1a(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; - return emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes); + return emulate_pop(ctxt, &c->dst.val, c->dst.bytes); } -static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) +static int em_grp2(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; switch (c->modrm_reg) { @@ -1429,10 +1738,10 @@ static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags); break; } + return X86EMUL_CONTINUE; } -static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int em_grp3(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; unsigned long *rax = &c->regs[VCPU_REGS_RAX]; @@ -1471,10 +1780,10 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; } -static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int em_grp45(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; + int rc = X86EMUL_CONTINUE; switch (c->modrm_reg) { case 0: /* inc */ @@ -1488,21 +1797,23 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, old_eip = c->eip; c->eip = c->src.val; c->src.val = old_eip; - emulate_push(ctxt, ops); + rc = em_push(ctxt); break; } case 4: /* jmp abs */ c->eip = c->src.val; break; + case 5: /* jmp far */ + rc = em_jmp_far(ctxt); + break; case 6: /* push */ - emulate_push(ctxt, ops); + rc = em_push(ctxt); break; } - return X86EMUL_CONTINUE; + return rc; } -static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int em_grp9(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; u64 old = c->dst.orig_val64; @@ -1528,12 +1839,12 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, int rc; unsigned long cs; - rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes); + rc = emulate_pop(ctxt, &c->eip, c->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; if (c->op_bytes == 4) c->eip = (u32)c->eip; - rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); + rc = emulate_pop(ctxt, &cs, c->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); @@ -1562,8 +1873,10 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, struct desc_struct *cs, struct desc_struct *ss) { + u16 selector; + memset(cs, 0, sizeof(struct desc_struct)); - ops->get_cached_descriptor(cs, NULL, VCPU_SREG_CS, ctxt->vcpu); + ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS); memset(ss, 0, sizeof(struct desc_struct)); cs->l = 0; /* will be adjusted later */ @@ -1593,44 +1906,44 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) struct desc_struct cs, ss; u64 msr_data; u16 cs_sel, ss_sel; + u64 efer = 0; /* syscall is not available in real mode */ if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) return emulate_ud(ctxt); + ops->get_msr(ctxt, MSR_EFER, &efer); setup_syscalls_segments(ctxt, ops, &cs, &ss); - ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); + ops->get_msr(ctxt, MSR_STAR, &msr_data); msr_data >>= 32; cs_sel = (u16)(msr_data & 0xfffc); ss_sel = (u16)(msr_data + 8); - if (is_long_mode(ctxt->vcpu)) { + if (efer & EFER_LMA) { cs.d = 0; cs.l = 1; } - ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu); - ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); - ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu); - ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); + ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); + ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); c->regs[VCPU_REGS_RCX] = c->eip; - if (is_long_mode(ctxt->vcpu)) { + if (efer & EFER_LMA) { #ifdef CONFIG_X86_64 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; - ops->get_msr(ctxt->vcpu, + ops->get_msr(ctxt, ctxt->mode == X86EMUL_MODE_PROT64 ? MSR_LSTAR : MSR_CSTAR, &msr_data); c->eip = msr_data; - ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); + ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data); ctxt->eflags &= ~(msr_data | EFLG_RF); #endif } else { /* legacy mode */ - ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); + ops->get_msr(ctxt, MSR_STAR, &msr_data); c->eip = (u32)msr_data; ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); @@ -1646,7 +1959,9 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) struct desc_struct cs, ss; u64 msr_data; u16 cs_sel, ss_sel; + u64 efer = 0; + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); /* inject #GP if in real mode */ if (ctxt->mode == X86EMUL_MODE_REAL) return emulate_gp(ctxt, 0); @@ -1659,7 +1974,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) setup_syscalls_segments(ctxt, ops, &cs, &ss); - ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); + ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data); switch (ctxt->mode) { case X86EMUL_MODE_PROT32: if ((msr_data & 0xfffc) == 0x0) @@ -1676,21 +1991,18 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) cs_sel &= ~SELECTOR_RPL_MASK; ss_sel = cs_sel + 8; ss_sel &= ~SELECTOR_RPL_MASK; - if (ctxt->mode == X86EMUL_MODE_PROT64 - || is_long_mode(ctxt->vcpu)) { + if (ctxt->mode == X86EMUL_MODE_PROT64 || (efer & EFER_LMA)) { cs.d = 0; cs.l = 1; } - ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu); - ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); - ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu); - ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); + ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); + ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); - ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); + ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data); c->eip = msr_data; - ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); + ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data); c->regs[VCPU_REGS_RSP] = msr_data; return X86EMUL_CONTINUE; @@ -1719,7 +2031,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) cs.dpl = 3; ss.dpl = 3; - ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); + ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data); switch (usermode) { case X86EMUL_MODE_PROT32: cs_sel = (u16)(msr_data + 16); @@ -1739,10 +2051,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) cs_sel |= SELECTOR_RPL_MASK; ss_sel |= SELECTOR_RPL_MASK; - ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu); - ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); - ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu); - ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); + ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); + ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); c->eip = c->regs[VCPU_REGS_RDX]; c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX]; @@ -1759,7 +2069,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt, if (ctxt->mode == X86EMUL_MODE_VM86) return true; iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; - return ops->cpl(ctxt->vcpu) > iopl; + return ops->cpl(ctxt) > iopl; } static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, @@ -1769,11 +2079,11 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, struct desc_struct tr_seg; u32 base3; int r; - u16 io_bitmap_ptr, perm, bit_idx = port & 0x7; + u16 tr, io_bitmap_ptr, perm, bit_idx = port & 0x7; unsigned mask = (1 << len) - 1; unsigned long base; - ops->get_cached_descriptor(&tr_seg, &base3, VCPU_SREG_TR, ctxt->vcpu); + ops->get_segment(ctxt, &tr, &tr_seg, &base3, VCPU_SREG_TR); if (!tr_seg.p) return false; if (desc_limit_scaled(&tr_seg) < 103) @@ -1782,13 +2092,12 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, #ifdef CONFIG_X86_64 base |= ((u64)base3) << 32; #endif - r = ops->read_std(base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, NULL); + r = ops->read_std(ctxt, base + 102, &io_bitmap_ptr, 2, NULL); if (r != X86EMUL_CONTINUE) return false; if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg)) return false; - r = ops->read_std(base + io_bitmap_ptr + port/8, &perm, 2, ctxt->vcpu, - NULL); + r = ops->read_std(ctxt, base + io_bitmap_ptr + port/8, &perm, 2, NULL); if (r != X86EMUL_CONTINUE) return false; if ((perm >> bit_idx) & mask) @@ -1829,11 +2138,11 @@ static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, tss->si = c->regs[VCPU_REGS_RSI]; tss->di = c->regs[VCPU_REGS_RDI]; - tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu); - tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); - tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu); - tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu); - tss->ldt = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu); + tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); + tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); + tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS); + tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS); + tss->ldt = get_segment_selector(ctxt, VCPU_SREG_LDTR); } static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, @@ -1858,11 +2167,11 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, * SDM says that segment selectors are loaded before segment * descriptors */ - ops->set_segment_selector(tss->ldt, VCPU_SREG_LDTR, ctxt->vcpu); - ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu); - ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu); - ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu); - ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu); + set_segment_selector(ctxt, tss->ldt, VCPU_SREG_LDTR); + set_segment_selector(ctxt, tss->es, VCPU_SREG_ES); + set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS); + set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS); + set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS); /* * Now load segment descriptors. If fault happenes at this stage @@ -1896,7 +2205,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, int ret; u32 new_tss_base = get_desc_base(new_desc); - ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ @@ -1904,13 +2213,13 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, save_state_to_tss16(ctxt, ops, &tss_seg); - ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ return ret; - ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ @@ -1919,10 +2228,10 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, if (old_tss_sel != 0xffff) { tss_seg.prev_task_link = old_tss_sel; - ret = ops->write_std(new_tss_base, + ret = ops->write_std(ctxt, new_tss_base, &tss_seg.prev_task_link, sizeof tss_seg.prev_task_link, - ctxt->vcpu, &ctxt->exception); + &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ return ret; @@ -1937,7 +2246,7 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, { struct decode_cache *c = &ctxt->decode; - tss->cr3 = ops->get_cr(3, ctxt->vcpu); + tss->cr3 = ops->get_cr(ctxt, 3); tss->eip = c->eip; tss->eflags = ctxt->eflags; tss->eax = c->regs[VCPU_REGS_RAX]; @@ -1949,13 +2258,13 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, tss->esi = c->regs[VCPU_REGS_RSI]; tss->edi = c->regs[VCPU_REGS_RDI]; - tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu); - tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); - tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu); - tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu); - tss->fs = ops->get_segment_selector(VCPU_SREG_FS, ctxt->vcpu); - tss->gs = ops->get_segment_selector(VCPU_SREG_GS, ctxt->vcpu); - tss->ldt_selector = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu); + tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); + tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); + tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS); + tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS); + tss->fs = get_segment_selector(ctxt, VCPU_SREG_FS); + tss->gs = get_segment_selector(ctxt, VCPU_SREG_GS); + tss->ldt_selector = get_segment_selector(ctxt, VCPU_SREG_LDTR); } static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, @@ -1965,7 +2274,7 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, struct decode_cache *c = &ctxt->decode; int ret; - if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) + if (ops->set_cr(ctxt, 3, tss->cr3)) return emulate_gp(ctxt, 0); c->eip = tss->eip; ctxt->eflags = tss->eflags | 2; @@ -1982,13 +2291,13 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, * SDM says that segment selectors are loaded before segment * descriptors */ - ops->set_segment_selector(tss->ldt_selector, VCPU_SREG_LDTR, ctxt->vcpu); - ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu); - ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu); - ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu); - ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu); - ops->set_segment_selector(tss->fs, VCPU_SREG_FS, ctxt->vcpu); - ops->set_segment_selector(tss->gs, VCPU_SREG_GS, ctxt->vcpu); + set_segment_selector(ctxt, tss->ldt_selector, VCPU_SREG_LDTR); + set_segment_selector(ctxt, tss->es, VCPU_SREG_ES); + set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS); + set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS); + set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS); + set_segment_selector(ctxt, tss->fs, VCPU_SREG_FS); + set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS); /* * Now load segment descriptors. If fault happenes at this stage @@ -2028,7 +2337,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, int ret; u32 new_tss_base = get_desc_base(new_desc); - ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ @@ -2036,13 +2345,13 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, save_state_to_tss32(ctxt, ops, &tss_seg); - ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ return ret; - ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ @@ -2051,10 +2360,10 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, if (old_tss_sel != 0xffff) { tss_seg.prev_task_link = old_tss_sel; - ret = ops->write_std(new_tss_base, + ret = ops->write_std(ctxt, new_tss_base, &tss_seg.prev_task_link, sizeof tss_seg.prev_task_link, - ctxt->vcpu, &ctxt->exception); + &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ return ret; @@ -2070,9 +2379,9 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, { struct desc_struct curr_tss_desc, next_tss_desc; int ret; - u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu); + u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR); ulong old_tss_base = - ops->get_cached_segment_base(VCPU_SREG_TR, ctxt->vcpu); + ops->get_cached_segment_base(ctxt, VCPU_SREG_TR); u32 desc_limit; /* FIXME: old_tss_base == ~0 ? */ @@ -2088,7 +2397,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, if (reason != TASK_SWITCH_IRET) { if ((tss_selector & 3) > next_tss_desc.dpl || - ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) + ops->cpl(ctxt) > next_tss_desc.dpl) return emulate_gp(ctxt, 0); } @@ -2132,9 +2441,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, &next_tss_desc); } - ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu); - ops->set_cached_descriptor(&next_tss_desc, 0, VCPU_SREG_TR, ctxt->vcpu); - ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu); + ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS); + ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR); if (has_error_code) { struct decode_cache *c = &ctxt->decode; @@ -2142,7 +2450,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; c->lock_prefix = 0; c->src.val = (unsigned long) error_code; - emulate_push(ctxt, ops); + ret = em_push(ctxt); } return ret; @@ -2162,13 +2470,10 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason, has_error_code, error_code); - if (rc == X86EMUL_CONTINUE) { - rc = writeback(ctxt, ops); - if (rc == X86EMUL_CONTINUE) - ctxt->eip = c->eip; - } + if (rc == X86EMUL_CONTINUE) + ctxt->eip = c->eip; - return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; + return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; } static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg, @@ -2182,12 +2487,6 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg, op->addr.mem.seg = seg; } -static int em_push(struct x86_emulate_ctxt *ctxt) -{ - emulate_push(ctxt, ctxt->ops); - return X86EMUL_CONTINUE; -} - static int em_das(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; @@ -2234,7 +2533,7 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) ulong old_eip; int rc; - old_cs = ctxt->ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); + old_cs = get_segment_selector(ctxt, VCPU_SREG_CS); old_eip = c->eip; memcpy(&sel, c->src.valptr + c->op_bytes, 2); @@ -2245,20 +2544,12 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) memcpy(&c->eip, c->src.valptr, c->op_bytes); c->src.val = old_cs; - emulate_push(ctxt, ctxt->ops); - rc = writeback(ctxt, ctxt->ops); + rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) return rc; c->src.val = old_eip; - emulate_push(ctxt, ctxt->ops); - rc = writeback(ctxt, ctxt->ops); - if (rc != X86EMUL_CONTINUE) - return rc; - - c->dst.type = OP_NONE; - - return X86EMUL_CONTINUE; + return em_push(ctxt); } static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) @@ -2269,13 +2560,79 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) c->dst.type = OP_REG; c->dst.addr.reg = &c->eip; c->dst.bytes = c->op_bytes; - rc = emulate_pop(ctxt, ctxt->ops, &c->dst.val, c->op_bytes); + rc = emulate_pop(ctxt, &c->dst.val, c->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val); return X86EMUL_CONTINUE; } +static int em_add(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); + return X86EMUL_CONTINUE; +} + +static int em_or(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); + return X86EMUL_CONTINUE; +} + +static int em_adc(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); + return X86EMUL_CONTINUE; +} + +static int em_sbb(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); + return X86EMUL_CONTINUE; +} + +static int em_and(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); + return X86EMUL_CONTINUE; +} + +static int em_sub(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); + return X86EMUL_CONTINUE; +} + +static int em_xor(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags); + return X86EMUL_CONTINUE; +} + +static int em_cmp(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); + /* Disable writeback. */ + c->dst.type = OP_NONE; + return X86EMUL_CONTINUE; +} + static int em_imul(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; @@ -2306,13 +2663,10 @@ static int em_cwd(struct x86_emulate_ctxt *ctxt) static int em_rdtsc(struct x86_emulate_ctxt *ctxt) { - unsigned cpl = ctxt->ops->cpl(ctxt->vcpu); struct decode_cache *c = &ctxt->decode; u64 tsc = 0; - if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD)) - return emulate_gp(ctxt, 0); - ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc); + ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc); c->regs[VCPU_REGS_RAX] = (u32)tsc; c->regs[VCPU_REGS_RDX] = tsc >> 32; return X86EMUL_CONTINUE; @@ -2325,22 +2679,375 @@ static int em_mov(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_movdqu(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + memcpy(&c->dst.vec_val, &c->src.vec_val, c->op_bytes); + return X86EMUL_CONTINUE; +} + +static int em_invlpg(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + ulong linear; + + rc = linearize(ctxt, c->src.addr.mem, 1, false, &linear); + if (rc == X86EMUL_CONTINUE) + ctxt->ops->invlpg(ctxt, linear); + /* Disable writeback. */ + c->dst.type = OP_NONE; + return X86EMUL_CONTINUE; +} + +static int em_clts(struct x86_emulate_ctxt *ctxt) +{ + ulong cr0; + + cr0 = ctxt->ops->get_cr(ctxt, 0); + cr0 &= ~X86_CR0_TS; + ctxt->ops->set_cr(ctxt, 0, cr0); + return X86EMUL_CONTINUE; +} + +static int em_vmcall(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + + if (c->modrm_mod != 3 || c->modrm_rm != 1) + return X86EMUL_UNHANDLEABLE; + + rc = ctxt->ops->fix_hypercall(ctxt); + if (rc != X86EMUL_CONTINUE) + return rc; + + /* Let the processor re-execute the fixed hypercall */ + c->eip = ctxt->eip; + /* Disable writeback. */ + c->dst.type = OP_NONE; + return X86EMUL_CONTINUE; +} + +static int em_lgdt(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + struct desc_ptr desc_ptr; + int rc; + + rc = read_descriptor(ctxt, c->src.addr.mem, + &desc_ptr.size, &desc_ptr.address, + c->op_bytes); + if (rc != X86EMUL_CONTINUE) + return rc; + ctxt->ops->set_gdt(ctxt, &desc_ptr); + /* Disable writeback. */ + c->dst.type = OP_NONE; + return X86EMUL_CONTINUE; +} + +static int em_vmmcall(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + + rc = ctxt->ops->fix_hypercall(ctxt); + + /* Disable writeback. */ + c->dst.type = OP_NONE; + return rc; +} + +static int em_lidt(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + struct desc_ptr desc_ptr; + int rc; + + rc = read_descriptor(ctxt, c->src.addr.mem, + &desc_ptr.size, &desc_ptr.address, + c->op_bytes); + if (rc != X86EMUL_CONTINUE) + return rc; + ctxt->ops->set_idt(ctxt, &desc_ptr); + /* Disable writeback. */ + c->dst.type = OP_NONE; + return X86EMUL_CONTINUE; +} + +static int em_smsw(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + c->dst.bytes = 2; + c->dst.val = ctxt->ops->get_cr(ctxt, 0); + return X86EMUL_CONTINUE; +} + +static int em_lmsw(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul) + | (c->src.val & 0x0f)); + c->dst.type = OP_NONE; + return X86EMUL_CONTINUE; +} + +static bool valid_cr(int nr) +{ + switch (nr) { + case 0: + case 2 ... 4: + case 8: + return true; + default: + return false; + } +} + +static int check_cr_read(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + if (!valid_cr(c->modrm_reg)) + return emulate_ud(ctxt); + + return X86EMUL_CONTINUE; +} + +static int check_cr_write(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + u64 new_val = c->src.val64; + int cr = c->modrm_reg; + u64 efer = 0; + + static u64 cr_reserved_bits[] = { + 0xffffffff00000000ULL, + 0, 0, 0, /* CR3 checked later */ + CR4_RESERVED_BITS, + 0, 0, 0, + CR8_RESERVED_BITS, + }; + + if (!valid_cr(cr)) + return emulate_ud(ctxt); + + if (new_val & cr_reserved_bits[cr]) + return emulate_gp(ctxt, 0); + + switch (cr) { + case 0: { + u64 cr4; + if (((new_val & X86_CR0_PG) && !(new_val & X86_CR0_PE)) || + ((new_val & X86_CR0_NW) && !(new_val & X86_CR0_CD))) + return emulate_gp(ctxt, 0); + + cr4 = ctxt->ops->get_cr(ctxt, 4); + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); + + if ((new_val & X86_CR0_PG) && (efer & EFER_LME) && + !(cr4 & X86_CR4_PAE)) + return emulate_gp(ctxt, 0); + + break; + } + case 3: { + u64 rsvd = 0; + + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); + if (efer & EFER_LMA) + rsvd = CR3_L_MODE_RESERVED_BITS; + else if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PAE) + rsvd = CR3_PAE_RESERVED_BITS; + else if (ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PG) + rsvd = CR3_NONPAE_RESERVED_BITS; + + if (new_val & rsvd) + return emulate_gp(ctxt, 0); + + break; + } + case 4: { + u64 cr4; + + cr4 = ctxt->ops->get_cr(ctxt, 4); + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); + + if ((efer & EFER_LMA) && !(new_val & X86_CR4_PAE)) + return emulate_gp(ctxt, 0); + + break; + } + } + + return X86EMUL_CONTINUE; +} + +static int check_dr7_gd(struct x86_emulate_ctxt *ctxt) +{ + unsigned long dr7; + + ctxt->ops->get_dr(ctxt, 7, &dr7); + + /* Check if DR7.Global_Enable is set */ + return dr7 & (1 << 13); +} + +static int check_dr_read(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + int dr = c->modrm_reg; + u64 cr4; + + if (dr > 7) + return emulate_ud(ctxt); + + cr4 = ctxt->ops->get_cr(ctxt, 4); + if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5)) + return emulate_ud(ctxt); + + if (check_dr7_gd(ctxt)) + return emulate_db(ctxt); + + return X86EMUL_CONTINUE; +} + +static int check_dr_write(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + u64 new_val = c->src.val64; + int dr = c->modrm_reg; + + if ((dr == 6 || dr == 7) && (new_val & 0xffffffff00000000ULL)) + return emulate_gp(ctxt, 0); + + return check_dr_read(ctxt); +} + +static int check_svme(struct x86_emulate_ctxt *ctxt) +{ + u64 efer; + + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); + + if (!(efer & EFER_SVME)) + return emulate_ud(ctxt); + + return X86EMUL_CONTINUE; +} + +static int check_svme_pa(struct x86_emulate_ctxt *ctxt) +{ + u64 rax = ctxt->decode.regs[VCPU_REGS_RAX]; + + /* Valid physical address? */ + if (rax & 0xffff000000000000ULL) + return emulate_gp(ctxt, 0); + + return check_svme(ctxt); +} + +static int check_rdtsc(struct x86_emulate_ctxt *ctxt) +{ + u64 cr4 = ctxt->ops->get_cr(ctxt, 4); + + if (cr4 & X86_CR4_TSD && ctxt->ops->cpl(ctxt)) + return emulate_ud(ctxt); + + return X86EMUL_CONTINUE; +} + +static int check_rdpmc(struct x86_emulate_ctxt *ctxt) +{ + u64 cr4 = ctxt->ops->get_cr(ctxt, 4); + u64 rcx = ctxt->decode.regs[VCPU_REGS_RCX]; + + if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) || + (rcx > 3)) + return emulate_gp(ctxt, 0); + + return X86EMUL_CONTINUE; +} + +static int check_perm_in(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + c->dst.bytes = min(c->dst.bytes, 4u); + if (!emulator_io_permited(ctxt, ctxt->ops, c->src.val, c->dst.bytes)) + return emulate_gp(ctxt, 0); + + return X86EMUL_CONTINUE; +} + +static int check_perm_out(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + c->src.bytes = min(c->src.bytes, 4u); + if (!emulator_io_permited(ctxt, ctxt->ops, c->dst.val, c->src.bytes)) + return emulate_gp(ctxt, 0); + + return X86EMUL_CONTINUE; +} + #define D(_y) { .flags = (_y) } +#define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i } +#define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \ + .check_perm = (_p) } #define N D(0) +#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) } #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } -#define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) } +#define GD(_f, _g) { .flags = ((_f) | GroupDual), .u.gdual = (_g) } #define I(_f, _e) { .flags = (_f), .u.execute = (_e) } +#define II(_f, _e, _i) \ + { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i } +#define IIP(_f, _e, _i, _p) \ + { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i, \ + .check_perm = (_p) } +#define GP(_f, _g) { .flags = ((_f) | Prefix), .u.gprefix = (_g) } #define D2bv(_f) D((_f) | ByteOp), D(_f) +#define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p) #define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e) -#define D6ALU(_f) D2bv((_f) | DstMem | SrcReg | ModRM), \ - D2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock), \ - D2bv(((_f) & ~Lock) | DstAcc | SrcImm) +#define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e), \ + I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \ + I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e) +static struct opcode group7_rm1[] = { + DI(SrcNone | ModRM | Priv, monitor), + DI(SrcNone | ModRM | Priv, mwait), + N, N, N, N, N, N, +}; + +static struct opcode group7_rm3[] = { + DIP(SrcNone | ModRM | Prot | Priv, vmrun, check_svme_pa), + II(SrcNone | ModRM | Prot | VendorSpecific, em_vmmcall, vmmcall), + DIP(SrcNone | ModRM | Prot | Priv, vmload, check_svme_pa), + DIP(SrcNone | ModRM | Prot | Priv, vmsave, check_svme_pa), + DIP(SrcNone | ModRM | Prot | Priv, stgi, check_svme), + DIP(SrcNone | ModRM | Prot | Priv, clgi, check_svme), + DIP(SrcNone | ModRM | Prot | Priv, skinit, check_svme), + DIP(SrcNone | ModRM | Prot | Priv, invlpga, check_svme), +}; + +static struct opcode group7_rm7[] = { + N, + DIP(SrcNone | ModRM, rdtscp, check_rdtsc), + N, N, N, N, N, N, +}; static struct opcode group1[] = { - X7(D(Lock)), N + I(Lock, em_add), + I(Lock, em_or), + I(Lock, em_adc), + I(Lock, em_sbb), + I(Lock, em_and), + I(Lock, em_sub), + I(Lock, em_xor), + I(0, em_cmp), }; static struct opcode group1A[] = { @@ -2366,16 +3073,28 @@ static struct opcode group5[] = { D(SrcMem | ModRM | Stack), N, }; +static struct opcode group6[] = { + DI(ModRM | Prot, sldt), + DI(ModRM | Prot, str), + DI(ModRM | Prot | Priv, lldt), + DI(ModRM | Prot | Priv, ltr), + N, N, N, N, +}; + static struct group_dual group7 = { { - N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv), - D(SrcNone | ModRM | DstMem | Mov), N, - D(SrcMem16 | ModRM | Mov | Priv), - D(SrcMem | ModRM | ByteOp | Priv | NoAccess), + DI(ModRM | Mov | DstMem | Priv, sgdt), + DI(ModRM | Mov | DstMem | Priv, sidt), + II(ModRM | SrcMem | Priv, em_lgdt, lgdt), + II(ModRM | SrcMem | Priv, em_lidt, lidt), + II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N, + II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), + II(SrcMem | ModRM | ByteOp | Priv | NoAccess, em_invlpg, invlpg), }, { - D(SrcNone | ModRM | Priv | VendorSpecific), N, - N, D(SrcNone | ModRM | Priv | VendorSpecific), - D(SrcNone | ModRM | DstMem | Mov), N, - D(SrcMem16 | ModRM | Mov | Priv), N, + I(SrcNone | ModRM | Priv | VendorSpecific, em_vmcall), + EXT(0, group7_rm1), + N, EXT(0, group7_rm3), + II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N, + II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), EXT(0, group7_rm7), } }; static struct opcode group8[] = { @@ -2394,35 +3113,40 @@ static struct opcode group11[] = { I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)), }; +static struct gprefix pfx_0f_6f_0f_7f = { + N, N, N, I(Sse, em_movdqu), +}; + static struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ - D6ALU(Lock), + I6ALU(Lock, em_add), D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), /* 0x08 - 0x0F */ - D6ALU(Lock), + I6ALU(Lock, em_or), D(ImplicitOps | Stack | No64), N, /* 0x10 - 0x17 */ - D6ALU(Lock), + I6ALU(Lock, em_adc), D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), /* 0x18 - 0x1F */ - D6ALU(Lock), + I6ALU(Lock, em_sbb), D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), /* 0x20 - 0x27 */ - D6ALU(Lock), N, N, + I6ALU(Lock, em_and), N, N, /* 0x28 - 0x2F */ - D6ALU(Lock), N, I(ByteOp | DstAcc | No64, em_das), + I6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das), /* 0x30 - 0x37 */ - D6ALU(Lock), N, N, + I6ALU(Lock, em_xor), N, N, /* 0x38 - 0x3F */ - D6ALU(0), N, N, + I6ALU(0, em_cmp), N, N, /* 0x40 - 0x4F */ X16(D(DstReg)), /* 0x50 - 0x57 */ X8(I(SrcReg | Stack, em_push)), /* 0x58 - 0x5F */ - X8(D(DstReg | Stack)), + X8(I(DstReg | Stack, em_pop)), /* 0x60 - 0x67 */ - D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), + I(ImplicitOps | Stack | No64, em_pusha), + I(ImplicitOps | Stack | No64, em_popa), N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ , N, N, N, N, /* 0x68 - 0x6F */ @@ -2430,8 +3154,8 @@ static struct opcode opcode_table[256] = { I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op), I(SrcImmByte | Mov | Stack, em_push), I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op), - D2bv(DstDI | Mov | String), /* insb, insw/insd */ - D2bv(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */ + D2bvIP(DstDI | Mov | String, ins, check_perm_in), /* insb, insw/insd */ + D2bvIP(SrcSI | ImplicitOps | String, outs, check_perm_out), /* outsb, outsw/outsd */ /* 0x70 - 0x7F */ X16(D(SrcImmByte)), /* 0x80 - 0x87 */ @@ -2446,21 +3170,22 @@ static struct opcode opcode_table[256] = { D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg), D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A), /* 0x90 - 0x97 */ - X8(D(SrcAcc | DstReg)), + DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)), /* 0x98 - 0x9F */ D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), I(SrcImmFAddr | No64, em_call_far), N, - D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N, + II(ImplicitOps | Stack, em_pushf, pushf), + II(ImplicitOps | Stack, em_popf, popf), N, N, /* 0xA0 - 0xA7 */ I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov), I2bv(SrcSI | DstDI | Mov | String, em_mov), - D2bv(SrcSI | DstDI | String), + I2bv(SrcSI | DstDI | String, em_cmp), /* 0xA8 - 0xAF */ D2bv(DstAcc | SrcImm), I2bv(SrcAcc | DstDI | Mov | String, em_mov), I2bv(SrcSI | DstAcc | Mov | String, em_mov), - D2bv(SrcAcc | DstDI | String), + I2bv(SrcAcc | DstDI | String, em_cmp), /* 0xB0 - 0xB7 */ X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)), /* 0xB8 - 0xBF */ @@ -2473,7 +3198,8 @@ static struct opcode opcode_table[256] = { G(ByteOp, group11), G(0, group11), /* 0xC8 - 0xCF */ N, N, N, D(ImplicitOps | Stack), - D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps), + D(ImplicitOps), DI(SrcImmByte, intn), + D(ImplicitOps | No64), DI(ImplicitOps, iret), /* 0xD0 - 0xD7 */ D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM), N, N, N, N, @@ -2481,14 +3207,17 @@ static struct opcode opcode_table[256] = { N, N, N, N, N, N, N, N, /* 0xE0 - 0xE7 */ X4(D(SrcImmByte)), - D2bv(SrcImmUByte | DstAcc), D2bv(SrcAcc | DstImmUByte), + D2bvIP(SrcImmUByte | DstAcc, in, check_perm_in), + D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out), /* 0xE8 - 0xEF */ D(SrcImm | Stack), D(SrcImm | ImplicitOps), D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps), - D2bv(SrcNone | DstAcc), D2bv(SrcAcc | ImplicitOps), + D2bvIP(SrcNone | DstAcc, in, check_perm_in), + D2bvIP(SrcAcc | ImplicitOps, out, check_perm_out), /* 0xF0 - 0xF7 */ - N, N, N, N, - D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3), + N, DI(ImplicitOps, icebp), N, N, + DI(ImplicitOps | Priv, hlt), D(ImplicitOps), + G(ByteOp, group3), G(0, group3), /* 0xF8 - 0xFF */ D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), @@ -2496,20 +3225,24 @@ static struct opcode opcode_table[256] = { static struct opcode twobyte_table[256] = { /* 0x00 - 0x0F */ - N, GD(0, &group7), N, N, - N, D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv), N, - D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N, + G(0, group6), GD(0, &group7), N, N, + N, D(ImplicitOps | VendorSpecific), DI(ImplicitOps | Priv, clts), N, + DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, N, D(ImplicitOps | ModRM), N, N, /* 0x10 - 0x1F */ N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N, /* 0x20 - 0x2F */ - D(ModRM | DstMem | Priv | Op3264), D(ModRM | DstMem | Priv | Op3264), - D(ModRM | SrcMem | Priv | Op3264), D(ModRM | SrcMem | Priv | Op3264), + DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read), + DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read), + DIP(ModRM | SrcMem | Priv | Op3264, cr_write, check_cr_write), + DIP(ModRM | SrcMem | Priv | Op3264, dr_write, check_dr_write), N, N, N, N, N, N, N, N, N, N, N, N, /* 0x30 - 0x3F */ - D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc), - D(ImplicitOps | Priv), N, + DI(ImplicitOps | Priv, wrmsr), + IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), + DI(ImplicitOps | Priv, rdmsr), + DIP(ImplicitOps | Priv, rdpmc, check_rdpmc), D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific), N, N, N, N, N, N, N, N, N, N, @@ -2518,21 +3251,27 @@ static struct opcode twobyte_table[256] = { /* 0x50 - 0x5F */ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, /* 0x60 - 0x6F */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + N, N, N, N, + N, N, N, N, + N, N, N, N, + N, N, N, GP(SrcMem | DstReg | ModRM | Mov, &pfx_0f_6f_0f_7f), /* 0x70 - 0x7F */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + N, N, N, N, + N, N, N, N, + N, N, N, N, + N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_6f_0f_7f), /* 0x80 - 0x8F */ X16(D(SrcImm)), /* 0x90 - 0x9F */ X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), /* 0xA0 - 0xA7 */ D(ImplicitOps | Stack), D(ImplicitOps | Stack), - N, D(DstMem | SrcReg | ModRM | BitOp), + DI(ImplicitOps, cpuid), D(DstMem | SrcReg | ModRM | BitOp), D(DstMem | SrcReg | Src2ImmByte | ModRM), D(DstMem | SrcReg | Src2CL | ModRM), N, N, /* 0xA8 - 0xAF */ D(ImplicitOps | Stack), D(ImplicitOps | Stack), - N, D(DstMem | SrcReg | ModRM | BitOp | Lock), + DI(ImplicitOps, rsm), D(DstMem | SrcReg | ModRM | BitOp | Lock), D(DstMem | SrcReg | Src2ImmByte | ModRM), D(DstMem | SrcReg | Src2CL | ModRM), D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), @@ -2564,10 +3303,13 @@ static struct opcode twobyte_table[256] = { #undef G #undef GD #undef I +#undef GP +#undef EXT #undef D2bv +#undef D2bvIP #undef I2bv -#undef D6ALU +#undef I6ALU static unsigned imm_size(struct decode_cache *c) { @@ -2625,8 +3367,9 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; int mode = ctxt->mode; - int def_op_bytes, def_ad_bytes, dual, goffset; - struct opcode opcode, *g_mod012, *g_mod3; + int def_op_bytes, def_ad_bytes, goffset, simd_prefix; + bool op_prefix = false; + struct opcode opcode; struct operand memop = { .type = OP_NONE }; c->eip = ctxt->eip; @@ -2634,7 +3377,6 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) c->fetch.end = c->fetch.start + insn_len; if (insn_len > 0) memcpy(c->fetch.data, insn, insn_len); - ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); switch (mode) { case X86EMUL_MODE_REAL: @@ -2662,6 +3404,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) for (;;) { switch (c->b = insn_fetch(u8, 1, c->eip)) { case 0x66: /* operand-size override */ + op_prefix = true; /* switch between 2/4 bytes */ c->op_bytes = def_op_bytes ^ 6; break; @@ -2692,10 +3435,8 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) c->lock_prefix = 1; break; case 0xf2: /* REPNE/REPNZ */ - c->rep_prefix = REPNE_PREFIX; - break; case 0xf3: /* REP/REPE/REPZ */ - c->rep_prefix = REPE_PREFIX; + c->rep_prefix = c->b; break; default: goto done_prefixes; @@ -2722,29 +3463,49 @@ done_prefixes: } c->d = opcode.flags; - if (c->d & Group) { - dual = c->d & GroupDual; - c->modrm = insn_fetch(u8, 1, c->eip); - --c->eip; - - if (c->d & GroupDual) { - g_mod012 = opcode.u.gdual->mod012; - g_mod3 = opcode.u.gdual->mod3; - } else - g_mod012 = g_mod3 = opcode.u.group; - - c->d &= ~(Group | GroupDual); - - goffset = (c->modrm >> 3) & 7; + while (c->d & GroupMask) { + switch (c->d & GroupMask) { + case Group: + c->modrm = insn_fetch(u8, 1, c->eip); + --c->eip; + goffset = (c->modrm >> 3) & 7; + opcode = opcode.u.group[goffset]; + break; + case GroupDual: + c->modrm = insn_fetch(u8, 1, c->eip); + --c->eip; + goffset = (c->modrm >> 3) & 7; + if ((c->modrm >> 6) == 3) + opcode = opcode.u.gdual->mod3[goffset]; + else + opcode = opcode.u.gdual->mod012[goffset]; + break; + case RMExt: + goffset = c->modrm & 7; + opcode = opcode.u.group[goffset]; + break; + case Prefix: + if (c->rep_prefix && op_prefix) + return X86EMUL_UNHANDLEABLE; + simd_prefix = op_prefix ? 0x66 : c->rep_prefix; + switch (simd_prefix) { + case 0x00: opcode = opcode.u.gprefix->pfx_no; break; + case 0x66: opcode = opcode.u.gprefix->pfx_66; break; + case 0xf2: opcode = opcode.u.gprefix->pfx_f2; break; + case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break; + } + break; + default: + return X86EMUL_UNHANDLEABLE; + } - if ((c->modrm >> 6) == 3) - opcode = g_mod3[goffset]; - else - opcode = g_mod012[goffset]; + c->d &= ~GroupMask; c->d |= opcode.flags; } c->execute = opcode.u.execute; + c->check_perm = opcode.check_perm; + c->intercept = opcode.intercept; /* Unrecognised? */ if (c->d == 0 || (c->d & Undefined)) @@ -2763,6 +3524,9 @@ done_prefixes: c->op_bytes = 4; } + if (c->d & Sse) + c->op_bytes = 16; + /* ModRM and SIB bytes. */ if (c->d & ModRM) { rc = decode_modrm(ctxt, ops, &memop); @@ -2776,7 +3540,7 @@ done_prefixes: if (!c->has_seg_override) set_seg_override(c, VCPU_SREG_DS); - memop.addr.mem.seg = seg_override(ctxt, ops, c); + memop.addr.mem.seg = seg_override(ctxt, c); if (memop.type == OP_MEM && c->ad_bytes != 8) memop.addr.mem.ea = (u32)memop.addr.mem.ea; @@ -2792,7 +3556,7 @@ done_prefixes: case SrcNone: break; case SrcReg: - decode_register_operand(&c->src, c, 0); + decode_register_operand(ctxt, &c->src, c, 0); break; case SrcMem16: memop.bytes = 2; @@ -2836,7 +3600,7 @@ done_prefixes: c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->src.addr.mem.ea = register_address(c, c->regs[VCPU_REGS_RSI]); - c->src.addr.mem.seg = seg_override(ctxt, ops, c), + c->src.addr.mem.seg = seg_override(ctxt, c); c->src.val = 0; break; case SrcImmFAddr: @@ -2883,7 +3647,7 @@ done_prefixes: /* Decode and fetch the destination operand: register or memory. */ switch (c->d & DstMask) { case DstReg: - decode_register_operand(&c->dst, c, + decode_register_operand(ctxt, &c->dst, c, c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); break; case DstImmUByte: @@ -2926,7 +3690,7 @@ done_prefixes: } done: - return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; + return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; } static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) @@ -2979,12 +3743,51 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } + if ((c->d & Sse) + && ((ops->get_cr(ctxt, 0) & X86_CR0_EM) + || !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { + rc = emulate_ud(ctxt); + goto done; + } + + if ((c->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { + rc = emulate_nm(ctxt); + goto done; + } + + if (unlikely(ctxt->guest_mode) && c->intercept) { + rc = emulator_check_intercept(ctxt, c->intercept, + X86_ICPT_PRE_EXCEPT); + if (rc != X86EMUL_CONTINUE) + goto done; + } + /* Privileged instruction can be executed only in CPL=0 */ - if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { + if ((c->d & Priv) && ops->cpl(ctxt)) { rc = emulate_gp(ctxt, 0); goto done; } + /* Instruction can only be executed in protected mode */ + if ((c->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) { + rc = emulate_ud(ctxt); + goto done; + } + + /* Do instruction specific permission checks */ + if (c->check_perm) { + rc = c->check_perm(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; + } + + if (unlikely(ctxt->guest_mode) && c->intercept) { + rc = emulator_check_intercept(ctxt, c->intercept, + X86_ICPT_POST_EXCEPT); + if (rc != X86EMUL_CONTINUE) + goto done; + } + if (c->rep_prefix && (c->d & String)) { /* All REP prefixes have the same first termination condition */ if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { @@ -2994,16 +3797,16 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) } if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) { - rc = read_emulated(ctxt, ops, linear(ctxt, c->src.addr.mem), - c->src.valptr, c->src.bytes); + rc = segmented_read(ctxt, c->src.addr.mem, + c->src.valptr, c->src.bytes); if (rc != X86EMUL_CONTINUE) goto done; c->src.orig_val64 = c->src.val64; } if (c->src2.type == OP_MEM) { - rc = read_emulated(ctxt, ops, linear(ctxt, c->src2.addr.mem), - &c->src2.val, c->src2.bytes); + rc = segmented_read(ctxt, c->src2.addr.mem, + &c->src2.val, c->src2.bytes); if (rc != X86EMUL_CONTINUE) goto done; } @@ -3014,7 +3817,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { /* optimisation - avoid slow emulated read if Mov */ - rc = read_emulated(ctxt, ops, linear(ctxt, c->dst.addr.mem), + rc = segmented_read(ctxt, c->dst.addr.mem, &c->dst.val, c->dst.bytes); if (rc != X86EMUL_CONTINUE) goto done; @@ -3023,6 +3826,13 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) special_insn: + if (unlikely(ctxt->guest_mode) && c->intercept) { + rc = emulator_check_intercept(ctxt, c->intercept, + X86_ICPT_POST_MEMACCESS); + if (rc != X86EMUL_CONTINUE) + goto done; + } + if (c->execute) { rc = c->execute(ctxt); if (rc != X86EMUL_CONTINUE) @@ -3034,75 +3844,33 @@ special_insn: goto twobyte_insn; switch (c->b) { - case 0x00 ... 0x05: - add: /* add */ - emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); - break; case 0x06: /* push es */ - emulate_push_sreg(ctxt, ops, VCPU_SREG_ES); + rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_ES); break; case 0x07: /* pop es */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); break; - case 0x08 ... 0x0d: - or: /* or */ - emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); - break; case 0x0e: /* push cs */ - emulate_push_sreg(ctxt, ops, VCPU_SREG_CS); - break; - case 0x10 ... 0x15: - adc: /* adc */ - emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); + rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_CS); break; case 0x16: /* push ss */ - emulate_push_sreg(ctxt, ops, VCPU_SREG_SS); + rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_SS); break; case 0x17: /* pop ss */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); break; - case 0x18 ... 0x1d: - sbb: /* sbb */ - emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); - break; case 0x1e: /* push ds */ - emulate_push_sreg(ctxt, ops, VCPU_SREG_DS); + rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_DS); break; case 0x1f: /* pop ds */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); break; - case 0x20 ... 0x25: - and: /* and */ - emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); - break; - case 0x28 ... 0x2d: - sub: /* sub */ - emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); - break; - case 0x30 ... 0x35: - xor: /* xor */ - emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags); - break; - case 0x38 ... 0x3d: - cmp: /* cmp */ - emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); - break; case 0x40 ... 0x47: /* inc r16/r32 */ emulate_1op("inc", c->dst, ctxt->eflags); break; case 0x48 ... 0x4f: /* dec r16/r32 */ emulate_1op("dec", c->dst, ctxt->eflags); break; - case 0x58 ... 0x5f: /* pop reg */ - pop_instruction: - rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes); - break; - case 0x60: /* pusha */ - rc = emulate_pusha(ctxt, ops); - break; - case 0x61: /* popa */ - rc = emulate_popa(ctxt, ops); - break; case 0x63: /* movsxd */ if (ctxt->mode != X86EMUL_MODE_PROT64) goto cannot_emulate; @@ -3121,26 +3889,6 @@ special_insn: if (test_cc(c->b, ctxt->eflags)) jmp_rel(c, c->src.val); break; - case 0x80 ... 0x83: /* Grp1 */ - switch (c->modrm_reg) { - case 0: - goto add; - case 1: - goto or; - case 2: - goto adc; - case 3: - goto sbb; - case 4: - goto and; - case 5: - goto sub; - case 6: - goto xor; - case 7: - goto cmp; - } - break; case 0x84 ... 0x85: test: emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); @@ -3162,7 +3910,7 @@ special_insn: rc = emulate_ud(ctxt); goto done; } - c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); + c->dst.val = get_segment_selector(ctxt, c->modrm_reg); break; case 0x8d: /* lea r16/r32, m */ c->dst.val = c->src.addr.mem.ea; @@ -3187,7 +3935,7 @@ special_insn: break; } case 0x8f: /* pop (sole member of Grp1a) */ - rc = emulate_grp1a(ctxt, ops); + rc = em_grp1a(ctxt); break; case 0x90 ... 0x97: /* nop / xchg reg, rax */ if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX]) @@ -3200,31 +3948,17 @@ special_insn: case 8: c->dst.val = (s32)c->dst.val; break; } break; - case 0x9c: /* pushf */ - c->src.val = (unsigned long) ctxt->eflags; - emulate_push(ctxt, ops); - break; - case 0x9d: /* popf */ - c->dst.type = OP_REG; - c->dst.addr.reg = &ctxt->eflags; - c->dst.bytes = c->op_bytes; - rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes); - break; - case 0xa6 ... 0xa7: /* cmps */ - c->dst.type = OP_NONE; /* Disable writeback. */ - goto cmp; case 0xa8 ... 0xa9: /* test ax, imm */ goto test; - case 0xae ... 0xaf: /* scas */ - goto cmp; case 0xc0 ... 0xc1: - emulate_grp2(ctxt); + rc = em_grp2(ctxt); break; case 0xc3: /* ret */ c->dst.type = OP_REG; c->dst.addr.reg = &c->eip; c->dst.bytes = c->op_bytes; - goto pop_instruction; + rc = em_pop(ctxt); + break; case 0xc4: /* les */ rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES); break; @@ -3252,11 +3986,11 @@ special_insn: rc = emulate_iret(ctxt, ops); break; case 0xd0 ... 0xd1: /* Grp2 */ - emulate_grp2(ctxt); + rc = em_grp2(ctxt); break; case 0xd2 ... 0xd3: /* Grp2 */ c->src.val = c->regs[VCPU_REGS_RCX]; - emulate_grp2(ctxt); + rc = em_grp2(ctxt); break; case 0xe0 ... 0xe2: /* loop/loopz/loopnz */ register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); @@ -3278,23 +4012,14 @@ special_insn: long int rel = c->src.val; c->src.val = (unsigned long) c->eip; jmp_rel(c, rel); - emulate_push(ctxt, ops); + rc = em_push(ctxt); break; } case 0xe9: /* jmp rel */ goto jmp; - case 0xea: { /* jmp far */ - unsigned short sel; - jump_far: - memcpy(&sel, c->src.valptr + c->op_bytes, 2); - - if (load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS)) - goto done; - - c->eip = 0; - memcpy(&c->eip, c->src.valptr, c->op_bytes); + case 0xea: /* jmp far */ + rc = em_jmp_far(ctxt); break; - } case 0xeb: jmp: /* jmp rel short */ jmp_rel(c, c->src.val); @@ -3304,11 +4029,6 @@ special_insn: case 0xed: /* in (e/r)ax,dx */ c->src.val = c->regs[VCPU_REGS_RDX]; do_io_in: - c->dst.bytes = min(c->dst.bytes, 4u); - if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { - rc = emulate_gp(ctxt, 0); - goto done; - } if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, &c->dst.val)) goto done; /* IO is needed */ @@ -3317,25 +4037,19 @@ special_insn: case 0xef: /* out dx,(e/r)ax */ c->dst.val = c->regs[VCPU_REGS_RDX]; do_io_out: - c->src.bytes = min(c->src.bytes, 4u); - if (!emulator_io_permited(ctxt, ops, c->dst.val, - c->src.bytes)) { - rc = emulate_gp(ctxt, 0); - goto done; - } - ops->pio_out_emulated(c->src.bytes, c->dst.val, - &c->src.val, 1, ctxt->vcpu); + ops->pio_out_emulated(ctxt, c->src.bytes, c->dst.val, + &c->src.val, 1); c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xf4: /* hlt */ - ctxt->vcpu->arch.halt_request = 1; + ctxt->ops->halt(ctxt); break; case 0xf5: /* cmc */ /* complement carry flag from eflags reg */ ctxt->eflags ^= EFLG_CF; break; case 0xf6 ... 0xf7: /* Grp3 */ - rc = emulate_grp3(ctxt, ops); + rc = em_grp3(ctxt); break; case 0xf8: /* clc */ ctxt->eflags &= ~EFLG_CF; @@ -3366,13 +4080,11 @@ special_insn: ctxt->eflags |= EFLG_DF; break; case 0xfe: /* Grp4 */ - grp45: - rc = emulate_grp45(ctxt, ops); + rc = em_grp45(ctxt); break; case 0xff: /* Grp5 */ - if (c->modrm_reg == 5) - goto jump_far; - goto grp45; + rc = em_grp45(ctxt); + break; default: goto cannot_emulate; } @@ -3381,7 +4093,7 @@ special_insn: goto done; writeback: - rc = writeback(ctxt, ops); + rc = writeback(ctxt); if (rc != X86EMUL_CONTINUE) goto done; @@ -3392,7 +4104,7 @@ writeback: c->dst.type = saved_dst_type; if ((c->d & SrcMask) == SrcSI) - string_addr_inc(ctxt, seg_override(ctxt, ops, c), + string_addr_inc(ctxt, seg_override(ctxt, c), VCPU_REGS_RSI, &c->src); if ((c->d & DstMask) == DstDI) @@ -3427,115 +4139,34 @@ writeback: done: if (rc == X86EMUL_PROPAGATE_FAULT) ctxt->have_exception = true; + if (rc == X86EMUL_INTERCEPTED) + return EMULATION_INTERCEPTED; + return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; twobyte_insn: switch (c->b) { - case 0x01: /* lgdt, lidt, lmsw */ - switch (c->modrm_reg) { - u16 size; - unsigned long address; - - case 0: /* vmcall */ - if (c->modrm_mod != 3 || c->modrm_rm != 1) - goto cannot_emulate; - - rc = kvm_fix_hypercall(ctxt->vcpu); - if (rc != X86EMUL_CONTINUE) - goto done; - - /* Let the processor re-execute the fixed hypercall */ - c->eip = ctxt->eip; - /* Disable writeback. */ - c->dst.type = OP_NONE; - break; - case 2: /* lgdt */ - rc = read_descriptor(ctxt, ops, c->src.addr.mem, - &size, &address, c->op_bytes); - if (rc != X86EMUL_CONTINUE) - goto done; - realmode_lgdt(ctxt->vcpu, size, address); - /* Disable writeback. */ - c->dst.type = OP_NONE; - break; - case 3: /* lidt/vmmcall */ - if (c->modrm_mod == 3) { - switch (c->modrm_rm) { - case 1: - rc = kvm_fix_hypercall(ctxt->vcpu); - break; - default: - goto cannot_emulate; - } - } else { - rc = read_descriptor(ctxt, ops, c->src.addr.mem, - &size, &address, - c->op_bytes); - if (rc != X86EMUL_CONTINUE) - goto done; - realmode_lidt(ctxt->vcpu, size, address); - } - /* Disable writeback. */ - c->dst.type = OP_NONE; - break; - case 4: /* smsw */ - c->dst.bytes = 2; - c->dst.val = ops->get_cr(0, ctxt->vcpu); - break; - case 6: /* lmsw */ - ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0eul) | - (c->src.val & 0x0f), ctxt->vcpu); - c->dst.type = OP_NONE; - break; - case 5: /* not defined */ - emulate_ud(ctxt); - rc = X86EMUL_PROPAGATE_FAULT; - goto done; - case 7: /* invlpg*/ - emulate_invlpg(ctxt->vcpu, - linear(ctxt, c->src.addr.mem)); - /* Disable writeback. */ - c->dst.type = OP_NONE; - break; - default: - goto cannot_emulate; - } - break; case 0x05: /* syscall */ rc = emulate_syscall(ctxt, ops); break; case 0x06: - emulate_clts(ctxt->vcpu); + rc = em_clts(ctxt); break; case 0x09: /* wbinvd */ - kvm_emulate_wbinvd(ctxt->vcpu); + (ctxt->ops->wbinvd)(ctxt); break; case 0x08: /* invd */ case 0x0d: /* GrpP (prefetch) */ case 0x18: /* Grp16 (prefetch/nop) */ break; case 0x20: /* mov cr, reg */ - switch (c->modrm_reg) { - case 1: - case 5 ... 7: - case 9 ... 15: - emulate_ud(ctxt); - rc = X86EMUL_PROPAGATE_FAULT; - goto done; - } - c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu); + c->dst.val = ops->get_cr(ctxt, c->modrm_reg); break; case 0x21: /* mov from dr to reg */ - if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && - (c->modrm_reg == 4 || c->modrm_reg == 5)) { - emulate_ud(ctxt); - rc = X86EMUL_PROPAGATE_FAULT; - goto done; - } - ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu); + ops->get_dr(ctxt, c->modrm_reg, &c->dst.val); break; case 0x22: /* mov reg, cr */ - if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) { + if (ops->set_cr(ctxt, c->modrm_reg, c->src.val)) { emulate_gp(ctxt, 0); rc = X86EMUL_PROPAGATE_FAULT; goto done; @@ -3543,16 +4174,9 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 0x23: /* mov from reg to dr */ - if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && - (c->modrm_reg == 4 || c->modrm_reg == 5)) { - emulate_ud(ctxt); - rc = X86EMUL_PROPAGATE_FAULT; - goto done; - } - - if (ops->set_dr(c->modrm_reg, c->src.val & + if (ops->set_dr(ctxt, c->modrm_reg, c->src.val & ((ctxt->mode == X86EMUL_MODE_PROT64) ? - ~0ULL : ~0U), ctxt->vcpu) < 0) { + ~0ULL : ~0U)) < 0) { /* #UD condition is already handled by the code above */ emulate_gp(ctxt, 0); rc = X86EMUL_PROPAGATE_FAULT; @@ -3565,7 +4189,7 @@ twobyte_insn: /* wrmsr */ msr_data = (u32)c->regs[VCPU_REGS_RAX] | ((u64)c->regs[VCPU_REGS_RDX] << 32); - if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { + if (ops->set_msr(ctxt, c->regs[VCPU_REGS_RCX], msr_data)) { emulate_gp(ctxt, 0); rc = X86EMUL_PROPAGATE_FAULT; goto done; @@ -3574,7 +4198,7 @@ twobyte_insn: break; case 0x32: /* rdmsr */ - if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { + if (ops->get_msr(ctxt, c->regs[VCPU_REGS_RCX], &msr_data)) { emulate_gp(ctxt, 0); rc = X86EMUL_PROPAGATE_FAULT; goto done; @@ -3603,7 +4227,7 @@ twobyte_insn: c->dst.val = test_cc(c->b, ctxt->eflags); break; case 0xa0: /* push fs */ - emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); + rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); break; case 0xa1: /* pop fs */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); @@ -3620,7 +4244,7 @@ twobyte_insn: emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); break; case 0xa8: /* push gs */ - emulate_push_sreg(ctxt, ops, VCPU_SREG_GS); + rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_GS); break; case 0xa9: /* pop gs */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); @@ -3727,7 +4351,7 @@ twobyte_insn: (u64) c->src.val; break; case 0xc7: /* Grp9 (cmpxchg8b) */ - rc = emulate_grp9(ctxt, ops); + rc = em_grp9(ctxt); break; default: goto cannot_emulate; @@ -3739,5 +4363,5 @@ twobyte_insn: goto writeback; cannot_emulate: - return -1; + return EMULATION_FAILED; } diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 46d08ca0b48..51a97426e79 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -33,7 +33,6 @@ struct kvm_kpit_state { }; struct kvm_pit { - unsigned long base_addresss; struct kvm_io_device dev; struct kvm_io_device speaker_dev; struct kvm *kvm; @@ -51,7 +50,6 @@ struct kvm_pit { #define KVM_MAX_PIT_INTR_INTERVAL HZ / 100 #define KVM_PIT_CHANNEL_MASK 0x3 -void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu); void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start); struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags); void kvm_free_pit(struct kvm *kvm); diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index ba910d14941..53e2d084bff 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -75,7 +75,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm); void kvm_destroy_pic(struct kvm *kvm); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); -void kvm_pic_clear_isr_ack(struct kvm *kvm); static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) { @@ -100,7 +99,6 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu); void __kvm_migrate_timers(struct kvm_vcpu *vcpu); -int pit_has_pending_timer(struct kvm_vcpu *vcpu); int apic_has_pending_timer(struct kvm_vcpu *vcpu); #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 22fae7593ee..28418054b88 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1206,7 +1206,7 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) static void nonpaging_update_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, - const void *pte, unsigned long mmu_seq) + const void *pte) { WARN_ON(1); } @@ -3163,9 +3163,8 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, } static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, - u64 *spte, - const void *new, unsigned long mmu_seq) + struct kvm_mmu_page *sp, u64 *spte, + const void *new) { if (sp->role.level != PT_PAGE_TABLE_LEVEL) { ++vcpu->kvm->stat.mmu_pde_zapped; @@ -3173,7 +3172,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, } ++vcpu->kvm->stat.mmu_pte_updated; - vcpu->arch.mmu.update_pte(vcpu, sp, spte, new, mmu_seq); + vcpu->arch.mmu.update_pte(vcpu, sp, spte, new); } static bool need_remote_flush(u64 old, u64 new) @@ -3229,7 +3228,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_mmu_page *sp; struct hlist_node *node; LIST_HEAD(invalid_list); - unsigned long mmu_seq; u64 entry, gentry, *spte; unsigned pte_size, page_offset, misaligned, quadrant, offset; int level, npte, invlpg_counter, r, flooded = 0; @@ -3271,9 +3269,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, break; } - mmu_seq = vcpu->kvm->mmu_notifier_seq; - smp_rmb(); - spin_lock(&vcpu->kvm->mmu_lock); if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter) gentry = 0; @@ -3345,8 +3340,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if (gentry && !((sp->role.word ^ vcpu->arch.mmu.base_role.word) & mask.word)) - mmu_pte_write_new_pte(vcpu, sp, spte, &gentry, - mmu_seq); + mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); if (!remote_flush && need_remote_flush(entry, *spte)) remote_flush = true; ++spte; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index c6397795d86..6c4dc010c4c 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -78,15 +78,19 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; } -static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, - gfn_t table_gfn, unsigned index, - pt_element_t orig_pte, pt_element_t new_pte) +static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + pt_element_t __user *ptep_user, unsigned index, + pt_element_t orig_pte, pt_element_t new_pte) { + int npages; pt_element_t ret; pt_element_t *table; struct page *page; - page = gfn_to_page(kvm, table_gfn); + npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page); + /* Check if the user is doing something meaningless. */ + if (unlikely(npages != 1)) + return -EFAULT; table = kmap_atomic(page, KM_USER0); ret = CMPXCHG(&table[index], orig_pte, new_pte); @@ -117,6 +121,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, gva_t addr, u32 access) { pt_element_t pte; + pt_element_t __user *ptep_user; gfn_t table_gfn; unsigned index, pt_access, uninitialized_var(pte_access); gpa_t pte_gpa; @@ -152,6 +157,9 @@ walk: pt_access = ACC_ALL; for (;;) { + gfn_t real_gfn; + unsigned long host_addr; + index = PT_INDEX(addr, walker->level); table_gfn = gpte_to_gfn(pte); @@ -160,43 +168,64 @@ walk: walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; - if (kvm_read_guest_page_mmu(vcpu, mmu, table_gfn, &pte, - offset, sizeof(pte), - PFERR_USER_MASK|PFERR_WRITE_MASK)) { + real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn), + PFERR_USER_MASK|PFERR_WRITE_MASK); + if (unlikely(real_gfn == UNMAPPED_GVA)) { + present = false; + break; + } + real_gfn = gpa_to_gfn(real_gfn); + + host_addr = gfn_to_hva(vcpu->kvm, real_gfn); + if (unlikely(kvm_is_error_hva(host_addr))) { + present = false; + break; + } + + ptep_user = (pt_element_t __user *)((void *)host_addr + offset); + if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) { present = false; break; } trace_kvm_mmu_paging_element(pte, walker->level); - if (!is_present_gpte(pte)) { + if (unlikely(!is_present_gpte(pte))) { present = false; break; } - if (is_rsvd_bits_set(&vcpu->arch.mmu, pte, walker->level)) { + if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte, + walker->level))) { rsvd_fault = true; break; } - if (write_fault && !is_writable_pte(pte)) - if (user_fault || is_write_protection(vcpu)) - eperm = true; + if (unlikely(write_fault && !is_writable_pte(pte) + && (user_fault || is_write_protection(vcpu)))) + eperm = true; - if (user_fault && !(pte & PT_USER_MASK)) + if (unlikely(user_fault && !(pte & PT_USER_MASK))) eperm = true; #if PTTYPE == 64 - if (fetch_fault && (pte & PT64_NX_MASK)) + if (unlikely(fetch_fault && (pte & PT64_NX_MASK))) eperm = true; #endif - if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) { + if (!eperm && !rsvd_fault + && unlikely(!(pte & PT_ACCESSED_MASK))) { + int ret; trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); - if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, - index, pte, pte|PT_ACCESSED_MASK)) + ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, + pte, pte|PT_ACCESSED_MASK); + if (unlikely(ret < 0)) { + present = false; + break; + } else if (ret) goto walk; + mark_page_dirty(vcpu->kvm, table_gfn); pte |= PT_ACCESSED_MASK; } @@ -241,17 +270,21 @@ walk: --walker->level; } - if (!present || eperm || rsvd_fault) + if (unlikely(!present || eperm || rsvd_fault)) goto error; - if (write_fault && !is_dirty_gpte(pte)) { - bool ret; + if (write_fault && unlikely(!is_dirty_gpte(pte))) { + int ret; trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); - ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, - pte|PT_DIRTY_MASK); - if (ret) + ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, + pte, pte|PT_DIRTY_MASK); + if (unlikely(ret < 0)) { + present = false; + goto error; + } else if (ret) goto walk; + mark_page_dirty(vcpu->kvm, table_gfn); pte |= PT_DIRTY_MASK; walker->ptes[walker->level - 1] = pte; @@ -325,7 +358,7 @@ no_present: } static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *spte, const void *pte, unsigned long mmu_seq) + u64 *spte, const void *pte) { pt_element_t gpte; unsigned pte_access; @@ -342,8 +375,6 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, kvm_release_pfn_clean(pfn); return; } - if (mmu_notifier_retry(vcpu, mmu_seq)) - return; /* * we call mmu_set_spte() with host_writable = true because that diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 6bb15d583e4..506e4fe23ad 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -63,6 +63,10 @@ MODULE_LICENSE("GPL"); #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) +#define TSC_RATIO_RSVD 0xffffff0000000000ULL +#define TSC_RATIO_MIN 0x0000000000000001ULL +#define TSC_RATIO_MAX 0x000000ffffffffffULL + static bool erratum_383_found __read_mostly; static const u32 host_save_user_msrs[] = { @@ -93,14 +97,6 @@ struct nested_state { /* A VMEXIT is required but not yet emulated */ bool exit_required; - /* - * If we vmexit during an instruction emulation we need this to restore - * the l1 guest rip after the emulation - */ - unsigned long vmexit_rip; - unsigned long vmexit_rsp; - unsigned long vmexit_rax; - /* cache for intercepts of the guest */ u32 intercept_cr; u32 intercept_dr; @@ -144,8 +140,13 @@ struct vcpu_svm { unsigned int3_injected; unsigned long int3_rip; u32 apf_reason; + + u64 tsc_ratio; }; +static DEFINE_PER_CPU(u64, current_tsc_ratio); +#define TSC_RATIO_DEFAULT 0x0100000000ULL + #define MSR_INVALID 0xffffffffU static struct svm_direct_access_msrs { @@ -190,6 +191,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm); static int nested_svm_vmexit(struct vcpu_svm *svm); static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); +static u64 __scale_tsc(u64 ratio, u64 tsc); enum { VMCB_INTERCEPTS, /* Intercept vectors, TSC offset, @@ -376,7 +378,6 @@ struct svm_cpu_data { }; static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); -static uint32_t svm_features; struct svm_init_data { int cpu; @@ -569,6 +570,10 @@ static int has_svm(void) static void svm_hardware_disable(void *garbage) { + /* Make sure we clean up behind us */ + if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) + wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); + cpu_svm_disable(); } @@ -610,6 +615,11 @@ static int svm_hardware_enable(void *garbage) wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT); + if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { + wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); + __get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT; + } + svm_init_erratum_383(); return 0; @@ -791,6 +801,23 @@ static __init int svm_hardware_setup(void) if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) kvm_enable_efer_bits(EFER_FFXSR); + if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { + u64 max; + + kvm_has_tsc_control = true; + + /* + * Make sure the user can only configure tsc_khz values that + * fit into a signed integer. + * A min value is not calculated needed because it will always + * be 1 on all machines and a value of 0 is used to disable + * tsc-scaling for the vcpu. + */ + max = min(0x7fffffffULL, __scale_tsc(tsc_khz, TSC_RATIO_MAX)); + + kvm_max_guest_tsc_khz = max; + } + if (nested) { printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); @@ -802,8 +829,6 @@ static __init int svm_hardware_setup(void) goto err; } - svm_features = cpuid_edx(SVM_CPUID_FUNC); - if (!boot_cpu_has(X86_FEATURE_NPT)) npt_enabled = false; @@ -854,6 +879,64 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) seg->base = 0; } +static u64 __scale_tsc(u64 ratio, u64 tsc) +{ + u64 mult, frac, _tsc; + + mult = ratio >> 32; + frac = ratio & ((1ULL << 32) - 1); + + _tsc = tsc; + _tsc *= mult; + _tsc += (tsc >> 32) * frac; + _tsc += ((tsc & ((1ULL << 32) - 1)) * frac) >> 32; + + return _tsc; +} + +static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u64 _tsc = tsc; + + if (svm->tsc_ratio != TSC_RATIO_DEFAULT) + _tsc = __scale_tsc(svm->tsc_ratio, tsc); + + return _tsc; +} + +static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u64 ratio; + u64 khz; + + /* TSC scaling supported? */ + if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) + return; + + /* TSC-Scaling disabled or guest TSC same frequency as host TSC? */ + if (user_tsc_khz == 0) { + vcpu->arch.virtual_tsc_khz = 0; + svm->tsc_ratio = TSC_RATIO_DEFAULT; + return; + } + + khz = user_tsc_khz; + + /* TSC scaling required - calculate ratio */ + ratio = khz << 32; + do_div(ratio, tsc_khz); + + if (ratio == 0 || ratio & TSC_RATIO_RSVD) { + WARN_ONCE(1, "Invalid TSC ratio - virtual-tsc-khz=%u\n", + user_tsc_khz); + return; + } + vcpu->arch.virtual_tsc_khz = user_tsc_khz; + svm->tsc_ratio = ratio; +} + static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { struct vcpu_svm *svm = to_svm(vcpu); @@ -880,6 +963,15 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } +static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) +{ + u64 tsc; + + tsc = svm_scale_tsc(vcpu, native_read_tsc()); + + return target_tsc - tsc; +} + static void init_vmcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -975,7 +1067,7 @@ static void init_vmcb(struct vcpu_svm *svm) svm_set_efer(&svm->vcpu, 0); save->dr6 = 0xffff0ff0; save->dr7 = 0x400; - save->rflags = 2; + kvm_set_rflags(&svm->vcpu, 2); save->rip = 0x0000fff0; svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; @@ -1048,6 +1140,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) goto out; } + svm->tsc_ratio = TSC_RATIO_DEFAULT; + err = kvm_vcpu_init(&svm->vcpu, kvm, id); if (err) goto free_svm; @@ -1141,6 +1235,12 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); + + if (static_cpu_has(X86_FEATURE_TSCRATEMSR) && + svm->tsc_ratio != __get_cpu_var(current_tsc_ratio)) { + __get_cpu_var(current_tsc_ratio) = svm->tsc_ratio; + wrmsrl(MSR_AMD64_TSC_RATIO, svm->tsc_ratio); + } } static void svm_vcpu_put(struct kvm_vcpu *vcpu) @@ -1365,31 +1465,6 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_svm *svm = to_svm(vcpu); - if (is_guest_mode(vcpu)) { - /* - * We are here because we run in nested mode, the host kvm - * intercepts cr0 writes but the l1 hypervisor does not. - * But the L1 hypervisor may intercept selective cr0 writes. - * This needs to be checked here. - */ - unsigned long old, new; - - /* Remove bits that would trigger a real cr0 write intercept */ - old = vcpu->arch.cr0 & SVM_CR0_SELECTIVE_MASK; - new = cr0 & SVM_CR0_SELECTIVE_MASK; - - if (old == new) { - /* cr0 write with ts and mp unchanged */ - svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; - if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) { - svm->nested.vmexit_rip = kvm_rip_read(vcpu); - svm->nested.vmexit_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); - svm->nested.vmexit_rax = kvm_register_read(vcpu, VCPU_REGS_RAX); - return; - } - } - } - #ifdef CONFIG_X86_64 if (vcpu->arch.efer & EFER_LME) { if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { @@ -2127,7 +2202,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu); nested_vmcb->save.cr2 = vmcb->save.cr2; nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; - nested_vmcb->save.rflags = vmcb->save.rflags; + nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu); nested_vmcb->save.rip = vmcb->save.rip; nested_vmcb->save.rsp = vmcb->save.rsp; nested_vmcb->save.rax = vmcb->save.rax; @@ -2184,7 +2259,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) svm->vmcb->save.ds = hsave->save.ds; svm->vmcb->save.gdtr = hsave->save.gdtr; svm->vmcb->save.idtr = hsave->save.idtr; - svm->vmcb->save.rflags = hsave->save.rflags; + kvm_set_rflags(&svm->vcpu, hsave->save.rflags); svm_set_efer(&svm->vcpu, hsave->save.efer); svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE); svm_set_cr4(&svm->vcpu, hsave->save.cr4); @@ -2312,7 +2387,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) hsave->save.efer = svm->vcpu.arch.efer; hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); hsave->save.cr4 = svm->vcpu.arch.cr4; - hsave->save.rflags = vmcb->save.rflags; + hsave->save.rflags = kvm_get_rflags(&svm->vcpu); hsave->save.rip = kvm_rip_read(&svm->vcpu); hsave->save.rsp = vmcb->save.rsp; hsave->save.rax = vmcb->save.rax; @@ -2323,7 +2398,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) copy_vmcb_control_area(hsave, vmcb); - if (svm->vmcb->save.rflags & X86_EFLAGS_IF) + if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF) svm->vcpu.arch.hflags |= HF_HIF_MASK; else svm->vcpu.arch.hflags &= ~HF_HIF_MASK; @@ -2341,7 +2416,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) svm->vmcb->save.ds = nested_vmcb->save.ds; svm->vmcb->save.gdtr = nested_vmcb->save.gdtr; svm->vmcb->save.idtr = nested_vmcb->save.idtr; - svm->vmcb->save.rflags = nested_vmcb->save.rflags; + kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags); svm_set_efer(&svm->vcpu, nested_vmcb->save.efer); svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0); svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4); @@ -2443,13 +2518,13 @@ static int vmload_interception(struct vcpu_svm *svm) if (nested_svm_check_permissions(svm)) return 1; - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); if (!nested_vmcb) return 1; + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); + nested_svm_vmloadsave(nested_vmcb, svm->vmcb); nested_svm_unmap(page); @@ -2464,13 +2539,13 @@ static int vmsave_interception(struct vcpu_svm *svm) if (nested_svm_check_permissions(svm)) return 1; - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); if (!nested_vmcb) return 1; + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); + nested_svm_vmloadsave(svm->vmcb, nested_vmcb); nested_svm_unmap(page); @@ -2676,6 +2751,29 @@ static int emulate_on_interception(struct vcpu_svm *svm) return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; } +bool check_selective_cr0_intercepted(struct vcpu_svm *svm, unsigned long val) +{ + unsigned long cr0 = svm->vcpu.arch.cr0; + bool ret = false; + u64 intercept; + + intercept = svm->nested.intercept; + + if (!is_guest_mode(&svm->vcpu) || + (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))) + return false; + + cr0 &= ~SVM_CR0_SELECTIVE_MASK; + val &= ~SVM_CR0_SELECTIVE_MASK; + + if (cr0 ^ val) { + svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; + ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE); + } + + return ret; +} + #define CR_VALID (1ULL << 63) static int cr_interception(struct vcpu_svm *svm) @@ -2699,7 +2797,11 @@ static int cr_interception(struct vcpu_svm *svm) val = kvm_register_read(&svm->vcpu, reg); switch (cr) { case 0: - err = kvm_set_cr0(&svm->vcpu, val); + if (!check_selective_cr0_intercepted(svm, val)) + err = kvm_set_cr0(&svm->vcpu, val); + else + return 1; + break; case 3: err = kvm_set_cr3(&svm->vcpu, val); @@ -2744,23 +2846,6 @@ static int cr_interception(struct vcpu_svm *svm) return 1; } -static int cr0_write_interception(struct vcpu_svm *svm) -{ - struct kvm_vcpu *vcpu = &svm->vcpu; - int r; - - r = cr_interception(svm); - - if (svm->nested.vmexit_rip) { - kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip); - kvm_register_write(vcpu, VCPU_REGS_RSP, svm->nested.vmexit_rsp); - kvm_register_write(vcpu, VCPU_REGS_RAX, svm->nested.vmexit_rax); - svm->nested.vmexit_rip = 0; - } - - return r; -} - static int dr_interception(struct vcpu_svm *svm) { int reg, dr; @@ -2813,7 +2898,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) case MSR_IA32_TSC: { struct vmcb *vmcb = get_host_vmcb(svm); - *data = vmcb->control.tsc_offset + native_read_tsc(); + *data = vmcb->control.tsc_offset + + svm_scale_tsc(vcpu, native_read_tsc()); + break; } case MSR_STAR: @@ -3048,7 +3135,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_CR4] = cr_interception, [SVM_EXIT_READ_CR8] = cr_interception, [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, - [SVM_EXIT_WRITE_CR0] = cr0_write_interception, + [SVM_EXIT_WRITE_CR0] = cr_interception, [SVM_EXIT_WRITE_CR3] = cr_interception, [SVM_EXIT_WRITE_CR4] = cr_interception, [SVM_EXIT_WRITE_CR8] = cr8_write_interception, @@ -3104,97 +3191,109 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_NPF] = pf_interception, }; -void dump_vmcb(struct kvm_vcpu *vcpu) +static void dump_vmcb(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; pr_err("VMCB Control Area:\n"); - pr_err("cr_read: %04x\n", control->intercept_cr & 0xffff); - pr_err("cr_write: %04x\n", control->intercept_cr >> 16); - pr_err("dr_read: %04x\n", control->intercept_dr & 0xffff); - pr_err("dr_write: %04x\n", control->intercept_dr >> 16); - pr_err("exceptions: %08x\n", control->intercept_exceptions); - pr_err("intercepts: %016llx\n", control->intercept); - pr_err("pause filter count: %d\n", control->pause_filter_count); - pr_err("iopm_base_pa: %016llx\n", control->iopm_base_pa); - pr_err("msrpm_base_pa: %016llx\n", control->msrpm_base_pa); - pr_err("tsc_offset: %016llx\n", control->tsc_offset); - pr_err("asid: %d\n", control->asid); - pr_err("tlb_ctl: %d\n", control->tlb_ctl); - pr_err("int_ctl: %08x\n", control->int_ctl); - pr_err("int_vector: %08x\n", control->int_vector); - pr_err("int_state: %08x\n", control->int_state); - pr_err("exit_code: %08x\n", control->exit_code); - pr_err("exit_info1: %016llx\n", control->exit_info_1); - pr_err("exit_info2: %016llx\n", control->exit_info_2); - pr_err("exit_int_info: %08x\n", control->exit_int_info); - pr_err("exit_int_info_err: %08x\n", control->exit_int_info_err); - pr_err("nested_ctl: %lld\n", control->nested_ctl); - pr_err("nested_cr3: %016llx\n", control->nested_cr3); - pr_err("event_inj: %08x\n", control->event_inj); - pr_err("event_inj_err: %08x\n", control->event_inj_err); - pr_err("lbr_ctl: %lld\n", control->lbr_ctl); - pr_err("next_rip: %016llx\n", control->next_rip); + pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff); + pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16); + pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff); + pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16); + pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions); + pr_err("%-20s%016llx\n", "intercepts:", control->intercept); + pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count); + pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa); + pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa); + pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset); + pr_err("%-20s%d\n", "asid:", control->asid); + pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl); + pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl); + pr_err("%-20s%08x\n", "int_vector:", control->int_vector); + pr_err("%-20s%08x\n", "int_state:", control->int_state); + pr_err("%-20s%08x\n", "exit_code:", control->exit_code); + pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1); + pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2); + pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info); + pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err); + pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl); + pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3); + pr_err("%-20s%08x\n", "event_inj:", control->event_inj); + pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err); + pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl); + pr_err("%-20s%016llx\n", "next_rip:", control->next_rip); pr_err("VMCB State Save Area:\n"); - pr_err("es: s: %04x a: %04x l: %08x b: %016llx\n", - save->es.selector, save->es.attrib, - save->es.limit, save->es.base); - pr_err("cs: s: %04x a: %04x l: %08x b: %016llx\n", - save->cs.selector, save->cs.attrib, - save->cs.limit, save->cs.base); - pr_err("ss: s: %04x a: %04x l: %08x b: %016llx\n", - save->ss.selector, save->ss.attrib, - save->ss.limit, save->ss.base); - pr_err("ds: s: %04x a: %04x l: %08x b: %016llx\n", - save->ds.selector, save->ds.attrib, - save->ds.limit, save->ds.base); - pr_err("fs: s: %04x a: %04x l: %08x b: %016llx\n", - save->fs.selector, save->fs.attrib, - save->fs.limit, save->fs.base); - pr_err("gs: s: %04x a: %04x l: %08x b: %016llx\n", - save->gs.selector, save->gs.attrib, - save->gs.limit, save->gs.base); - pr_err("gdtr: s: %04x a: %04x l: %08x b: %016llx\n", - save->gdtr.selector, save->gdtr.attrib, - save->gdtr.limit, save->gdtr.base); - pr_err("ldtr: s: %04x a: %04x l: %08x b: %016llx\n", - save->ldtr.selector, save->ldtr.attrib, - save->ldtr.limit, save->ldtr.base); - pr_err("idtr: s: %04x a: %04x l: %08x b: %016llx\n", - save->idtr.selector, save->idtr.attrib, - save->idtr.limit, save->idtr.base); - pr_err("tr: s: %04x a: %04x l: %08x b: %016llx\n", - save->tr.selector, save->tr.attrib, - save->tr.limit, save->tr.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "es:", + save->es.selector, save->es.attrib, + save->es.limit, save->es.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "cs:", + save->cs.selector, save->cs.attrib, + save->cs.limit, save->cs.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "ss:", + save->ss.selector, save->ss.attrib, + save->ss.limit, save->ss.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "ds:", + save->ds.selector, save->ds.attrib, + save->ds.limit, save->ds.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "fs:", + save->fs.selector, save->fs.attrib, + save->fs.limit, save->fs.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "gs:", + save->gs.selector, save->gs.attrib, + save->gs.limit, save->gs.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "gdtr:", + save->gdtr.selector, save->gdtr.attrib, + save->gdtr.limit, save->gdtr.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "ldtr:", + save->ldtr.selector, save->ldtr.attrib, + save->ldtr.limit, save->ldtr.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "idtr:", + save->idtr.selector, save->idtr.attrib, + save->idtr.limit, save->idtr.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "tr:", + save->tr.selector, save->tr.attrib, + save->tr.limit, save->tr.base); pr_err("cpl: %d efer: %016llx\n", save->cpl, save->efer); - pr_err("cr0: %016llx cr2: %016llx\n", - save->cr0, save->cr2); - pr_err("cr3: %016llx cr4: %016llx\n", - save->cr3, save->cr4); - pr_err("dr6: %016llx dr7: %016llx\n", - save->dr6, save->dr7); - pr_err("rip: %016llx rflags: %016llx\n", - save->rip, save->rflags); - pr_err("rsp: %016llx rax: %016llx\n", - save->rsp, save->rax); - pr_err("star: %016llx lstar: %016llx\n", - save->star, save->lstar); - pr_err("cstar: %016llx sfmask: %016llx\n", - save->cstar, save->sfmask); - pr_err("kernel_gs_base: %016llx sysenter_cs: %016llx\n", - save->kernel_gs_base, save->sysenter_cs); - pr_err("sysenter_esp: %016llx sysenter_eip: %016llx\n", - save->sysenter_esp, save->sysenter_eip); - pr_err("gpat: %016llx dbgctl: %016llx\n", - save->g_pat, save->dbgctl); - pr_err("br_from: %016llx br_to: %016llx\n", - save->br_from, save->br_to); - pr_err("excp_from: %016llx excp_to: %016llx\n", - save->last_excp_from, save->last_excp_to); - + pr_err("%-15s %016llx %-13s %016llx\n", + "cr0:", save->cr0, "cr2:", save->cr2); + pr_err("%-15s %016llx %-13s %016llx\n", + "cr3:", save->cr3, "cr4:", save->cr4); + pr_err("%-15s %016llx %-13s %016llx\n", + "dr6:", save->dr6, "dr7:", save->dr7); + pr_err("%-15s %016llx %-13s %016llx\n", + "rip:", save->rip, "rflags:", save->rflags); + pr_err("%-15s %016llx %-13s %016llx\n", + "rsp:", save->rsp, "rax:", save->rax); + pr_err("%-15s %016llx %-13s %016llx\n", + "star:", save->star, "lstar:", save->lstar); + pr_err("%-15s %016llx %-13s %016llx\n", + "cstar:", save->cstar, "sfmask:", save->sfmask); + pr_err("%-15s %016llx %-13s %016llx\n", + "kernel_gs_base:", save->kernel_gs_base, + "sysenter_cs:", save->sysenter_cs); + pr_err("%-15s %016llx %-13s %016llx\n", + "sysenter_esp:", save->sysenter_esp, + "sysenter_eip:", save->sysenter_eip); + pr_err("%-15s %016llx %-13s %016llx\n", + "gpat:", save->g_pat, "dbgctl:", save->dbgctl); + pr_err("%-15s %016llx %-13s %016llx\n", + "br_from:", save->br_from, "br_to:", save->br_to); + pr_err("%-15s %016llx %-13s %016llx\n", + "excp_from:", save->last_excp_from, + "excp_to:", save->last_excp_to); } static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) @@ -3384,7 +3483,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)) return 0; - ret = !!(vmcb->save.rflags & X86_EFLAGS_IF); + ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF); if (is_guest_mode(vcpu)) return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK); @@ -3871,6 +3970,186 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) update_cr0_intercept(svm); } +#define PRE_EX(exit) { .exit_code = (exit), \ + .stage = X86_ICPT_PRE_EXCEPT, } +#define POST_EX(exit) { .exit_code = (exit), \ + .stage = X86_ICPT_POST_EXCEPT, } +#define POST_MEM(exit) { .exit_code = (exit), \ + .stage = X86_ICPT_POST_MEMACCESS, } + +static struct __x86_intercept { + u32 exit_code; + enum x86_intercept_stage stage; +} x86_intercept_map[] = { + [x86_intercept_cr_read] = POST_EX(SVM_EXIT_READ_CR0), + [x86_intercept_cr_write] = POST_EX(SVM_EXIT_WRITE_CR0), + [x86_intercept_clts] = POST_EX(SVM_EXIT_WRITE_CR0), + [x86_intercept_lmsw] = POST_EX(SVM_EXIT_WRITE_CR0), + [x86_intercept_smsw] = POST_EX(SVM_EXIT_READ_CR0), + [x86_intercept_dr_read] = POST_EX(SVM_EXIT_READ_DR0), + [x86_intercept_dr_write] = POST_EX(SVM_EXIT_WRITE_DR0), + [x86_intercept_sldt] = POST_EX(SVM_EXIT_LDTR_READ), + [x86_intercept_str] = POST_EX(SVM_EXIT_TR_READ), + [x86_intercept_lldt] = POST_EX(SVM_EXIT_LDTR_WRITE), + [x86_intercept_ltr] = POST_EX(SVM_EXIT_TR_WRITE), + [x86_intercept_sgdt] = POST_EX(SVM_EXIT_GDTR_READ), + [x86_intercept_sidt] = POST_EX(SVM_EXIT_IDTR_READ), + [x86_intercept_lgdt] = POST_EX(SVM_EXIT_GDTR_WRITE), + [x86_intercept_lidt] = POST_EX(SVM_EXIT_IDTR_WRITE), + [x86_intercept_vmrun] = POST_EX(SVM_EXIT_VMRUN), + [x86_intercept_vmmcall] = POST_EX(SVM_EXIT_VMMCALL), + [x86_intercept_vmload] = POST_EX(SVM_EXIT_VMLOAD), + [x86_intercept_vmsave] = POST_EX(SVM_EXIT_VMSAVE), + [x86_intercept_stgi] = POST_EX(SVM_EXIT_STGI), + [x86_intercept_clgi] = POST_EX(SVM_EXIT_CLGI), + [x86_intercept_skinit] = POST_EX(SVM_EXIT_SKINIT), + [x86_intercept_invlpga] = POST_EX(SVM_EXIT_INVLPGA), + [x86_intercept_rdtscp] = POST_EX(SVM_EXIT_RDTSCP), + [x86_intercept_monitor] = POST_MEM(SVM_EXIT_MONITOR), + [x86_intercept_mwait] = POST_EX(SVM_EXIT_MWAIT), + [x86_intercept_invlpg] = POST_EX(SVM_EXIT_INVLPG), + [x86_intercept_invd] = POST_EX(SVM_EXIT_INVD), + [x86_intercept_wbinvd] = POST_EX(SVM_EXIT_WBINVD), + [x86_intercept_wrmsr] = POST_EX(SVM_EXIT_MSR), + [x86_intercept_rdtsc] = POST_EX(SVM_EXIT_RDTSC), + [x86_intercept_rdmsr] = POST_EX(SVM_EXIT_MSR), + [x86_intercept_rdpmc] = POST_EX(SVM_EXIT_RDPMC), + [x86_intercept_cpuid] = PRE_EX(SVM_EXIT_CPUID), + [x86_intercept_rsm] = PRE_EX(SVM_EXIT_RSM), + [x86_intercept_pause] = PRE_EX(SVM_EXIT_PAUSE), + [x86_intercept_pushf] = PRE_EX(SVM_EXIT_PUSHF), + [x86_intercept_popf] = PRE_EX(SVM_EXIT_POPF), + [x86_intercept_intn] = PRE_EX(SVM_EXIT_SWINT), + [x86_intercept_iret] = PRE_EX(SVM_EXIT_IRET), + [x86_intercept_icebp] = PRE_EX(SVM_EXIT_ICEBP), + [x86_intercept_hlt] = POST_EX(SVM_EXIT_HLT), + [x86_intercept_in] = POST_EX(SVM_EXIT_IOIO), + [x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO), + [x86_intercept_out] = POST_EX(SVM_EXIT_IOIO), + [x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO), +}; + +#undef PRE_EX +#undef POST_EX +#undef POST_MEM + +static int svm_check_intercept(struct kvm_vcpu *vcpu, + struct x86_instruction_info *info, + enum x86_intercept_stage stage) +{ + struct vcpu_svm *svm = to_svm(vcpu); + int vmexit, ret = X86EMUL_CONTINUE; + struct __x86_intercept icpt_info; + struct vmcb *vmcb = svm->vmcb; + + if (info->intercept >= ARRAY_SIZE(x86_intercept_map)) + goto out; + + icpt_info = x86_intercept_map[info->intercept]; + + if (stage != icpt_info.stage) + goto out; + + switch (icpt_info.exit_code) { + case SVM_EXIT_READ_CR0: + if (info->intercept == x86_intercept_cr_read) + icpt_info.exit_code += info->modrm_reg; + break; + case SVM_EXIT_WRITE_CR0: { + unsigned long cr0, val; + u64 intercept; + + if (info->intercept == x86_intercept_cr_write) + icpt_info.exit_code += info->modrm_reg; + + if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0) + break; + + intercept = svm->nested.intercept; + + if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))) + break; + + cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK; + val = info->src_val & ~SVM_CR0_SELECTIVE_MASK; + + if (info->intercept == x86_intercept_lmsw) { + cr0 &= 0xfUL; + val &= 0xfUL; + /* lmsw can't clear PE - catch this here */ + if (cr0 & X86_CR0_PE) + val |= X86_CR0_PE; + } + + if (cr0 ^ val) + icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE; + + break; + } + case SVM_EXIT_READ_DR0: + case SVM_EXIT_WRITE_DR0: + icpt_info.exit_code += info->modrm_reg; + break; + case SVM_EXIT_MSR: + if (info->intercept == x86_intercept_wrmsr) + vmcb->control.exit_info_1 = 1; + else + vmcb->control.exit_info_1 = 0; + break; + case SVM_EXIT_PAUSE: + /* + * We get this for NOP only, but pause + * is rep not, check this here + */ + if (info->rep_prefix != REPE_PREFIX) + goto out; + case SVM_EXIT_IOIO: { + u64 exit_info; + u32 bytes; + + exit_info = (vcpu->arch.regs[VCPU_REGS_RDX] & 0xffff) << 16; + + if (info->intercept == x86_intercept_in || + info->intercept == x86_intercept_ins) { + exit_info |= SVM_IOIO_TYPE_MASK; + bytes = info->src_bytes; + } else { + bytes = info->dst_bytes; + } + + if (info->intercept == x86_intercept_outs || + info->intercept == x86_intercept_ins) + exit_info |= SVM_IOIO_STR_MASK; + + if (info->rep_prefix) + exit_info |= SVM_IOIO_REP_MASK; + + bytes = min(bytes, 4u); + + exit_info |= bytes << SVM_IOIO_SIZE_SHIFT; + + exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1); + + vmcb->control.exit_info_1 = exit_info; + vmcb->control.exit_info_2 = info->next_rip; + + break; + } + default: + break; + } + + vmcb->control.next_rip = info->next_rip; + vmcb->control.exit_code = icpt_info.exit_code; + vmexit = nested_svm_exit_handled(svm); + + ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED + : X86EMUL_CONTINUE; + +out: + return ret; +} + static struct kvm_x86_ops svm_x86_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -3952,10 +4231,14 @@ static struct kvm_x86_ops svm_x86_ops = { .has_wbinvd_exit = svm_has_wbinvd_exit, + .set_tsc_khz = svm_set_tsc_khz, .write_tsc_offset = svm_write_tsc_offset, .adjust_tsc_offset = svm_adjust_tsc_offset, + .compute_tsc_offset = svm_compute_tsc_offset, .set_tdp_cr3 = set_tdp_cr3, + + .check_intercept = svm_check_intercept, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5b4cdcbd154..4c3fa0f6746 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -128,8 +128,11 @@ struct vcpu_vmx { unsigned long host_rsp; int launched; u8 fail; + u8 cpl; + bool nmi_known_unmasked; u32 exit_intr_info; u32 idt_vectoring_info; + ulong rflags; struct shared_msr_entry *guest_msrs; int nmsrs; int save_nmsrs; @@ -159,6 +162,10 @@ struct vcpu_vmx { u32 ar; } tr, es, ds, fs, gs; } rmode; + struct { + u32 bitmask; /* 4 bits per segment (1 bit per field) */ + struct kvm_save_segment seg[8]; + } segment_cache; int vpid; bool emulation_required; @@ -171,6 +178,15 @@ struct vcpu_vmx { bool rdtscp_enabled; }; +enum segment_cache_field { + SEG_FIELD_SEL = 0, + SEG_FIELD_BASE = 1, + SEG_FIELD_LIMIT = 2, + SEG_FIELD_AR = 3, + + SEG_FIELD_NR = 4 +}; + static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) { return container_of(vcpu, struct vcpu_vmx, vcpu); @@ -643,6 +659,62 @@ static void vmcs_set_bits(unsigned long field, u32 mask) vmcs_writel(field, vmcs_readl(field) | mask); } +static void vmx_segment_cache_clear(struct vcpu_vmx *vmx) +{ + vmx->segment_cache.bitmask = 0; +} + +static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg, + unsigned field) +{ + bool ret; + u32 mask = 1 << (seg * SEG_FIELD_NR + field); + + if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) { + vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS); + vmx->segment_cache.bitmask = 0; + } + ret = vmx->segment_cache.bitmask & mask; + vmx->segment_cache.bitmask |= mask; + return ret; +} + +static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg) +{ + u16 *p = &vmx->segment_cache.seg[seg].selector; + + if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL)) + *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector); + return *p; +} + +static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg) +{ + ulong *p = &vmx->segment_cache.seg[seg].base; + + if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE)) + *p = vmcs_readl(kvm_vmx_segment_fields[seg].base); + return *p; +} + +static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg) +{ + u32 *p = &vmx->segment_cache.seg[seg].limit; + + if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT)) + *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit); + return *p; +} + +static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg) +{ + u32 *p = &vmx->segment_cache.seg[seg].ar; + + if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR)) + *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes); + return *p; +} + static void update_exception_bitmap(struct kvm_vcpu *vcpu) { u32 eb; @@ -970,17 +1042,24 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) { unsigned long rflags, save_rflags; - rflags = vmcs_readl(GUEST_RFLAGS); - if (to_vmx(vcpu)->rmode.vm86_active) { - rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; - save_rflags = to_vmx(vcpu)->rmode.save_rflags; - rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; + if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) { + __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); + rflags = vmcs_readl(GUEST_RFLAGS); + if (to_vmx(vcpu)->rmode.vm86_active) { + rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; + save_rflags = to_vmx(vcpu)->rmode.save_rflags; + rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; + } + to_vmx(vcpu)->rflags = rflags; } - return rflags; + return to_vmx(vcpu)->rflags; } static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { + __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); + __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); + to_vmx(vcpu)->rflags = rflags; if (to_vmx(vcpu)->rmode.vm86_active) { to_vmx(vcpu)->rmode.save_rflags = rflags; rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; @@ -1053,7 +1132,10 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, } if (vmx->rmode.vm86_active) { - if (kvm_inject_realmode_interrupt(vcpu, nr) != EMULATE_DONE) + int inc_eip = 0; + if (kvm_exception_is_soft(nr)) + inc_eip = vcpu->arch.event_exit_inst_len; + if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE) kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } @@ -1151,6 +1233,16 @@ static u64 guest_read_tsc(void) } /* + * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ + * ioctl. In this case the call-back should update internal vmx state to make + * the changes effective. + */ +static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) +{ + /* Nothing to do here */ +} + +/* * writes 'offset' into guest's timestamp counter offset register */ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) @@ -1164,6 +1256,11 @@ static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) vmcs_write64(TSC_OFFSET, offset + adjustment); } +static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) +{ + return target_tsc - native_read_tsc(); +} + /* * Reads an msr value (of 'msr_index') into 'pdata'. * Returns 0 on success, non-0 otherwise. @@ -1243,9 +1340,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) break; #ifdef CONFIG_X86_64 case MSR_FS_BASE: + vmx_segment_cache_clear(vmx); vmcs_writel(GUEST_FS_BASE, data); break; case MSR_GS_BASE: + vmx_segment_cache_clear(vmx); vmcs_writel(GUEST_GS_BASE, data); break; case MSR_KERNEL_GS_BASE: @@ -1689,6 +1788,8 @@ static void enter_pmode(struct kvm_vcpu *vcpu) vmx->emulation_required = 1; vmx->rmode.vm86_active = 0; + vmx_segment_cache_clear(vmx); + vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector); vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base); vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit); @@ -1712,6 +1813,8 @@ static void enter_pmode(struct kvm_vcpu *vcpu) fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs); fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs); + vmx_segment_cache_clear(vmx); + vmcs_write16(GUEST_SS_SELECTOR, 0); vmcs_write32(GUEST_SS_AR_BYTES, 0x93); @@ -1775,6 +1878,8 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); } + vmx_segment_cache_clear(vmx); + vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR); vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); @@ -1851,6 +1956,8 @@ static void enter_lmode(struct kvm_vcpu *vcpu) { u32 guest_tr_ar; + vmx_segment_cache_clear(to_vmx(vcpu)); + guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { printk(KERN_DEBUG "%s: tss fixup for long mode. \n", @@ -1998,6 +2105,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, hw_cr0); vcpu->arch.cr0 = cr0; + __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); } static u64 construct_eptp(unsigned long root_hpa) @@ -2053,7 +2161,6 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; struct kvm_save_segment *save; u32 ar; @@ -2075,13 +2182,13 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, var->limit = save->limit; ar = save->ar; if (seg == VCPU_SREG_TR - || var->selector == vmcs_read16(sf->selector)) + || var->selector == vmx_read_guest_seg_selector(vmx, seg)) goto use_saved_rmode_seg; } - var->base = vmcs_readl(sf->base); - var->limit = vmcs_read32(sf->limit); - var->selector = vmcs_read16(sf->selector); - ar = vmcs_read32(sf->ar_bytes); + var->base = vmx_read_guest_seg_base(vmx, seg); + var->limit = vmx_read_guest_seg_limit(vmx, seg); + var->selector = vmx_read_guest_seg_selector(vmx, seg); + ar = vmx_read_guest_seg_ar(vmx, seg); use_saved_rmode_seg: if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) ar = 0; @@ -2098,27 +2205,37 @@ use_saved_rmode_seg: static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) { - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; struct kvm_segment s; if (to_vmx(vcpu)->rmode.vm86_active) { vmx_get_segment(vcpu, &s, seg); return s.base; } - return vmcs_readl(sf->base); + return vmx_read_guest_seg_base(to_vmx(vcpu), seg); } -static int vmx_get_cpl(struct kvm_vcpu *vcpu) +static int __vmx_get_cpl(struct kvm_vcpu *vcpu) { if (!is_protmode(vcpu)) return 0; - if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ + if (!is_long_mode(vcpu) + && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */ return 3; - return vmcs_read16(GUEST_CS_SELECTOR) & 3; + return vmx_read_guest_seg_selector(to_vmx(vcpu), VCPU_SREG_CS) & 3; } +static int vmx_get_cpl(struct kvm_vcpu *vcpu) +{ + if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) { + __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); + to_vmx(vcpu)->cpl = __vmx_get_cpl(vcpu); + } + return to_vmx(vcpu)->cpl; +} + + static u32 vmx_segment_access_rights(struct kvm_segment *var) { u32 ar; @@ -2148,6 +2265,8 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; u32 ar; + vmx_segment_cache_clear(vmx); + if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { vmcs_write16(sf->selector, var->selector); vmx->rmode.tr.selector = var->selector; @@ -2184,11 +2303,12 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, ar |= 0x1; /* Accessed */ vmcs_write32(sf->ar_bytes, ar); + __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); } static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) { - u32 ar = vmcs_read32(GUEST_CS_AR_BYTES); + u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS); *db = (ar >> 14) & 1; *l = (ar >> 13) & 1; @@ -2775,6 +2895,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) if (ret != 0) goto out; + vmx_segment_cache_clear(vmx); + seg_setup(VCPU_SREG_CS); /* * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode @@ -2904,7 +3026,10 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) ++vcpu->stat.irq_injections; if (vmx->rmode.vm86_active) { - if (kvm_inject_realmode_interrupt(vcpu, irq) != EMULATE_DONE) + int inc_eip = 0; + if (vcpu->arch.interrupt.soft) + inc_eip = vcpu->arch.event_exit_inst_len; + if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE) kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } @@ -2937,8 +3062,9 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) } ++vcpu->stat.nmi_injections; + vmx->nmi_known_unmasked = false; if (vmx->rmode.vm86_active) { - if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR) != EMULATE_DONE) + if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE) kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } @@ -2961,6 +3087,8 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) { if (!cpu_has_virtual_nmis()) return to_vmx(vcpu)->soft_vnmi_blocked; + if (to_vmx(vcpu)->nmi_known_unmasked) + return false; return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; } @@ -2974,6 +3102,7 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) vmx->vnmi_blocked_time = 0; } } else { + vmx->nmi_known_unmasked = !masked; if (masked) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); @@ -3091,7 +3220,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) enum emulation_result er; vect_info = vmx->idt_vectoring_info; - intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + intr_info = vmx->exit_intr_info; if (is_machine_check(intr_info)) return handle_machine_check(vcpu); @@ -3122,7 +3251,6 @@ static int handle_exception(struct kvm_vcpu *vcpu) } error_code = 0; - rip = kvm_rip_read(vcpu); if (intr_info & INTR_INFO_DELIVER_CODE_MASK) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); if (is_page_fault(intr_info)) { @@ -3169,6 +3297,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) vmx->vcpu.arch.event_exit_inst_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); kvm_run->exit_reason = KVM_EXIT_DEBUG; + rip = kvm_rip_read(vcpu); kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; kvm_run->debug.arch.exception = ex_no; break; @@ -3505,9 +3634,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) switch (type) { case INTR_TYPE_NMI_INTR: vcpu->arch.nmi_injected = false; - if (cpu_has_virtual_nmis()) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); + vmx_set_nmi_mask(vcpu, true); break; case INTR_TYPE_EXT_INTR: case INTR_TYPE_SOFT_INTR: @@ -3867,12 +3994,17 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) { - u32 exit_intr_info = vmx->exit_intr_info; + u32 exit_intr_info; + + if (!(vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY + || vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)) + return; + + vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + exit_intr_info = vmx->exit_intr_info; /* Handle machine checks before interrupts are enabled */ - if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) - || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI - && is_machine_check(exit_intr_info))) + if (is_machine_check(exit_intr_info)) kvm_machine_check(); /* We need to handle NMIs before interrupts are enabled */ @@ -3886,7 +4018,7 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { - u32 exit_intr_info = vmx->exit_intr_info; + u32 exit_intr_info; bool unblock_nmi; u8 vector; bool idtv_info_valid; @@ -3894,6 +4026,13 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; if (cpu_has_virtual_nmis()) { + if (vmx->nmi_known_unmasked) + return; + /* + * Can't use vmx->exit_intr_info since we're not sure what + * the exit reason is. + */ + exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; vector = exit_intr_info & INTR_INFO_VECTOR_MASK; /* @@ -3910,6 +4049,10 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) vector != DF_VECTOR && !idtv_info_valid) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); + else + vmx->nmi_known_unmasked = + !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) + & GUEST_INTR_STATE_NMI); } else if (unlikely(vmx->soft_vnmi_blocked)) vmx->vnmi_blocked_time += ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); @@ -3946,8 +4089,7 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, * Clear bit "block by NMI" before VM entry if a NMI * delivery faulted. */ - vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); + vmx_set_nmi_mask(&vmx->vcpu, false); break; case INTR_TYPE_SOFT_EXCEPTION: vmx->vcpu.arch.event_exit_inst_len = @@ -4124,7 +4266,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) ); vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) + | (1 << VCPU_EXREG_RFLAGS) + | (1 << VCPU_EXREG_CPL) | (1 << VCPU_EXREG_PDPTR) + | (1 << VCPU_EXREG_SEGMENTS) | (1 << VCPU_EXREG_CR3)); vcpu->arch.regs_dirty = 0; @@ -4134,7 +4279,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->launched = 1; vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); - vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); vmx_complete_atomic_exit(vmx); vmx_recover_nmi_blocking(vmx); @@ -4195,8 +4339,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto free_vcpu; vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); + err = -ENOMEM; if (!vmx->guest_msrs) { - err = -ENOMEM; goto uninit_vcpu; } @@ -4215,7 +4359,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (err) goto free_vmcs; if (vm_need_virtualize_apic_accesses(kvm)) - if (alloc_apic_access_page(kvm) != 0) + err = alloc_apic_access_page(kvm); + if (err) goto free_vmcs; if (enable_ept) { @@ -4368,6 +4513,13 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) { } +static int vmx_check_intercept(struct kvm_vcpu *vcpu, + struct x86_instruction_info *info, + enum x86_intercept_stage stage) +{ + return X86EMUL_CONTINUE; +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -4449,10 +4601,14 @@ static struct kvm_x86_ops vmx_x86_ops = { .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, + .set_tsc_khz = vmx_set_tsc_khz, .write_tsc_offset = vmx_write_tsc_offset, .adjust_tsc_offset = vmx_adjust_tsc_offset, + .compute_tsc_offset = vmx_compute_tsc_offset, .set_tdp_cr3 = vmx_set_cr3, + + .check_intercept = vmx_check_intercept, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 934b4c6b0bf..77c9d8673dc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -60,22 +60,12 @@ #include <asm/div64.h> #define MAX_IO_MSRS 256 -#define CR0_RESERVED_BITS \ - (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ - | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ - | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) -#define CR4_RESERVED_BITS \ - (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ - | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ - | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXSAVE \ - | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) - -#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) - #define KVM_MAX_MCE_BANKS 32 #define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P) +#define emul_to_vcpu(ctxt) \ + container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt) + /* EFER defaults: * - enable syscall per default because its emulated by KVM * - enable LME and LMA per default on 64 bit KVM @@ -100,6 +90,11 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops); int ignore_msrs = 0; module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); +bool kvm_has_tsc_control; +EXPORT_SYMBOL_GPL(kvm_has_tsc_control); +u32 kvm_max_guest_tsc_khz; +EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); + #define KVM_NR_SHARED_MSRS 16 struct kvm_shared_msrs_global { @@ -157,6 +152,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { u64 __read_mostly host_xcr0; +int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); + static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) { int i; @@ -361,8 +358,8 @@ void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) void kvm_inject_nmi(struct kvm_vcpu *vcpu) { - kvm_make_request(KVM_REQ_NMI, vcpu); kvm_make_request(KVM_REQ_EVENT, vcpu); + vcpu->arch.nmi_pending = 1; } EXPORT_SYMBOL_GPL(kvm_inject_nmi); @@ -982,7 +979,15 @@ static inline int kvm_tsc_changes_freq(void) return ret; } -static inline u64 nsec_to_cycles(u64 nsec) +static u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.virtual_tsc_khz) + return vcpu->arch.virtual_tsc_khz; + else + return __this_cpu_read(cpu_tsc_khz); +} + +static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) { u64 ret; @@ -990,25 +995,24 @@ static inline u64 nsec_to_cycles(u64 nsec) if (kvm_tsc_changes_freq()) printk_once(KERN_WARNING "kvm: unreliable cycle conversion on adjustable rate TSC\n"); - ret = nsec * __this_cpu_read(cpu_tsc_khz); + ret = nsec * vcpu_tsc_khz(vcpu); do_div(ret, USEC_PER_SEC); return ret; } -static void kvm_arch_set_tsc_khz(struct kvm *kvm, u32 this_tsc_khz) +static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz) { /* Compute a scale to convert nanoseconds in TSC cycles */ kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, - &kvm->arch.virtual_tsc_shift, - &kvm->arch.virtual_tsc_mult); - kvm->arch.virtual_tsc_khz = this_tsc_khz; + &vcpu->arch.tsc_catchup_shift, + &vcpu->arch.tsc_catchup_mult); } static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) { u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec, - vcpu->kvm->arch.virtual_tsc_mult, - vcpu->kvm->arch.virtual_tsc_shift); + vcpu->arch.tsc_catchup_mult, + vcpu->arch.tsc_catchup_shift); tsc += vcpu->arch.last_tsc_write; return tsc; } @@ -1021,7 +1025,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) s64 sdiff; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); - offset = data - native_read_tsc(); + offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); ns = get_kernel_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; sdiff = data - kvm->arch.last_tsc_write; @@ -1037,13 +1041,13 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) * In that case, for a reliable TSC, we can match TSC offsets, * or make a best guest using elapsed value. */ - if (sdiff < nsec_to_cycles(5ULL * NSEC_PER_SEC) && + if (sdiff < nsec_to_cycles(vcpu, 5ULL * NSEC_PER_SEC) && elapsed < 5ULL * NSEC_PER_SEC) { if (!check_tsc_unstable()) { offset = kvm->arch.last_tsc_offset; pr_debug("kvm: matched tsc offset for %llu\n", data); } else { - u64 delta = nsec_to_cycles(elapsed); + u64 delta = nsec_to_cycles(vcpu, elapsed); offset += delta; pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } @@ -1075,8 +1079,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) local_irq_save(flags); kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp); kernel_ns = get_kernel_ns(); - this_tsc_khz = __this_cpu_read(cpu_tsc_khz); - + this_tsc_khz = vcpu_tsc_khz(v); if (unlikely(this_tsc_khz == 0)) { local_irq_restore(flags); kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); @@ -1993,6 +1996,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_X86_ROBUST_SINGLESTEP: case KVM_CAP_XSAVE: case KVM_CAP_ASYNC_PF: + case KVM_CAP_GET_TSC_KHZ: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -2019,6 +2023,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_XCRS: r = cpu_has_xsave; break; + case KVM_CAP_TSC_CONTROL: + r = kvm_has_tsc_control; + break; default: r = 0; break; @@ -2120,8 +2127,13 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_x86_ops->vcpu_load(vcpu, cpu); if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { /* Make sure TSC doesn't go backwards */ - s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : - native_read_tsc() - vcpu->arch.last_host_tsc; + s64 tsc_delta; + u64 tsc; + + kvm_get_msr(vcpu, MSR_IA32_TSC, &tsc); + tsc_delta = !vcpu->arch.last_guest_tsc ? 0 : + tsc - vcpu->arch.last_guest_tsc; + if (tsc_delta < 0) mark_tsc_unstable("KVM discovered backwards TSC"); if (check_tsc_unstable()) { @@ -2139,7 +2151,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); - vcpu->arch.last_host_tsc = native_read_tsc(); + kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); } static int is_efer_nx(void) @@ -2324,6 +2336,12 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); + /* cpuid 0xC0000001.edx */ + const u32 kvm_supported_word5_x86_features = + F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | + F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | + F(PMM) | F(PMM_EN); + /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); do_cpuid_1_ent(entry, function, index); @@ -2418,6 +2436,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) | (1 << KVM_FEATURE_NOP_IO_DELAY) | (1 << KVM_FEATURE_CLOCKSOURCE2) | + (1 << KVM_FEATURE_ASYNC_PF) | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); entry->ebx = 0; entry->ecx = 0; @@ -2432,6 +2451,20 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->ecx &= kvm_supported_word6_x86_features; cpuid_mask(&entry->ecx, 6); break; + /*Add support for Centaur's CPUID instruction*/ + case 0xC0000000: + /*Just support up to 0xC0000004 now*/ + entry->eax = min(entry->eax, 0xC0000004); + break; + case 0xC0000001: + entry->edx &= kvm_supported_word5_x86_features; + cpuid_mask(&entry->edx, 5); + break; + case 0xC0000002: + case 0xC0000003: + case 0xC0000004: + /*Now nothing to do, reserved for the future*/ + break; } kvm_x86_ops->set_supported_cpuid(function, entry); @@ -2478,6 +2511,26 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, if (nent >= cpuid->nent) goto out_free; + /* Add support for Centaur's CPUID instruction. */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR) { + do_cpuid_ent(&cpuid_entries[nent], 0xC0000000, 0, + &nent, cpuid->nent); + + r = -E2BIG; + if (nent >= cpuid->nent) + goto out_free; + + limit = cpuid_entries[nent - 1].eax; + for (func = 0xC0000001; + func <= limit && nent < cpuid->nent; ++func) + do_cpuid_ent(&cpuid_entries[nent], func, 0, + &nent, cpuid->nent); + + r = -E2BIG; + if (nent >= cpuid->nent) + goto out_free; + } + do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent, cpuid->nent); @@ -3046,6 +3099,32 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); break; } + case KVM_SET_TSC_KHZ: { + u32 user_tsc_khz; + + r = -EINVAL; + if (!kvm_has_tsc_control) + break; + + user_tsc_khz = (u32)arg; + + if (user_tsc_khz >= kvm_max_guest_tsc_khz) + goto out; + + kvm_x86_ops->set_tsc_khz(vcpu, user_tsc_khz); + + r = 0; + goto out; + } + case KVM_GET_TSC_KHZ: { + r = -EIO; + if (check_tsc_unstable()) + goto out; + + r = vcpu_tsc_khz(vcpu); + + goto out; + } default: r = -EINVAL; } @@ -3595,20 +3674,43 @@ static void kvm_init_msr_list(void) static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, const void *v) { - if (vcpu->arch.apic && - !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) - return 0; + int handled = 0; + int n; - return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); + do { + n = min(len, 8); + if (!(vcpu->arch.apic && + !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, n, v)) + && kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, n, v)) + break; + handled += n; + addr += n; + len -= n; + v += n; + } while (len); + + return handled; } static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) { - if (vcpu->arch.apic && - !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) - return 0; + int handled = 0; + int n; + + do { + n = min(len, 8); + if (!(vcpu->arch.apic && + !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, n, v)) + && kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, n, v)) + break; + trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v); + handled += n; + addr += n; + len -= n; + v += n; + } while (len); - return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); + return handled; } static void kvm_set_segment(struct kvm_vcpu *vcpu, @@ -3703,37 +3805,43 @@ out: } /* used for instruction fetching */ -static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, +static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, + gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access | PFERR_FETCH_MASK, exception); } -static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, +static int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, + gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception); } -static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, +static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt, + gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); } -static int kvm_write_guest_virt_system(gva_t addr, void *val, +static int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, + gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, struct x86_exception *exception) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); void *data = val; int r = X86EMUL_CONTINUE; @@ -3761,13 +3869,15 @@ out: return r; } -static int emulator_read_emulated(unsigned long addr, +static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, + unsigned long addr, void *val, unsigned int bytes, - struct x86_exception *exception, - struct kvm_vcpu *vcpu) + struct x86_exception *exception) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); gpa_t gpa; + int handled; if (vcpu->mmio_read_completed) { memcpy(val, vcpu->mmio_data, bytes); @@ -3786,7 +3896,7 @@ static int emulator_read_emulated(unsigned long addr, if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto mmio; - if (kvm_read_guest_virt(addr, val, bytes, vcpu, exception) + if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception) == X86EMUL_CONTINUE) return X86EMUL_CONTINUE; @@ -3794,18 +3904,24 @@ mmio: /* * Is this MMIO handled locally? */ - if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) { - trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val); + handled = vcpu_mmio_read(vcpu, gpa, bytes, val); + + if (handled == bytes) return X86EMUL_CONTINUE; - } + + gpa += handled; + bytes -= handled; + val += handled; trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); vcpu->mmio_needed = 1; vcpu->run->exit_reason = KVM_EXIT_MMIO; vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; - vcpu->run->mmio.len = vcpu->mmio_size = bytes; + vcpu->mmio_size = bytes; + vcpu->run->mmio.len = min(vcpu->mmio_size, 8); vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0; + vcpu->mmio_index = 0; return X86EMUL_IO_NEEDED; } @@ -3829,6 +3945,7 @@ static int emulator_write_emulated_onepage(unsigned long addr, struct kvm_vcpu *vcpu) { gpa_t gpa; + int handled; gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); @@ -3847,25 +3964,35 @@ mmio: /* * Is this MMIO handled locally? */ - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) + handled = vcpu_mmio_write(vcpu, gpa, bytes, val); + if (handled == bytes) return X86EMUL_CONTINUE; + gpa += handled; + bytes -= handled; + val += handled; + vcpu->mmio_needed = 1; + memcpy(vcpu->mmio_data, val, bytes); vcpu->run->exit_reason = KVM_EXIT_MMIO; vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; - vcpu->run->mmio.len = vcpu->mmio_size = bytes; + vcpu->mmio_size = bytes; + vcpu->run->mmio.len = min(vcpu->mmio_size, 8); vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; - memcpy(vcpu->run->mmio.data, val, bytes); + memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8); + vcpu->mmio_index = 0; return X86EMUL_CONTINUE; } -int emulator_write_emulated(unsigned long addr, +int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, + unsigned long addr, const void *val, unsigned int bytes, - struct x86_exception *exception, - struct kvm_vcpu *vcpu) + struct x86_exception *exception) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + /* Crossing a page boundary? */ if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { int rc, now; @@ -3893,13 +4020,14 @@ int emulator_write_emulated(unsigned long addr, (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old)) #endif -static int emulator_cmpxchg_emulated(unsigned long addr, +static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, + unsigned long addr, const void *old, const void *new, unsigned int bytes, - struct x86_exception *exception, - struct kvm_vcpu *vcpu) + struct x86_exception *exception) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); gpa_t gpa; struct page *page; char *kaddr; @@ -3955,7 +4083,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, emul_write: printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); - return emulator_write_emulated(addr, new, bytes, exception, vcpu); + return emulator_write_emulated(ctxt, addr, new, bytes, exception); } static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) @@ -3974,9 +4102,12 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) } -static int emulator_pio_in_emulated(int size, unsigned short port, void *val, - unsigned int count, struct kvm_vcpu *vcpu) +static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, void *val, + unsigned int count) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + if (vcpu->arch.pio.count) goto data_avail; @@ -4004,10 +4135,12 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val, return 0; } -static int emulator_pio_out_emulated(int size, unsigned short port, - const void *val, unsigned int count, - struct kvm_vcpu *vcpu) +static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, + const void *val, unsigned int count) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + trace_kvm_pio(1, port, size, count); vcpu->arch.pio.port = port; @@ -4037,10 +4170,9 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) return kvm_x86_ops->get_segment_base(vcpu, seg); } -int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) +static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address) { - kvm_mmu_invlpg(vcpu, address); - return X86EMUL_CONTINUE; + kvm_mmu_invlpg(emul_to_vcpu(ctxt), address); } int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) @@ -4062,22 +4194,20 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); -int emulate_clts(struct kvm_vcpu *vcpu) +static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt) { - kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); - kvm_x86_ops->fpu_activate(vcpu); - return X86EMUL_CONTINUE; + kvm_emulate_wbinvd(emul_to_vcpu(ctxt)); } -int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu) +int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) { - return _kvm_get_dr(vcpu, dr, dest); + return _kvm_get_dr(emul_to_vcpu(ctxt), dr, dest); } -int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu) +int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) { - return __kvm_set_dr(vcpu, dr, value); + return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value); } static u64 mk_cr_64(u64 curr_cr, u32 new_val) @@ -4085,8 +4215,9 @@ static u64 mk_cr_64(u64 curr_cr, u32 new_val) return (curr_cr & ~((1ULL << 32) - 1)) | new_val; } -static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu) +static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); unsigned long value; switch (cr) { @@ -4113,8 +4244,9 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu) return value; } -static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) +static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); int res = 0; switch (cr) { @@ -4141,33 +4273,45 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) return res; } -static int emulator_get_cpl(struct kvm_vcpu *vcpu) +static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) +{ + return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt)); +} + +static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) +{ + kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt); +} + +static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - return kvm_x86_ops->get_cpl(vcpu); + kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt); } -static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) +static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops->get_gdt(vcpu, dt); + kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt); } -static void emulator_get_idt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) +static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops->get_idt(vcpu, dt); + kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt); } -static unsigned long emulator_get_cached_segment_base(int seg, - struct kvm_vcpu *vcpu) +static unsigned long emulator_get_cached_segment_base( + struct x86_emulate_ctxt *ctxt, int seg) { - return get_segment_base(vcpu, seg); + return get_segment_base(emul_to_vcpu(ctxt), seg); } -static bool emulator_get_cached_descriptor(struct desc_struct *desc, u32 *base3, - int seg, struct kvm_vcpu *vcpu) +static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector, + struct desc_struct *desc, u32 *base3, + int seg) { struct kvm_segment var; - kvm_get_segment(vcpu, &var, seg); + kvm_get_segment(emul_to_vcpu(ctxt), &var, seg); + *selector = var.selector; if (var.unusable) return false; @@ -4192,14 +4336,14 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, u32 *base3, return true; } -static void emulator_set_cached_descriptor(struct desc_struct *desc, u32 base3, - int seg, struct kvm_vcpu *vcpu) +static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector, + struct desc_struct *desc, u32 base3, + int seg) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); struct kvm_segment var; - /* needed to preserve selector */ - kvm_get_segment(vcpu, &var, seg); - + var.selector = selector; var.base = get_desc_base(desc); #ifdef CONFIG_X86_64 var.base |= ((u64)base3) << 32; @@ -4223,22 +4367,44 @@ static void emulator_set_cached_descriptor(struct desc_struct *desc, u32 base3, return; } -static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu) +static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, + u32 msr_index, u64 *pdata) { - struct kvm_segment kvm_seg; + return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); +} - kvm_get_segment(vcpu, &kvm_seg, seg); - return kvm_seg.selector; +static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, + u32 msr_index, u64 data) +{ + return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); } -static void emulator_set_segment_selector(u16 sel, int seg, - struct kvm_vcpu *vcpu) +static void emulator_halt(struct x86_emulate_ctxt *ctxt) { - struct kvm_segment kvm_seg; + emul_to_vcpu(ctxt)->arch.halt_request = 1; +} - kvm_get_segment(vcpu, &kvm_seg, seg); - kvm_seg.selector = sel; - kvm_set_segment(vcpu, &kvm_seg, seg); +static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt) +{ + preempt_disable(); + kvm_load_guest_fpu(emul_to_vcpu(ctxt)); + /* + * CR0.TS may reference the host fpu state, not the guest fpu state, + * so it may be clear at this point. + */ + clts(); +} + +static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt) +{ + preempt_enable(); +} + +static int emulator_intercept(struct x86_emulate_ctxt *ctxt, + struct x86_instruction_info *info, + enum x86_intercept_stage stage) +{ + return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage); } static struct x86_emulate_ops emulate_ops = { @@ -4248,22 +4414,29 @@ static struct x86_emulate_ops emulate_ops = { .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, .cmpxchg_emulated = emulator_cmpxchg_emulated, + .invlpg = emulator_invlpg, .pio_in_emulated = emulator_pio_in_emulated, .pio_out_emulated = emulator_pio_out_emulated, - .get_cached_descriptor = emulator_get_cached_descriptor, - .set_cached_descriptor = emulator_set_cached_descriptor, - .get_segment_selector = emulator_get_segment_selector, - .set_segment_selector = emulator_set_segment_selector, + .get_segment = emulator_get_segment, + .set_segment = emulator_set_segment, .get_cached_segment_base = emulator_get_cached_segment_base, .get_gdt = emulator_get_gdt, .get_idt = emulator_get_idt, + .set_gdt = emulator_set_gdt, + .set_idt = emulator_set_idt, .get_cr = emulator_get_cr, .set_cr = emulator_set_cr, .cpl = emulator_get_cpl, .get_dr = emulator_get_dr, .set_dr = emulator_set_dr, - .set_msr = kvm_set_msr, - .get_msr = kvm_get_msr, + .set_msr = emulator_set_msr, + .get_msr = emulator_get_msr, + .halt = emulator_halt, + .wbinvd = emulator_wbinvd, + .fix_hypercall = emulator_fix_hypercall, + .get_fpu = emulator_get_fpu, + .put_fpu = emulator_put_fpu, + .intercept = emulator_intercept, }; static void cache_all_regs(struct kvm_vcpu *vcpu) @@ -4305,12 +4478,17 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; int cs_db, cs_l; + /* + * TODO: fix emulate.c to use guest_read/write_register + * instead of direct ->regs accesses, can save hundred cycles + * on Intel for instructions that don't read/change RSP, for + * for example. + */ cache_all_regs(vcpu); kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - vcpu->arch.emulate_ctxt.vcpu = vcpu; - vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); + vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); vcpu->arch.emulate_ctxt.mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : @@ -4318,11 +4496,13 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) ? X86EMUL_MODE_VM86 : cs_l ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + vcpu->arch.emulate_ctxt.guest_mode = is_guest_mode(vcpu); memset(c, 0, sizeof(struct decode_cache)); memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + vcpu->arch.emulate_regs_need_sync_from_vcpu = false; } -int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq) +int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) { struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; int ret; @@ -4331,7 +4511,8 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq) vcpu->arch.emulate_ctxt.decode.op_bytes = 2; vcpu->arch.emulate_ctxt.decode.ad_bytes = 2; - vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip; + vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip + + inc_eip; ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq); if (ret != X86EMUL_CONTINUE) @@ -4340,7 +4521,7 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq) vcpu->arch.emulate_ctxt.eip = c->eip; memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); - kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); if (irq == NMI_VECTOR) vcpu->arch.nmi_pending = false; @@ -4402,16 +4583,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, { int r; struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + bool writeback = true; kvm_clear_exception_queue(vcpu); - vcpu->arch.mmio_fault_cr2 = cr2; - /* - * TODO: fix emulate.c to use guest_read/write_register - * instead of direct ->regs accesses, can save hundred cycles - * on Intel for instructions that don't read/change RSP, for - * for example. - */ - cache_all_regs(vcpu); if (!(emulation_type & EMULTYPE_NO_DECODE)) { init_emulate_ctxt(vcpu); @@ -4442,13 +4616,19 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, return EMULATE_DONE; } - /* this is needed for vmware backdor interface to work since it + /* this is needed for vmware backdoor interface to work since it changes registers values during IO operation */ - memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + if (vcpu->arch.emulate_regs_need_sync_from_vcpu) { + vcpu->arch.emulate_regs_need_sync_from_vcpu = false; + memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + } restart: r = x86_emulate_insn(&vcpu->arch.emulate_ctxt); + if (r == EMULATION_INTERCEPTED) + return EMULATE_DONE; + if (r == EMULATION_FAILED) { if (reexecute_instruction(vcpu, cr2)) return EMULATE_DONE; @@ -4462,21 +4642,28 @@ restart: } else if (vcpu->arch.pio.count) { if (!vcpu->arch.pio.in) vcpu->arch.pio.count = 0; + else + writeback = false; r = EMULATE_DO_MMIO; } else if (vcpu->mmio_needed) { - if (vcpu->mmio_is_write) - vcpu->mmio_needed = 0; + if (!vcpu->mmio_is_write) + writeback = false; r = EMULATE_DO_MMIO; } else if (r == EMULATION_RESTART) goto restart; else r = EMULATE_DONE; - toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); - kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); - kvm_make_request(KVM_REQ_EVENT, vcpu); - memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); - kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); + if (writeback) { + toggle_interruptibility(vcpu, + vcpu->arch.emulate_ctxt.interruptibility); + kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + kvm_make_request(KVM_REQ_EVENT, vcpu); + memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + vcpu->arch.emulate_regs_need_sync_to_vcpu = false; + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); + } else + vcpu->arch.emulate_regs_need_sync_to_vcpu = true; return r; } @@ -4485,7 +4672,8 @@ EXPORT_SYMBOL_GPL(x86_emulate_instruction); int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) { unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX); - int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu); + int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt, + size, port, &val, 1); /* do not return to emulator after return from userspace */ vcpu->arch.pio.count = 0; return ret; @@ -4879,8 +5067,9 @@ out: } EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); -int kvm_fix_hypercall(struct kvm_vcpu *vcpu) +int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); char instruction[3]; unsigned long rip = kvm_rip_read(vcpu); @@ -4893,21 +5082,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) kvm_x86_ops->patch_hypercall(vcpu, instruction); - return emulator_write_emulated(rip, instruction, 3, NULL, vcpu); -} - -void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) -{ - struct desc_ptr dt = { limit, base }; - - kvm_x86_ops->set_gdt(vcpu, &dt); -} - -void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) -{ - struct desc_ptr dt = { limit, base }; - - kvm_x86_ops->set_idt(vcpu, &dt); + return emulator_write_emulated(&vcpu->arch.emulate_ctxt, + rip, instruction, 3, NULL); } static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) @@ -5170,6 +5346,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) static int vcpu_enter_guest(struct kvm_vcpu *vcpu) { int r; + bool nmi_pending; bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && vcpu->run->request_interrupt_window; @@ -5207,19 +5384,25 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) r = 1; goto out; } - if (kvm_check_request(KVM_REQ_NMI, vcpu)) - vcpu->arch.nmi_pending = true; } r = kvm_mmu_reload(vcpu); if (unlikely(r)) goto out; + /* + * An NMI can be injected between local nmi_pending read and + * vcpu->arch.nmi_pending read inside inject_pending_event(). + * But in that case, KVM_REQ_EVENT will be set, which makes + * the race described above benign. + */ + nmi_pending = ACCESS_ONCE(vcpu->arch.nmi_pending); + if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { inject_pending_event(vcpu); /* enable NMI/IRQ window open exits if needed */ - if (vcpu->arch.nmi_pending) + if (nmi_pending) kvm_x86_ops->enable_nmi_window(vcpu); else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) kvm_x86_ops->enable_irq_window(vcpu); @@ -5399,6 +5582,41 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) return r; } +static int complete_mmio(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + int r; + + if (!(vcpu->arch.pio.count || vcpu->mmio_needed)) + return 1; + + if (vcpu->mmio_needed) { + vcpu->mmio_needed = 0; + if (!vcpu->mmio_is_write) + memcpy(vcpu->mmio_data + vcpu->mmio_index, + run->mmio.data, 8); + vcpu->mmio_index += 8; + if (vcpu->mmio_index < vcpu->mmio_size) { + run->exit_reason = KVM_EXIT_MMIO; + run->mmio.phys_addr = vcpu->mmio_phys_addr + vcpu->mmio_index; + memcpy(run->mmio.data, vcpu->mmio_data + vcpu->mmio_index, 8); + run->mmio.len = min(vcpu->mmio_size - vcpu->mmio_index, 8); + run->mmio.is_write = vcpu->mmio_is_write; + vcpu->mmio_needed = 1; + return 0; + } + if (vcpu->mmio_is_write) + return 1; + vcpu->mmio_read_completed = 1; + } + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + if (r != EMULATE_DONE) + return 0; + return 1; +} + int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; @@ -5425,20 +5643,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } } - if (vcpu->arch.pio.count || vcpu->mmio_needed) { - if (vcpu->mmio_needed) { - memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); - vcpu->mmio_read_completed = 1; - vcpu->mmio_needed = 0; - } - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - if (r != EMULATE_DONE) { - r = 0; - goto out; - } - } + r = complete_mmio(vcpu); + if (r <= 0) + goto out; + if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) kvm_register_write(vcpu, VCPU_REGS_RAX, kvm_run->hypercall.ret); @@ -5455,6 +5663,18 @@ out: int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { + if (vcpu->arch.emulate_regs_need_sync_to_vcpu) { + /* + * We are here if userspace calls get_regs() in the middle of + * instruction emulation. Registers state needs to be copied + * back from emulation context to vcpu. Usrapace shouldn't do + * that usually, but some bad designed PV devices (vmware + * backdoor interface) need this to work + */ + struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + vcpu->arch.emulate_regs_need_sync_to_vcpu = false; + } regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); @@ -5482,6 +5702,9 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { + vcpu->arch.emulate_regs_need_sync_from_vcpu = true; + vcpu->arch.emulate_regs_need_sync_to_vcpu = false; + kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); @@ -5592,7 +5815,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); - kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); kvm_make_request(KVM_REQ_EVENT, vcpu); return EMULATE_DONE; } @@ -5974,8 +6197,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) } vcpu->arch.pio_data = page_address(page); - if (!kvm->arch.virtual_tsc_khz) - kvm_arch_set_tsc_khz(kvm, max_tsc_khz); + kvm_init_tsc_catchup(vcpu, max_tsc_khz); r = kvm_mmu_create(vcpu); if (r < 0) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index c600da830ce..e407ed3df81 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -77,7 +77,7 @@ static inline u32 bit(int bitno) void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); -int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq); +int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data); diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c index 38e6d174c49..9f0614daea8 100644 --- a/arch/x86/mm/pf_in.c +++ b/arch/x86/mm/pf_in.c @@ -414,22 +414,17 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) unsigned char *p; struct prefix_bits prf; int i; - unsigned long rv; p = (unsigned char *)ins_addr; p += skip_prefix(p, &prf); p += get_opcode(p, &opcode); for (i = 0; i < ARRAY_SIZE(reg_rop); i++) - if (reg_rop[i] == opcode) { - rv = REG_READ; + if (reg_rop[i] == opcode) goto do_work; - } for (i = 0; i < ARRAY_SIZE(reg_wop); i++) - if (reg_wop[i] == opcode) { - rv = REG_WRITE; + if (reg_wop[i] == opcode) goto do_work; - } printk(KERN_ERR "mmiotrace: Not a register instruction, opcode " "0x%02x\n", opcode); @@ -474,16 +469,13 @@ unsigned long get_ins_imm_val(unsigned long ins_addr) unsigned char *p; struct prefix_bits prf; int i; - unsigned long rv; p = (unsigned char *)ins_addr; p += skip_prefix(p, &prf); p += get_opcode(p, &opcode); for (i = 0; i < ARRAY_SIZE(imm_wop); i++) - if (imm_wop[i] == opcode) { - rv = IMM_WRITE; + if (imm_wop[i] == opcode) goto do_work; - } printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode " "0x%02x\n", opcode); diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index bfd0632fe65..b480d4207a4 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -36,7 +36,7 @@ int __init pci_xen_swiotlb_detect(void) /* If running as PV guest, either iommu=soft, or swiotlb=force will * activate this IOMMU. If running as PV privileged, activate it - * irregardlesss. + * irregardless. */ if ((xen_initial_domain() || swiotlb || swiotlb_force) && (xen_pv_domain())) diff --git a/arch/xtensa/configs/s6105_defconfig b/arch/xtensa/configs/s6105_defconfig index 42b7feba71b..4891abbf16b 100644 --- a/arch/xtensa/configs/s6105_defconfig +++ b/arch/xtensa/configs/s6105_defconfig @@ -23,7 +23,6 @@ CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" # CONFIG_EXPERIMENTAL=y CONFIG_BROKEN_ON_SMP=y -CONFIG_LOCK_KERNEL=y CONFIG_INIT_ENV_ARG_LIMIT=32 CONFIG_LOCALVERSION="" CONFIG_LOCALVERSION_AUTO=y |