summaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
Diffstat (limited to 'arch')
-rw-r--r--arch/alpha/Makefile1
-rw-r--r--arch/alpha/kernel/core_t2.c2
-rw-r--r--arch/alpha/kernel/pci.c17
-rw-r--r--arch/alpha/kernel/traps.c3
-rw-r--r--arch/arm/tools/mach-types126
-rw-r--r--arch/ia64/kernel/iosapic.c2
-rw-r--r--arch/ia64/kernel/setup.c3
-rw-r--r--arch/ia64/sn/kernel/sn2/sn2_smp.c2
-rw-r--r--arch/x86/Kconfig5
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/kvmclock.c89
-rw-r--r--arch/x86/kernel/pvclock.c141
-rw-r--r--arch/x86/kvm/i8254.c9
-rw-r--r--arch/x86/kvm/lapic.c1
-rw-r--r--arch/x86/kvm/mmu.c19
-rw-r--r--arch/x86/kvm/vmx.c19
-rw-r--r--arch/x86/kvm/x86.c91
-rw-r--r--arch/x86/xen/Kconfig3
-rw-r--r--arch/x86/xen/enlighten.c56
-rw-r--r--arch/x86/xen/mmu.c75
-rw-r--r--arch/x86/xen/mmu.h24
-rw-r--r--arch/x86/xen/time.c132
-rw-r--r--arch/x86/xen/xen-head.S6
23 files changed, 482 insertions, 345 deletions
diff --git a/arch/alpha/Makefile b/arch/alpha/Makefile
index 4e1a8e2c454..4759fe751aa 100644
--- a/arch/alpha/Makefile
+++ b/arch/alpha/Makefile
@@ -13,6 +13,7 @@ NM := $(NM) -B
LDFLAGS_vmlinux := -static -N #-relax
CHECKFLAGS += -D__alpha__ -m64
cflags-y := -pipe -mno-fp-regs -ffixed-8 -msmall-data
+cflags-y += $(call cc-option, -fno-jump-tables)
cpuflags-$(CONFIG_ALPHA_EV4) := -mcpu=ev4
cpuflags-$(CONFIG_ALPHA_EV5) := -mcpu=ev5
diff --git a/arch/alpha/kernel/core_t2.c b/arch/alpha/kernel/core_t2.c
index c0750291b44..d9980d47ab8 100644
--- a/arch/alpha/kernel/core_t2.c
+++ b/arch/alpha/kernel/core_t2.c
@@ -74,6 +74,8 @@
# define DBG(args)
#endif
+DEFINE_SPINLOCK(t2_hae_lock);
+
static volatile unsigned int t2_mcheck_any_expected;
static volatile unsigned int t2_mcheck_last_taken;
diff --git a/arch/alpha/kernel/pci.c b/arch/alpha/kernel/pci.c
index 36ab22a7ea1..5cf45fc5134 100644
--- a/arch/alpha/kernel/pci.c
+++ b/arch/alpha/kernel/pci.c
@@ -71,6 +71,23 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82378, quirk_i
static void __init
quirk_cypress(struct pci_dev *dev)
{
+ /* The Notorious Cy82C693 chip. */
+
+ /* The generic legacy mode IDE fixup in drivers/pci/probe.c
+ doesn't work correctly with the Cypress IDE controller as
+ it has non-standard register layout. Fix that. */
+ if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE) {
+ dev->resource[2].start = dev->resource[3].start = 0;
+ dev->resource[2].end = dev->resource[3].end = 0;
+ dev->resource[2].flags = dev->resource[3].flags = 0;
+ if (PCI_FUNC(dev->devfn) == 2) {
+ dev->resource[0].start = 0x170;
+ dev->resource[0].end = 0x177;
+ dev->resource[1].start = 0x376;
+ dev->resource[1].end = 0x376;
+ }
+ }
+
/* The Cypress bridge responds on the PCI bus in the address range
0xffff0000-0xffffffff (conventional x86 BIOS ROM). There is no
way to turn this off. The bridge also supports several extended
diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c
index dc57790250d..c778779007f 100644
--- a/arch/alpha/kernel/traps.c
+++ b/arch/alpha/kernel/traps.c
@@ -447,7 +447,7 @@ struct unaligned_stat {
/* Macro for exception fixup code to access integer registers. */
-#define una_reg(r) (regs->regs[(r) >= 16 && (r) <= 18 ? (r)+19 : (r)])
+#define una_reg(r) (_regs[(r) >= 16 && (r) <= 18 ? (r)+19 : (r)])
asmlinkage void
@@ -456,6 +456,7 @@ do_entUna(void * va, unsigned long opcode, unsigned long reg,
{
long error, tmp1, tmp2, tmp3, tmp4;
unsigned long pc = regs->pc - 4;
+ unsigned long *_regs = regs->regs;
const struct exception_table_entry *fixup;
unaligned[0].count++;
diff --git a/arch/arm/tools/mach-types b/arch/arm/tools/mach-types
index 207a8b5a0c4..0be5630ff56 100644
--- a/arch/arm/tools/mach-types
+++ b/arch/arm/tools/mach-types
@@ -12,7 +12,7 @@
#
# http://www.arm.linux.org.uk/developer/machines/?action=new
#
-# Last update: Sat Apr 19 11:23:38 2008
+# Last update: Mon Jul 7 16:25:39 2008
#
# machine_is_xxx CONFIG_xxxx MACH_TYPE_xxx number
#
@@ -560,7 +560,6 @@ husky MACH_HUSKY HUSKY 543
boxer MACH_BOXER BOXER 544
shepherd MACH_SHEPHERD SHEPHERD 545
aml42800aa MACH_AML42800AA AML42800AA 546
-ml674001 MACH_MACH_TYPE_ML674001 MACH_TYPE_ML674001 547
lpc2294 MACH_LPC2294 LPC2294 548
switchgrass MACH_SWITCHGRASS SWITCHGRASS 549
ens_cmu MACH_ENS_CMU ENS_CMU 550
@@ -748,7 +747,6 @@ anubis MACH_ANUBIS ANUBIS 734
ite8152 MACH_ITE8152 ITE8152 735
lpc3xxx MACH_LPC3XXX LPC3XXX 736
puppeteer MACH_PUPPETEER PUPPETEER 737
-vt001 MACH_MACH_VADATECH MACH_VADATECH 738
e570 MACH_E570 E570 739
x50 MACH_X50 X50 740
recon MACH_RECON RECON 741
@@ -839,7 +837,7 @@ ccxp270 MACH_CCXP CCXP 825
omap_gsample MACH_OMAP_GSAMPLE OMAP_GSAMPLE 826
realview_eb MACH_REALVIEW_EB REALVIEW_EB 827
samoa MACH_SAMOA SAMOA 828
-t3xscale MACH_T3XSCALE T3XSCALE 829
+palmt3 MACH_PALMT3 PALMT3 829
i878 MACH_I878 I878 830
borzoi MACH_BORZOI BORZOI 831
gecko MACH_GECKO GECKO 832
@@ -895,7 +893,7 @@ mio8390 MACH_MIO8390 MIO8390 881
omi_board MACH_OMI_BOARD OMI_BOARD 882
mx21civ MACH_MX21CIV MX21CIV 883
mahi_cdac MACH_MAHI_CDAC MAHI_CDAC 884
-xscale_palmtx MACH_XSCALE_PALMTX XSCALE_PALMTX 885
+palmtx MACH_PALMTX PALMTX 885
s3c2413 MACH_S3C2413 S3C2413 887
samsys_ep0 MACH_SAMSYS_EP0 SAMSYS_EP0 888
wg302v1 MACH_WG302V1 WG302V1 889
@@ -918,7 +916,7 @@ nxdb500 MACH_NXDB500 NXDB500 905
apf9328 MACH_APF9328 APF9328 906
omap_wipoq MACH_OMAP_WIPOQ OMAP_WIPOQ 907
omap_twip MACH_OMAP_TWIP OMAP_TWIP 908
-xscale_treo650 MACH_XSCALE_PALMTREO650 XSCALE_PALMTREO650 909
+palmtreo650 MACH_PALMTREO650 PALMTREO650 909
acumen MACH_ACUMEN ACUMEN 910
xp100 MACH_XP100 XP100 911
fs2410 MACH_FS2410 FS2410 912
@@ -926,8 +924,8 @@ pxa270_cerf MACH_PXA270_CERF PXA270_CERF 913
sq2ftlpalm MACH_SQ2FTLPALM SQ2FTLPALM 914
bsemserver MACH_BSEMSERVER BSEMSERVER 915
netclient MACH_NETCLIENT NETCLIENT 916
-xscale_palmtt5 MACH_XSCALE_PALMTT5 XSCALE_PALMTT5 917
-xscale_palmtc MACH_OMAP_PALMTC OMAP_PALMTC 918
+palmt5 MACH_PALMT5 PALMT5 917
+palmtc MACH_PALMTC PALMTC 918
omap_apollon MACH_OMAP_APOLLON OMAP_APOLLON 919
mxc30030evb MACH_MXC30030EVB MXC30030EVB 920
rea_2d MACH_REA_2D REA_2D 921
@@ -1220,7 +1218,6 @@ empca400 MACH_EMPCA400 EMPCA400 1211
em7210 MACH_EM7210 EM7210 1212
htchermes MACH_HTCHERMES HTCHERMES 1213
eti_c1 MACH_ETI_C1 ETI_C1 1214
-mach_dep2410 MACH_MACH_DEP2410 MACH_DEP2410 1215
ac100 MACH_AC100 AC100 1216
sneetch MACH_SNEETCH SNEETCH 1217
studentmate MACH_STUDENTMATE STUDENTMATE 1218
@@ -1421,10 +1418,10 @@ looxc550 MACH_LOOXC550 LOOXC550 1417
cnty_titan MACH_CNTY_TITAN CNTY_TITAN 1418
app3xx MACH_APP3XX APP3XX 1419
sideoatsgrama MACH_SIDEOATSGRAMA SIDEOATSGRAMA 1420
-xscale_palmt700p MACH_XSCALE_PALMT700P XSCALE_PALMT700P 1421
-xscale_palmt700w MACH_XSCALE_PALMT700W XSCALE_PALMT700W 1422
-xscale_palmt750 MACH_XSCALE_PALMT750 XSCALE_PALMT750 1423
-xscale_palmt755p MACH_XSCALE_PALMT755P XSCALE_PALMT755P 1424
+palmtreo700p MACH_PALMTREO700P PALMTREO700P 1421
+palmtreo700w MACH_PALMTREO700W PALMTREO700W 1422
+palmtreo750 MACH_PALMTREO750 PALMTREO750 1423
+palmtreo755p MACH_PALMTREO755P PALMTREO755P 1424
ezreganut9200 MACH_EZREGANUT9200 EZREGANUT9200 1425
sarge MACH_SARGE SARGE 1426
a696 MACH_A696 A696 1427
@@ -1463,7 +1460,7 @@ artemis MACH_ARTEMIS ARTEMIS 1462
htctitan MACH_HTCTITAN HTCTITAN 1463
qranium MACH_QRANIUM QRANIUM 1464
adx_wsc2 MACH_ADX_WSC2 ADX_WSC2 1465
-adx_medcom MACH_ADX_MEDINET ADX_MEDINET 1466
+adx_medcom MACH_ADX_MEDCOM ADX_MEDCOM 1466
bboard MACH_BBOARD BBOARD 1467
cambria MACH_CAMBRIA CAMBRIA 1468
mt7xxx MACH_MT7XXX MT7XXX 1469
@@ -1519,7 +1516,7 @@ wp188 MACH_WP188 WP188 1518
corsica MACH_CORSICA CORSICA 1519
bigeye MACH_BIGEYE BIGEYE 1520
tll5000 MACH_TLL5000 TLL5000 1522
-hni270 MACH_HNI_X270 HNI_X270 1523
+bebot MACH_BEBOT BEBOT 1523
qong MACH_QONG QONG 1524
tcompact MACH_TCOMPACT TCOMPACT 1525
puma5 MACH_PUMA5 PUMA5 1526
@@ -1636,7 +1633,6 @@ awlug4lcu MACH_AWLUG4LCU AWLUG4LCU 1637
palermoc MACH_PALERMOC PALERMOC 1638
omap_ldp MACH_OMAP_LDP OMAP_LDP 1639
ip500 MACH_IP500 IP500 1640
-mx35ads MACH_MACH_MX35ADS MACH_MX35ADS 1641
ase2 MACH_ASE2 ASE2 1642
mx35evb MACH_MX35EVB MX35EVB 1643
aml_m8050 MACH_AML_M8050 AML_M8050 1644
@@ -1647,7 +1643,7 @@ badger MACH_BADGER BADGER 1648
trizeps4wl MACH_TRIZEPS4WL TRIZEPS4WL 1649
trizeps5 MACH_TRIZEPS5 TRIZEPS5 1650
marlin MACH_MARLIN MARLIN 1651
-ts7800 MACH_TS7800 TS7800 1652
+ts78xx MACH_TS78XX TS78XX 1652
hpipaq214 MACH_HPIPAQ214 HPIPAQ214 1653
at572d940dcm MACH_AT572D940DCM AT572D940DCM 1654
ne1board MACH_NE1BOARD NE1BOARD 1655
@@ -1720,3 +1716,99 @@ htc_kaiser MACH_HTC_KAISER HTC_KAISER 1724
lg_ks20 MACH_LG_KS20 LG_KS20 1725
hhgps MACH_HHGPS HHGPS 1726
nokia_n810_wimax MACH_NOKIA_N810_WIMAX NOKIA_N810_WIMAX 1727
+insight MACH_INSIGHT INSIGHT 1728
+sapphire MACH_SAPPHIRE SAPPHIRE 1729
+csb637xo MACH_CSB637XO CSB637XO 1730
+evisiong MACH_EVISIONG EVISIONG 1731
+stmp37xx MACH_STMP37XX STMP37XX 1732
+stmp378x MACH_STMP38XX STMP38XX 1733
+tnt MACH_TNT TNT 1734
+tbxt MACH_TBXT TBXT 1735
+playmate MACH_PLAYMATE PLAYMATE 1736
+pns10 MACH_PNS10 PNS10 1737
+eznavi MACH_EZNAVI EZNAVI 1738
+ps4000 MACH_PS4000 PS4000 1739
+ezx_a780 MACH_EZX_A780 EZX_A780 1740
+ezx_e680 MACH_EZX_E680 EZX_E680 1741
+ezx_a1200 MACH_EZX_A1200 EZX_A1200 1742
+ezx_e6 MACH_EZX_E6 EZX_E6 1743
+ezx_e2 MACH_EZX_E2 EZX_E2 1744
+ezx_a910 MACH_EZX_A910 EZX_A910 1745
+cwmx31 MACH_CWMX31 CWMX31 1746
+sl2312 MACH_SL2312 SL2312 1747
+blenny MACH_BLENNY BLENNY 1748
+ds107 MACH_DS107 DS107 1749
+dsx07 MACH_DSX07 DSX07 1750
+picocom1 MACH_PICOCOM1 PICOCOM1 1751
+lynx_wolverine MACH_LYNX_WOLVERINE LYNX_WOLVERINE 1752
+ubisys_p9_sc19 MACH_UBISYS_P9_SC19 UBISYS_P9_SC19 1753
+kratos_low MACH_KRATOS_LOW KRATOS_LOW 1754
+m700 MACH_M700 M700 1755
+edmini_v2 MACH_EDMINI_V2 EDMINI_V2 1756
+zipit2 MACH_ZIPIT2 ZIPIT2 1757
+hslfemtocell MACH_HSLFEMTOCELL HSLFEMTOCELL 1758
+daintree_at91 MACH_DAINTREE_AT91 DAINTREE_AT91 1759
+sg560usb MACH_SG560USB SG560USB 1760
+omap3_pandora MACH_OMAP3_PANDORA OMAP3_PANDORA 1761
+usr8200 MACH_USR8200 USR8200 1762
+s1s65k MACH_S1S65K S1S65K 1763
+s2s65a MACH_S2S65A S2S65A 1764
+icore MACH_ICORE ICORE 1765
+mss2 MACH_MSS2 MSS2 1766
+belmont MACH_BELMONT BELMONT 1767
+asusp525 MACH_ASUSP525 ASUSP525 1768
+lb88rc8480 MACH_LB88RC8480 LB88RC8480 1769
+hipxa MACH_HIPXA HIPXA 1770
+mx25_3ds MACH_MX25_3DS MX25_3DS 1771
+m800 MACH_M800 M800 1772
+omap3530_lv_som MACH_OMAP3530_LV_SOM OMAP3530_LV_SOM 1773
+prima_evb MACH_PRIMA_EVB PRIMA_EVB 1774
+mx31bt1 MACH_MX31BT1 MX31BT1 1775
+atlas4_evb MACH_ATLAS4_EVB ATLAS4_EVB 1776
+mx31cicada MACH_MX31CICADA MX31CICADA 1777
+mi424wr MACH_MI424WR MI424WR 1778
+axs_ultrax MACH_AXS_ULTRAX AXS_ULTRAX 1779
+at572d940deb MACH_AT572D940DEB AT572D940DEB 1780
+davinci_da8xx_evm MACH_DAVINCI_DA8XX_EVM DAVINCI_DA8XX_EVM 1781
+ep9302 MACH_EP9302 EP9302 1782
+at572d940hfeb MACH_AT572D940HFEB AT572D940HFEB 1783
+cybook3 MACH_CYBOOK3 CYBOOK3 1784
+wdg002 MACH_WDG002 WDG002 1785
+sg560adsl MACH_SG560ADSL SG560ADSL 1786
+nextio_n2800_ica MACH_NEXTIO_N2800_ICA NEXTIO_N2800_ICA 1787
+marvell_newdb MACH_MARVELL_NEWDB MARVELL_NEWDB 1789
+vandihud MACH_VANDIHUD VANDIHUD 1790
+magx_e8 MACH_MAGX_E8 MAGX_E8 1791
+magx_z6 MACH_MAGX_Z6 MAGX_Z6 1792
+magx_v8 MACH_MAGX_V8 MAGX_V8 1793
+magx_u9 MACH_MAGX_U9 MAGX_U9 1794
+toughcf08 MACH_TOUGHCF08 TOUGHCF08 1795
+zw4400 MACH_ZW4400 ZW4400 1796
+marat91 MACH_MARAT91 MARAT91 1797
+overo MACH_OVERO OVERO 1798
+at2440evb MACH_AT2440EVB AT2440EVB 1799
+neocore926 MACH_NEOCORE926 NEOCORE926 1800
+wnr854t MACH_WNR854T WNR854T 1801
+imx27 MACH_IMX27 IMX27 1802
+moose_db MACH_MOOSE_DB MOOSE_DB 1803
+fab4 MACH_FAB4 FAB4 1804
+htcdiamond MACH_HTCDIAMOND HTCDIAMOND 1805
+fiona MACH_FIONA FIONA 1806
+mxc30030_x MACH_MXC30030_X MXC30030_X 1807
+bmp1000 MACH_BMP1000 BMP1000 1808
+logi9200 MACH_LOGI9200 LOGI9200 1809
+tqma31 MACH_TQMA31 TQMA31 1810
+ccw9p9215js MACH_CCW9P9215JS CCW9P9215JS 1811
+rd88f5181l_ge MACH_RD88F5181L_GE RD88F5181L_GE 1812
+sifmain MACH_SIFMAIN SIFMAIN 1813
+sam9_l9261 MACH_SAM9_L9261 SAM9_L9261 1814
+cc9m2443js MACH_CC9M2443JS CC9M2443JS 1815
+xaria300 MACH_XARIA300 XARIA300 1816
+it9200 MACH_IT9200 IT9200 1817
+rd88f5181l_fxo MACH_RD88F5181L_FXO RD88F5181L_FXO 1818
+kriss_sensor MACH_KRISS_SENSOR KRISS_SENSOR 1819
+pilz_pmi5 MACH_PILZ_PMI5 PILZ_PMI5 1820
+jade MACH_JADE JADE 1821
+ks8695_softplc MACH_KS8695_SOFTPLC KS8695_SOFTPLC 1822
+gprisc4 MACH_GPRISC4 GPRISC4 1823
+stamp9260 MACH_STAMP9260 STAMP9260 1824
diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c
index 082c31dcfd9..39752cdef6f 100644
--- a/arch/ia64/kernel/iosapic.c
+++ b/arch/ia64/kernel/iosapic.c
@@ -558,8 +558,6 @@ static struct iosapic_rte_info * __init_refok iosapic_alloc_rte (void)
if (!iosapic_kmalloc_ok && list_empty(&free_rte_list)) {
rte = alloc_bootmem(sizeof(struct iosapic_rte_info) *
NR_PREALLOCATE_RTE_ENTRIES);
- if (!rte)
- return NULL;
for (i = 0; i < NR_PREALLOCATE_RTE_ENTRIES; i++, rte++)
list_add(&rte->rte_list, &free_rte_list);
}
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index f48a809c686..4ae15c8c248 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -578,8 +578,6 @@ setup_arch (char **cmdline_p)
cpu_init(); /* initialize the bootstrap CPU */
mmu_context_init(); /* initialize context_id bitmap */
- check_sal_cache_flush();
-
#ifdef CONFIG_ACPI
acpi_boot_init();
#endif
@@ -607,6 +605,7 @@ setup_arch (char **cmdline_p)
ia64_mca_init();
platform_setup(cmdline_p);
+ check_sal_cache_flush();
paging_init();
}
diff --git a/arch/ia64/sn/kernel/sn2/sn2_smp.c b/arch/ia64/sn/kernel/sn2/sn2_smp.c
index 49d3120415e..e585f9a2afb 100644
--- a/arch/ia64/sn/kernel/sn2/sn2_smp.c
+++ b/arch/ia64/sn/kernel/sn2/sn2_smp.c
@@ -512,6 +512,8 @@ static ssize_t sn2_ptc_proc_write(struct file *file, const char __user *user, si
int cpu;
char optstr[64];
+ if (count == 0 || count > sizeof(optstr))
+ return -EINVAL;
if (copy_from_user(optstr, user, count))
return -EFAULT;
optstr[count - 1] = '\0';
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 52e18e6d2ba..e0edaaa6920 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -383,6 +383,7 @@ config VMI
config KVM_CLOCK
bool "KVM paravirtualized clock"
select PARAVIRT
+ select PARAVIRT_CLOCK
depends on !(X86_VISWS || X86_VOYAGER)
help
Turning on this option will allow you to run a paravirtualized clock
@@ -410,6 +411,10 @@ config PARAVIRT
over full virtualization. However, when run without a hypervisor
the kernel is theoretically slower and slightly larger.
+config PARAVIRT_CLOCK
+ bool
+ default n
+
endif
config MEMTEST_BOOTPARAM
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5e618c3b472..77807d4769c 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -82,6 +82,7 @@ obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
obj-$(CONFIG_KVM_GUEST) += kvm.o
obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
+obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 08a30986d47..87edf1ceb1d 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -18,6 +18,7 @@
#include <linux/clocksource.h>
#include <linux/kvm_para.h>
+#include <asm/pvclock.h>
#include <asm/arch_hooks.h>
#include <asm/msr.h>
#include <asm/apic.h>
@@ -36,18 +37,9 @@ static int parse_no_kvmclock(char *arg)
early_param("no-kvmclock", parse_no_kvmclock);
/* The hypervisor will put information about time periodically here */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock);
-#define get_clock(cpu, field) per_cpu(hv_clock, cpu).field
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock);
+static struct pvclock_wall_clock wall_clock;
-static inline u64 kvm_get_delta(u64 last_tsc)
-{
- int cpu = smp_processor_id();
- u64 delta = native_read_tsc() - last_tsc;
- return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE;
-}
-
-static struct kvm_wall_clock wall_clock;
-static cycle_t kvm_clock_read(void);
/*
* The wallclock is the time of day when we booted. Since then, some time may
* have elapsed since the hypervisor wrote the data. So we try to account for
@@ -55,64 +47,37 @@ static cycle_t kvm_clock_read(void);
*/
static unsigned long kvm_get_wallclock(void)
{
- u32 wc_sec, wc_nsec;
- u64 delta;
+ struct pvclock_vcpu_time_info *vcpu_time;
struct timespec ts;
- int version, nsec;
int low, high;
low = (int)__pa(&wall_clock);
high = ((u64)__pa(&wall_clock) >> 32);
+ native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
- delta = kvm_clock_read();
+ vcpu_time = &get_cpu_var(hv_clock);
+ pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
+ put_cpu_var(hv_clock);
- native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
- do {
- version = wall_clock.wc_version;
- rmb();
- wc_sec = wall_clock.wc_sec;
- wc_nsec = wall_clock.wc_nsec;
- rmb();
- } while ((wall_clock.wc_version != version) || (version & 1));
-
- delta = kvm_clock_read() - delta;
- delta += wc_nsec;
- nsec = do_div(delta, NSEC_PER_SEC);
- set_normalized_timespec(&ts, wc_sec + delta, nsec);
- /*
- * Of all mechanisms of time adjustment I've tested, this one
- * was the champion!
- */
- return ts.tv_sec + 1;
+ return ts.tv_sec;
}
static int kvm_set_wallclock(unsigned long now)
{
- return 0;
+ return -1;
}
-/*
- * This is our read_clock function. The host puts an tsc timestamp each time
- * it updates a new time. Without the tsc adjustment, we can have a situation
- * in which a vcpu starts to run earlier (smaller system_time), but probes
- * time later (compared to another vcpu), leading to backwards time
- */
static cycle_t kvm_clock_read(void)
{
- u64 last_tsc, now;
- int cpu;
+ struct pvclock_vcpu_time_info *src;
+ cycle_t ret;
- preempt_disable();
- cpu = smp_processor_id();
-
- last_tsc = get_clock(cpu, tsc_timestamp);
- now = get_clock(cpu, system_time);
-
- now += kvm_get_delta(last_tsc);
- preempt_enable();
-
- return now;
+ src = &get_cpu_var(hv_clock);
+ ret = pvclock_clocksource_read(src);
+ put_cpu_var(hv_clock);
+ return ret;
}
+
static struct clocksource kvm_clock = {
.name = "kvm-clock",
.read = kvm_clock_read,
@@ -123,13 +88,14 @@ static struct clocksource kvm_clock = {
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
-static int kvm_register_clock(void)
+static int kvm_register_clock(char *txt)
{
int cpu = smp_processor_id();
int low, high;
low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
-
+ printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
+ cpu, high, low, txt);
return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
}
@@ -140,12 +106,20 @@ static void kvm_setup_secondary_clock(void)
* Now that the first cpu already had this clocksource initialized,
* we shouldn't fail.
*/
- WARN_ON(kvm_register_clock());
+ WARN_ON(kvm_register_clock("secondary cpu clock"));
/* ok, done with our trickery, call native */
setup_secondary_APIC_clock();
}
#endif
+#ifdef CONFIG_SMP
+void __init kvm_smp_prepare_boot_cpu(void)
+{
+ WARN_ON(kvm_register_clock("primary cpu clock"));
+ native_smp_prepare_boot_cpu();
+}
+#endif
+
/*
* After the clock is registered, the host will keep writing to the
* registered memory location. If the guest happens to shutdown, this memory
@@ -174,7 +148,7 @@ void __init kvmclock_init(void)
return;
if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
- if (kvm_register_clock())
+ if (kvm_register_clock("boot clock"))
return;
pv_time_ops.get_wallclock = kvm_get_wallclock;
pv_time_ops.set_wallclock = kvm_set_wallclock;
@@ -182,6 +156,9 @@ void __init kvmclock_init(void)
#ifdef CONFIG_X86_LOCAL_APIC
pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
#endif
+#ifdef CONFIG_SMP
+ smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
+#endif
machine_ops.shutdown = kvm_shutdown;
#ifdef CONFIG_KEXEC
machine_ops.crash_shutdown = kvm_crash_shutdown;
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
new file mode 100644
index 00000000000..05fbe9a0325
--- /dev/null
+++ b/arch/x86/kernel/pvclock.c
@@ -0,0 +1,141 @@
+/* paravirtual clock -- common code used by kvm/xen
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <asm/pvclock.h>
+
+/*
+ * These are perodically updated
+ * xen: magic shared_info page
+ * kvm: gpa registered via msr
+ * and then copied here.
+ */
+struct pvclock_shadow_time {
+ u64 tsc_timestamp; /* TSC at last update of time vals. */
+ u64 system_timestamp; /* Time, in nanosecs, since boot. */
+ u32 tsc_to_nsec_mul;
+ int tsc_shift;
+ u32 version;
+};
+
+/*
+ * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
+ * yielding a 64-bit result.
+ */
+static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
+{
+ u64 product;
+#ifdef __i386__
+ u32 tmp1, tmp2;
+#endif
+
+ if (shift < 0)
+ delta >>= -shift;
+ else
+ delta <<= shift;
+
+#ifdef __i386__
+ __asm__ (
+ "mul %5 ; "
+ "mov %4,%%eax ; "
+ "mov %%edx,%4 ; "
+ "mul %5 ; "
+ "xor %5,%5 ; "
+ "add %4,%%eax ; "
+ "adc %5,%%edx ; "
+ : "=A" (product), "=r" (tmp1), "=r" (tmp2)
+ : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
+#elif __x86_64__
+ __asm__ (
+ "mul %%rdx ; shrd $32,%%rdx,%%rax"
+ : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
+#else
+#error implement me!
+#endif
+
+ return product;
+}
+
+static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
+{
+ u64 delta = native_read_tsc() - shadow->tsc_timestamp;
+ return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
+}
+
+/*
+ * Reads a consistent set of time-base values from hypervisor,
+ * into a shadow data area.
+ */
+static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
+ struct pvclock_vcpu_time_info *src)
+{
+ do {
+ dst->version = src->version;
+ rmb(); /* fetch version before data */
+ dst->tsc_timestamp = src->tsc_timestamp;
+ dst->system_timestamp = src->system_time;
+ dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
+ dst->tsc_shift = src->tsc_shift;
+ rmb(); /* test version after fetching data */
+ } while ((src->version & 1) || (dst->version != src->version));
+
+ return dst->version;
+}
+
+cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
+{
+ struct pvclock_shadow_time shadow;
+ unsigned version;
+ cycle_t ret, offset;
+
+ do {
+ version = pvclock_get_time_values(&shadow, src);
+ barrier();
+ offset = pvclock_get_nsec_offset(&shadow);
+ ret = shadow.system_timestamp + offset;
+ barrier();
+ } while (version != src->version);
+
+ return ret;
+}
+
+void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
+ struct pvclock_vcpu_time_info *vcpu_time,
+ struct timespec *ts)
+{
+ u32 version;
+ u64 delta;
+ struct timespec now;
+
+ /* get wallclock at system boot */
+ do {
+ version = wall_clock->version;
+ rmb(); /* fetch version before time */
+ now.tv_sec = wall_clock->sec;
+ now.tv_nsec = wall_clock->nsec;
+ rmb(); /* fetch time before checking version */
+ } while ((wall_clock->version & 1) || (version != wall_clock->version));
+
+ delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */
+ delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
+
+ now.tv_nsec = do_div(delta, NSEC_PER_SEC);
+ now.tv_sec = delta;
+
+ set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
+}
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index f2f5d260874..3829aa7b663 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -200,9 +200,12 @@ int __pit_timer_fn(struct kvm_kpit_state *ps)
atomic_inc(&pt->pending);
smp_mb__after_atomic_inc();
- if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
- vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
- wake_up_interruptible(&vcpu0->wq);
+ if (vcpu0) {
+ set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
+ if (waitqueue_active(&vcpu0->wq)) {
+ vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ wake_up_interruptible(&vcpu0->wq);
+ }
}
pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index c297c50eba6..ebc03f5ae16 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -940,6 +940,7 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
wait_queue_head_t *q = &apic->vcpu->wq;
atomic_inc(&apic->timer.pending);
+ set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
if (waitqueue_active(q)) {
apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
wake_up_interruptible(q);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ee3f53098f0..7e7c3969f7a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -640,6 +640,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
rmap_remove(kvm, spte);
--kvm->stat.lpages;
set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+ spte = NULL;
write_protected = 1;
}
spte = rmap_next(kvm, rmapp, spte);
@@ -1082,10 +1083,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
struct kvm_mmu_page *shadow;
spte |= PT_WRITABLE_MASK;
- if (user_fault) {
- mmu_unshadow(vcpu->kvm, gfn);
- goto unshadowed;
- }
shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
if (shadow ||
@@ -1102,8 +1099,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
}
}
-unshadowed:
-
if (pte_access & ACC_WRITE_MASK)
mark_page_dirty(vcpu->kvm, gfn);
@@ -1580,11 +1575,13 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
u64 *spte,
const void *new)
{
- if ((sp->role.level != PT_PAGE_TABLE_LEVEL)
- && !vcpu->arch.update_pte.largepage) {
- ++vcpu->kvm->stat.mmu_pde_zapped;
- return;
- }
+ if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
+ if (!vcpu->arch.update_pte.largepage ||
+ sp->role.glevels == PT32_ROOT_LEVEL) {
+ ++vcpu->kvm->stat.mmu_pde_zapped;
+ return;
+ }
+ }
++vcpu->kvm->stat.mmu_pte_updated;
if (sp->role.glevels == PT32_ROOT_LEVEL)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 02efbe75f31..540e9517907 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -566,7 +566,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
load_transition_efer(vmx);
}
-static void vmx_load_host_state(struct vcpu_vmx *vmx)
+static void __vmx_load_host_state(struct vcpu_vmx *vmx)
{
unsigned long flags;
@@ -596,6 +596,13 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
reload_host_efer(vmx);
}
+static void vmx_load_host_state(struct vcpu_vmx *vmx)
+{
+ preempt_disable();
+ __vmx_load_host_state(vmx);
+ preempt_enable();
+}
+
/*
* Switches to specified vcpu, until a matching vcpu_put(), but assumes
* vcpu mutex is already taken.
@@ -654,7 +661,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
- vmx_load_host_state(to_vmx(vcpu));
+ __vmx_load_host_state(to_vmx(vcpu));
}
static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
@@ -884,11 +891,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
switch (msr_index) {
#ifdef CONFIG_X86_64
case MSR_EFER:
+ vmx_load_host_state(vmx);
ret = kvm_set_msr_common(vcpu, msr_index, data);
- if (vmx->host_state.loaded) {
- reload_host_efer(vmx);
- load_transition_efer(vmx);
- }
break;
case MSR_FS_BASE:
vmcs_writel(GUEST_FS_BASE, data);
@@ -910,11 +914,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
guest_write_tsc(data);
break;
default:
+ vmx_load_host_state(vmx);
msr = find_msr_entry(vmx, msr_index);
if (msr) {
msr->data = data;
- if (vmx->host_state.loaded)
- load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
break;
}
ret = kvm_set_msr_common(vcpu, msr_index, data);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 00acf1301a1..63a77caa59f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -492,8 +492,8 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
static int version;
- struct kvm_wall_clock wc;
- struct timespec wc_ts;
+ struct pvclock_wall_clock wc;
+ struct timespec now, sys, boot;
if (!wall_clock)
return;
@@ -502,10 +502,19 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
- wc_ts = current_kernel_time();
- wc.wc_sec = wc_ts.tv_sec;
- wc.wc_nsec = wc_ts.tv_nsec;
- wc.wc_version = version;
+ /*
+ * The guest calculates current wall clock time by adding
+ * system time (updated by kvm_write_guest_time below) to the
+ * wall clock specified here. guest system time equals host
+ * system time for us, thus we must fill in host boot time here.
+ */
+ now = current_kernel_time();
+ ktime_get_ts(&sys);
+ boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
+
+ wc.sec = boot.tv_sec;
+ wc.nsec = boot.tv_nsec;
+ wc.version = version;
kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
@@ -513,6 +522,45 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
}
+static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
+{
+ uint32_t quotient, remainder;
+
+ /* Don't try to replace with do_div(), this one calculates
+ * "(dividend << 32) / divisor" */
+ __asm__ ( "divl %4"
+ : "=a" (quotient), "=d" (remainder)
+ : "0" (0), "1" (dividend), "r" (divisor) );
+ return quotient;
+}
+
+static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
+{
+ uint64_t nsecs = 1000000000LL;
+ int32_t shift = 0;
+ uint64_t tps64;
+ uint32_t tps32;
+
+ tps64 = tsc_khz * 1000LL;
+ while (tps64 > nsecs*2) {
+ tps64 >>= 1;
+ shift--;
+ }
+
+ tps32 = (uint32_t)tps64;
+ while (tps32 <= (uint32_t)nsecs) {
+ tps32 <<= 1;
+ shift++;
+ }
+
+ hv_clock->tsc_shift = shift;
+ hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
+
+ pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
+ __FUNCTION__, tsc_khz, hv_clock->tsc_shift,
+ hv_clock->tsc_to_system_mul);
+}
+
static void kvm_write_guest_time(struct kvm_vcpu *v)
{
struct timespec ts;
@@ -523,6 +571,11 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
if ((!vcpu->time_page))
return;
+ if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) {
+ kvm_set_time_scale(tsc_khz, &vcpu->hv_clock);
+ vcpu->hv_clock_tsc_khz = tsc_khz;
+ }
+
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
@@ -537,14 +590,14 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
/*
* The interface expects us to write an even number signaling that the
* update is finished. Since the guest won't see the intermediate
- * state, we just write "2" at the end
+ * state, we just increase by 2 at the end.
*/
- vcpu->hv_clock.version = 2;
+ vcpu->hv_clock.version += 2;
shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
- sizeof(vcpu->hv_clock));
+ sizeof(vcpu->hv_clock));
kunmap_atomic(shared_kaddr, KM_USER0);
@@ -599,10 +652,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
/* ...but clean it before doing the actual write */
vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
- vcpu->arch.hv_clock.tsc_to_system_mul =
- clocksource_khz2mult(tsc_khz, 22);
- vcpu->arch.hv_clock.tsc_shift = 22;
-
down_read(&current->mm->mmap_sem);
vcpu->arch.time_page =
gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
@@ -2759,6 +2808,8 @@ again:
if (vcpu->requests) {
if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
__kvm_migrate_timers(vcpu);
+ if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
+ kvm_x86_ops->tlb_flush(vcpu);
if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
&vcpu->requests)) {
kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
@@ -2772,6 +2823,7 @@ again:
}
}
+ clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
kvm_inject_pending_timer_irqs(vcpu);
preempt_disable();
@@ -2781,21 +2833,13 @@ again:
local_irq_disable();
- if (need_resched()) {
+ if (vcpu->requests || need_resched()) {
local_irq_enable();
preempt_enable();
r = 1;
goto out;
}
- if (vcpu->requests)
- if (test_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) {
- local_irq_enable();
- preempt_enable();
- r = 1;
- goto out;
- }
-
if (signal_pending(current)) {
local_irq_enable();
preempt_enable();
@@ -2825,9 +2869,6 @@ again:
kvm_guest_enter();
- if (vcpu->requests)
- if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
- kvm_x86_ops->tlb_flush(vcpu);
KVMTRACE_0D(VMENTRY, vcpu, entryexit);
kvm_x86_ops->run(vcpu, kvm_run);
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 2e641be2737..6c388e593bc 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -5,8 +5,9 @@
config XEN
bool "Xen guest support"
select PARAVIRT
+ select PARAVIRT_CLOCK
depends on X86_32
- depends on X86_CMPXCHG && X86_TSC && !(X86_VISWS || X86_VOYAGER)
+ depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER)
help
This is the Linux Xen port. Enabling this will allow the
kernel to boot in a paravirtualized environment under the
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index c8a56e457d6..f09c1c69c37 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -785,38 +785,35 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
static __init void xen_pagetable_setup_start(pgd_t *base)
{
pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
+ int i;
/* special set_pte for pagetable initialization */
pv_mmu_ops.set_pte = xen_set_pte_init;
init_mm.pgd = base;
/*
- * copy top-level of Xen-supplied pagetable into place. For
- * !PAE we can use this as-is, but for PAE it is a stand-in
- * while we copy the pmd pages.
+ * copy top-level of Xen-supplied pagetable into place. This
+ * is a stand-in while we copy the pmd pages.
*/
memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
- if (PTRS_PER_PMD > 1) {
- int i;
- /*
- * For PAE, need to allocate new pmds, rather than
- * share Xen's, since Xen doesn't like pmd's being
- * shared between address spaces.
- */
- for (i = 0; i < PTRS_PER_PGD; i++) {
- if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
- pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+ /*
+ * For PAE, need to allocate new pmds, rather than
+ * share Xen's, since Xen doesn't like pmd's being
+ * shared between address spaces.
+ */
+ for (i = 0; i < PTRS_PER_PGD; i++) {
+ if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
+ pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
- memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
- PAGE_SIZE);
+ memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
+ PAGE_SIZE);
- make_lowmem_page_readonly(pmd);
+ make_lowmem_page_readonly(pmd);
- set_pgd(&base[i], __pgd(1 + __pa(pmd)));
- } else
- pgd_clear(&base[i]);
- }
+ set_pgd(&base[i], __pgd(1 + __pa(pmd)));
+ } else
+ pgd_clear(&base[i]);
}
/* make sure zero_page is mapped RO so we can use it in pagetables */
@@ -873,17 +870,7 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
/* Actually pin the pagetable down, but we can't set PG_pinned
yet because the page structures don't exist yet. */
- {
- unsigned level;
-
-#ifdef CONFIG_X86_PAE
- level = MMUEXT_PIN_L3_TABLE;
-#else
- level = MMUEXT_PIN_L2_TABLE;
-#endif
-
- pin_pagetable_pfn(level, PFN_DOWN(__pa(base)));
- }
+ pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base)));
}
/* This is called once we have the cpu_possible_map */
@@ -1093,7 +1080,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
.make_pte = xen_make_pte,
.make_pgd = xen_make_pgd,
-#ifdef CONFIG_X86_PAE
.set_pte_atomic = xen_set_pte_atomic,
.set_pte_present = xen_set_pte_at,
.set_pud = xen_set_pud,
@@ -1102,7 +1088,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
.make_pmd = xen_make_pmd,
.pmd_val = xen_pmd_val,
-#endif /* PAE */
.activate_mm = xen_activate_mm,
.dup_mmap = xen_dup_mmap,
@@ -1228,6 +1213,11 @@ asmlinkage void __init xen_start_kernel(void)
if (xen_feature(XENFEAT_supervisor_mode_kernel))
pv_info.kernel_rpl = 0;
+ /* Prevent unwanted bits from being set in PTEs. */
+ __supported_pte_mask &= ~_PAGE_GLOBAL;
+ if (!is_initial_xendomain())
+ __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
+
/* set the limit of our address space */
xen_reserve_top();
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 3525ef523a7..df40bf74ea7 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -179,50 +179,56 @@ out:
preempt_enable();
}
-pteval_t xen_pte_val(pte_t pte)
+/* Assume pteval_t is equivalent to all the other *val_t types. */
+static pteval_t pte_mfn_to_pfn(pteval_t val)
+{
+ if (val & _PAGE_PRESENT) {
+ unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT;
+ pteval_t flags = val & ~PTE_MASK;
+ val = (mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
+ }
+
+ return val;
+}
+
+static pteval_t pte_pfn_to_mfn(pteval_t val)
{
- pteval_t ret = pte.pte;
+ if (val & _PAGE_PRESENT) {
+ unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT;
+ pteval_t flags = val & ~PTE_MASK;
+ val = (pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
+ }
- if (ret & _PAGE_PRESENT)
- ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
+ return val;
+}
- return ret;
+pteval_t xen_pte_val(pte_t pte)
+{
+ return pte_mfn_to_pfn(pte.pte);
}
pgdval_t xen_pgd_val(pgd_t pgd)
{
- pgdval_t ret = pgd.pgd;
- if (ret & _PAGE_PRESENT)
- ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
- return ret;
+ return pte_mfn_to_pfn(pgd.pgd);
}
pte_t xen_make_pte(pteval_t pte)
{
- if (pte & _PAGE_PRESENT) {
- pte = phys_to_machine(XPADDR(pte)).maddr;
- pte &= ~(_PAGE_PCD | _PAGE_PWT);
- }
-
- return (pte_t){ .pte = pte };
+ pte = pte_pfn_to_mfn(pte);
+ return native_make_pte(pte);
}
pgd_t xen_make_pgd(pgdval_t pgd)
{
- if (pgd & _PAGE_PRESENT)
- pgd = phys_to_machine(XPADDR(pgd)).maddr;
-
- return (pgd_t){ pgd };
+ pgd = pte_pfn_to_mfn(pgd);
+ return native_make_pgd(pgd);
}
pmdval_t xen_pmd_val(pmd_t pmd)
{
- pmdval_t ret = native_pmd_val(pmd);
- if (ret & _PAGE_PRESENT)
- ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
- return ret;
+ return pte_mfn_to_pfn(pmd.pmd);
}
-#ifdef CONFIG_X86_PAE
+
void xen_set_pud(pud_t *ptr, pud_t val)
{
struct multicall_space mcs;
@@ -267,17 +273,9 @@ void xen_pmd_clear(pmd_t *pmdp)
pmd_t xen_make_pmd(pmdval_t pmd)
{
- if (pmd & _PAGE_PRESENT)
- pmd = phys_to_machine(XPADDR(pmd)).maddr;
-
+ pmd = pte_pfn_to_mfn(pmd);
return native_make_pmd(pmd);
}
-#else /* !PAE */
-void xen_set_pte(pte_t *ptep, pte_t pte)
-{
- *ptep = pte;
-}
-#endif /* CONFIG_X86_PAE */
/*
(Yet another) pagetable walker. This one is intended for pinning a
@@ -430,8 +428,6 @@ static int pin_page(struct page *page, enum pt_level level)
read-only, and can be pinned. */
void xen_pgd_pin(pgd_t *pgd)
{
- unsigned level;
-
xen_mc_batch();
if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
@@ -441,14 +437,7 @@ void xen_pgd_pin(pgd_t *pgd)
xen_mc_batch();
}
-#ifdef CONFIG_X86_PAE
- level = MMUEXT_PIN_L3_TABLE;
-#else
- level = MMUEXT_PIN_L2_TABLE;
-#endif
-
- xen_do_pin(level, PFN_DOWN(__pa(pgd)));
-
+ xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
xen_mc_issue(0);
}
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index b5e189b1519..5fe961caffd 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -37,14 +37,13 @@ void xen_exit_mmap(struct mm_struct *mm);
void xen_pgd_pin(pgd_t *pgd);
//void xen_pgd_unpin(pgd_t *pgd);
-#ifdef CONFIG_X86_PAE
-unsigned long long xen_pte_val(pte_t);
-unsigned long long xen_pmd_val(pmd_t);
-unsigned long long xen_pgd_val(pgd_t);
+pteval_t xen_pte_val(pte_t);
+pmdval_t xen_pmd_val(pmd_t);
+pgdval_t xen_pgd_val(pgd_t);
-pte_t xen_make_pte(unsigned long long);
-pmd_t xen_make_pmd(unsigned long long);
-pgd_t xen_make_pgd(unsigned long long);
+pte_t xen_make_pte(pteval_t);
+pmd_t xen_make_pmd(pmdval_t);
+pgd_t xen_make_pgd(pgdval_t);
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval);
@@ -53,15 +52,4 @@ void xen_set_pud(pud_t *ptr, pud_t val);
void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
void xen_pmd_clear(pmd_t *pmdp);
-
-#else
-unsigned long xen_pte_val(pte_t);
-unsigned long xen_pmd_val(pmd_t);
-unsigned long xen_pgd_val(pgd_t);
-
-pte_t xen_make_pte(unsigned long);
-pmd_t xen_make_pmd(unsigned long);
-pgd_t xen_make_pgd(unsigned long);
-#endif
-
#endif /* _XEN_MMU_H */
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 52b2e385698..41e217503c9 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -14,6 +14,7 @@
#include <linux/kernel_stat.h>
#include <linux/math64.h>
+#include <asm/pvclock.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
@@ -31,17 +32,6 @@
static cycle_t xen_clocksource_read(void);
-/* These are perodically updated in shared_info, and then copied here. */
-struct shadow_time_info {
- u64 tsc_timestamp; /* TSC at last update of time vals. */
- u64 system_timestamp; /* Time, in nanosecs, since boot. */
- u32 tsc_to_nsec_mul;
- int tsc_shift;
- u32 version;
-};
-
-static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
-
/* runstate info updated by Xen */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
@@ -211,7 +201,7 @@ unsigned long long xen_sched_clock(void)
unsigned long xen_cpu_khz(void)
{
u64 xen_khz = 1000000ULL << 32;
- const struct vcpu_time_info *info =
+ const struct pvclock_vcpu_time_info *info =
&HYPERVISOR_shared_info->vcpu_info[0].time;
do_div(xen_khz, info->tsc_to_system_mul);
@@ -223,121 +213,26 @@ unsigned long xen_cpu_khz(void)
return xen_khz;
}
-/*
- * Reads a consistent set of time-base values from Xen, into a shadow data
- * area.
- */
-static unsigned get_time_values_from_xen(void)
-{
- struct vcpu_time_info *src;
- struct shadow_time_info *dst;
-
- /* src is shared memory with the hypervisor, so we need to
- make sure we get a consistent snapshot, even in the face of
- being preempted. */
- src = &__get_cpu_var(xen_vcpu)->time;
- dst = &__get_cpu_var(shadow_time);
-
- do {
- dst->version = src->version;
- rmb(); /* fetch version before data */
- dst->tsc_timestamp = src->tsc_timestamp;
- dst->system_timestamp = src->system_time;
- dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
- dst->tsc_shift = src->tsc_shift;
- rmb(); /* test version after fetching data */
- } while ((src->version & 1) | (dst->version ^ src->version));
-
- return dst->version;
-}
-
-/*
- * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
- * yielding a 64-bit result.
- */
-static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
-{
- u64 product;
-#ifdef __i386__
- u32 tmp1, tmp2;
-#endif
-
- if (shift < 0)
- delta >>= -shift;
- else
- delta <<= shift;
-
-#ifdef __i386__
- __asm__ (
- "mul %5 ; "
- "mov %4,%%eax ; "
- "mov %%edx,%4 ; "
- "mul %5 ; "
- "xor %5,%5 ; "
- "add %4,%%eax ; "
- "adc %5,%%edx ; "
- : "=A" (product), "=r" (tmp1), "=r" (tmp2)
- : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
-#elif __x86_64__
- __asm__ (
- "mul %%rdx ; shrd $32,%%rdx,%%rax"
- : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
-#else
-#error implement me!
-#endif
-
- return product;
-}
-
-static u64 get_nsec_offset(struct shadow_time_info *shadow)
-{
- u64 now, delta;
- now = native_read_tsc();
- delta = now - shadow->tsc_timestamp;
- return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
-}
-
static cycle_t xen_clocksource_read(void)
{
- struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
+ struct pvclock_vcpu_time_info *src;
cycle_t ret;
- unsigned version;
-
- do {
- version = get_time_values_from_xen();
- barrier();
- ret = shadow->system_timestamp + get_nsec_offset(shadow);
- barrier();
- } while (version != __get_cpu_var(xen_vcpu)->time.version);
-
- put_cpu_var(shadow_time);
+ src = &get_cpu_var(xen_vcpu)->time;
+ ret = pvclock_clocksource_read(src);
+ put_cpu_var(xen_vcpu);
return ret;
}
static void xen_read_wallclock(struct timespec *ts)
{
- const struct shared_info *s = HYPERVISOR_shared_info;
- u32 version;
- u64 delta;
- struct timespec now;
-
- /* get wallclock at system boot */
- do {
- version = s->wc_version;
- rmb(); /* fetch version before time */
- now.tv_sec = s->wc_sec;
- now.tv_nsec = s->wc_nsec;
- rmb(); /* fetch time before checking version */
- } while ((s->wc_version & 1) | (version ^ s->wc_version));
+ struct shared_info *s = HYPERVISOR_shared_info;
+ struct pvclock_wall_clock *wall_clock = &(s->wc);
+ struct pvclock_vcpu_time_info *vcpu_time;
- delta = xen_clocksource_read(); /* time since system boot */
- delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
-
- now.tv_nsec = do_div(delta, NSEC_PER_SEC);
- now.tv_sec = delta;
-
- set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
+ vcpu_time = &get_cpu_var(xen_vcpu)->time;
+ pvclock_read_wallclock(wall_clock, vcpu_time, ts);
+ put_cpu_var(xen_vcpu);
}
unsigned long xen_get_wallclock(void)
@@ -345,7 +240,6 @@ unsigned long xen_get_wallclock(void)
struct timespec ts;
xen_read_wallclock(&ts);
-
return ts.tv_sec;
}
@@ -569,8 +463,6 @@ __init void xen_time_init(void)
{
int cpu = smp_processor_id();
- get_time_values_from_xen();
-
clocksource_register(&xen_clocksource);
if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 288d587ce73..6ec3b4f7719 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -17,7 +17,7 @@ ENTRY(startup_xen)
__FINIT
-.pushsection .bss.page_aligned
+.pushsection .text
.align PAGE_SIZE_asm
ENTRY(hypercall_page)
.skip 0x1000
@@ -30,11 +30,7 @@ ENTRY(hypercall_page)
ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen)
ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page)
ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
-#ifdef CONFIG_X86_PAE
ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
-#else
- ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no")
-#endif
ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
#endif /*CONFIG_XEN */