32 files changed, 869 insertions, 436 deletions
diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index 64e450dddb4..167fd89f870 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -1150,16 +1150,13 @@ osf_usleep_thread(struct timeval32 __user *sleep, struct timeval32 __user *remai
 	if (get_tv32(&tmp, sleep))
 		goto fault;
 
-	ticks = tmp.tv_usec;
-	ticks = (ticks + (1000000 / HZ) - 1) / (1000000 / HZ);
-	ticks += tmp.tv_sec * HZ;
+	ticks = timeval_to_jiffies(&tmp);
 
 	current->state = TASK_INTERRUPTIBLE;
 	ticks = schedule_timeout(ticks);
 
 	if (remain) {
-		tmp.tv_sec = ticks / HZ;
-		tmp.tv_usec = ticks % HZ;
+		jiffies_to_timeval(ticks, &tmp);
 		if (put_tv32(remain, &tmp))
 			goto fault;
 	}
diff --git a/arch/arm/mach-s3c2410/clock.c b/arch/arm/mach-s3c2410/clock.c
index e23f534d4e1..8d986b8401c 100644
--- a/arch/arm/mach-s3c2410/clock.c
+++ b/arch/arm/mach-s3c2410/clock.c
@@ -478,7 +478,7 @@ static int s3c2440_clk_add(struct sys_device *sysdev)
 {
 	unsigned long upllcon = __raw_readl(S3C2410_UPLLCON);
 
-	s3c2440_clk_upll.rate = s3c2410_get_pll(upllcon, clk_xtal.rate) * 2;
+	s3c2440_clk_upll.rate = s3c2410_get_pll(upllcon, clk_xtal.rate);
 
 	printk("S3C2440: Clock Support, UPLL %ld.%03ld MHz\n",
 	       print_mhz(s3c2440_clk_upll.rate));
diff --git a/arch/arm/mach-s3c2410/s3c2440.c b/arch/arm/mach-s3c2410/s3c2440.c
index 9a8cc5ae225..d4c8281b55f 100644
--- a/arch/arm/mach-s3c2410/s3c2440.c
+++ b/arch/arm/mach-s3c2410/s3c2440.c
@@ -192,9 +192,11 @@ void __init s3c2440_map_io(struct map_desc *mach_desc, int size)
 
 	iotable_init(s3c2440_iodesc, ARRAY_SIZE(s3c2440_iodesc));
 	iotable_init(mach_desc, size);
+
 	/* rename any peripherals used differing from the s3c2410 */
 
-	s3c_device_i2c.name = "s3c2440-i2c";
+	s3c_device_i2c.name  = "s3c2440-i2c";
+	s3c_device_nand.name = "s3c2440-nand";
 
 	/* change irq for watchdog */
 
@@ -225,7 +227,7 @@ void __init s3c2440_init_clocks(int xtal)
 		break;
 
 	case S3C2440_CLKDIVN_HDIVN_2:
-		hdiv = 1;
+		hdiv = 2;
 		break;
 
 	case S3C2440_CLKDIVN_HDIVN_4_8:
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index c4fc6be629d..48bac7da8c7 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -412,21 +412,20 @@ config CPU_BPREDICT_DISABLE
 
 config TLS_REG_EMUL
 	bool
-	default y if (SMP || CPU_32v6) && (CPU_32v5 || CPU_32v4 || CPU_32v3)
+	default y if SMP && (CPU_32v5 || CPU_32v4 || CPU_32v3)
 	help
-	  We might be running on an ARMv6+ processor which should have the TLS
-	  register but for some reason we can't use it, or maybe an SMP system
-	  using a pre-ARMv6 processor (there are apparently a few prototypes
-	  like that in existence) and therefore access to that register must
-	  be emulated.
+	  An SMP system using a pre-ARMv6 processor (there are apparently
+	  a few prototypes like that in existence) and therefore access to
+	  that required register must be emulated.
 
 config HAS_TLS_REG
 	bool
-	depends on CPU_32v6
-	default y if !TLS_REG_EMUL
+	depends on !TLS_REG_EMUL
+	default y if SMP || CPU_32v7
 	help
 	  This selects support for the CP15 thread register.
-	  It is defined to be available on ARMv6 or later.  If a particular
-	  ARMv6 or later CPU doesn't support it then it must omc;ide "select
-	  TLS_REG_EMUL" along with its other caracteristics.
+	  It is defined to be available on some ARMv6 processors (including
+	  all SMP capable ARMv6's) or later processors.  User space may
+	  assume directly accessing that register and always obtain the
+	  expected value only on ARMv7 and above.
 
diff --git a/arch/arm/mm/copypage-v4mc.S b/arch/arm/mm/copypage-v4mc.S
deleted file mode 100644
index 305af3dab3d..00000000000
--- a/arch/arm/mm/copypage-v4mc.S
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- *  linux/arch/arm/lib/copy_page-armv4mc.S
- *
- *  Copyright (C) 1995-2001 Russell King
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- *  ASM optimised string functions
- */
-#include <linux/linkage.h>
-#include <linux/init.h>
-#include <asm/constants.h>
-
-	.text
-	.align	5
-/*
- * ARMv4 mini-dcache optimised copy_user_page
- *
- * We flush the destination cache lines just before we write the data into the
- * corresponding address.  Since the Dcache is read-allocate, this removes the
- * Dcache aliasing issue.  The writes will be forwarded to the write buffer,
- * and merged as appropriate.
- *
- * Note: We rely on all ARMv4 processors implementing the "invalidate D line"
- * instruction.  If your processor does not supply this, you have to write your
- * own copy_user_page that does the right thing.
- */
-ENTRY(v4_mc_copy_user_page)
-	stmfd	sp!, {r4, lr}			@ 2
-	mov	r4, r0
-	mov	r0, r1
-	bl	map_page_minicache
-	mov	r1, #PAGE_SZ/64			@ 1
-	ldmia	r0!, {r2, r3, ip, lr}		@ 4
-1:	mcr	p15, 0, r4, c7, c6, 1		@ 1   invalidate D line
-	stmia	r4!, {r2, r3, ip, lr}		@ 4
-	ldmia	r0!, {r2, r3, ip, lr}		@ 4+1
-	stmia	r4!, {r2, r3, ip, lr}		@ 4
-	ldmia	r0!, {r2, r3, ip, lr}		@ 4
-	mcr	p15, 0, r4, c7, c6, 1		@ 1   invalidate D line
-	stmia	r4!, {r2, r3, ip, lr}		@ 4
-	ldmia	r0!, {r2, r3, ip, lr}		@ 4
-	subs	r1, r1, #1			@ 1
-	stmia	r4!, {r2, r3, ip, lr}		@ 4
-	ldmneia	r0!, {r2, r3, ip, lr}		@ 4
-	bne	1b				@ 1
-	ldmfd	sp!, {r4, pc}			@ 3
-
-	.align	5
-/*
- * ARMv4 optimised clear_user_page
- *
- * Same story as above.
- */
-ENTRY(v4_mc_clear_user_page)
-	str	lr, [sp, #-4]!
-	mov	r1, #PAGE_SZ/64			@ 1
-	mov	r2, #0				@ 1
-	mov	r3, #0				@ 1
-	mov	ip, #0				@ 1
-	mov	lr, #0				@ 1
-1:	mcr	p15, 0, r0, c7, c6, 1		@ 1   invalidate D line
-	stmia	r0!, {r2, r3, ip, lr}		@ 4
-	stmia	r0!, {r2, r3, ip, lr}		@ 4
-	mcr	p15, 0, r0, c7, c6, 1		@ 1   invalidate D line
-	stmia	r0!, {r2, r3, ip, lr}		@ 4
-	stmia	r0!, {r2, r3, ip, lr}		@ 4
-	subs	r1, r1, #1			@ 1
-	bne	1b				@ 1
-	ldr	pc, [sp], #4
-
-	__INITDATA
-
-	.type	v4_mc_user_fns, #object
-ENTRY(v4_mc_user_fns)
-	.long	v4_mc_clear_user_page
-	.long	v4_mc_copy_user_page
-	.size	v4_mc_user_fns, . - v4_mc_user_fns
diff --git a/arch/arm/mm/copypage-v4mc.c b/arch/arm/mm/copypage-v4mc.c
new file mode 100644
index 00000000000..fc69dccdace
--- /dev/null
+++ b/arch/arm/mm/copypage-v4mc.c
@@ -0,0 +1,111 @@
+/*
+ *  linux/arch/arm/lib/copypage-armv4mc.S
+ *
+ *  Copyright (C) 1995-2005 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This handles the mini data cache, as found on SA11x0 and XScale
+ * processors.  When we copy a user page page, we map it in such a way
+ * that accesses to this page will not touch the main data cache, but
+ * will be cached in the mini data cache.  This prevents us thrashing
+ * the main data cache on page faults.
+ */
+#include <linux/init.h>
+#include <linux/mm.h>
+
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+
+/*
+ * 0xffff8000 to 0xffffffff is reserved for any ARM architecture
+ * specific hacks for copying pages efficiently.
+ */
+#define minicache_pgprot __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | \
+				  L_PTE_CACHEABLE)
+
+#define TOP_PTE(x)	pte_offset_kernel(top_pmd, x)
+
+static DEFINE_SPINLOCK(minicache_lock);
+
+/*
+ * ARMv4 mini-dcache optimised copy_user_page
+ *
+ * We flush the destination cache lines just before we write the data into the
+ * corresponding address.  Since the Dcache is read-allocate, this removes the
+ * Dcache aliasing issue.  The writes will be forwarded to the write buffer,
+ * and merged as appropriate.
+ *
+ * Note: We rely on all ARMv4 processors implementing the "invalidate D line"
+ * instruction.  If your processor does not supply this, you have to write your
+ * own copy_user_page that does the right thing.
+ */
+static void __attribute__((naked))
+mc_copy_user_page(void *from, void *to)
+{
+	asm volatile(
+	"stmfd	sp!, {r4, lr}			@ 2\n\
+	mov	r4, %2				@ 1\n\
+	ldmia	%0!, {r2, r3, ip, lr}		@ 4\n\
+1:	mcr	p15, 0, %1, c7, c6, 1		@ 1   invalidate D line\n\
+	stmia	%1!, {r2, r3, ip, lr}		@ 4\n\
+	ldmia	%0!, {r2, r3, ip, lr}		@ 4+1\n\
+	stmia	%1!, {r2, r3, ip, lr}		@ 4\n\
+	ldmia	%0!, {r2, r3, ip, lr}		@ 4\n\
+	mcr	p15, 0, %1, c7, c6, 1		@ 1   invalidate D line\n\
+	stmia	%1!, {r2, r3, ip, lr}		@ 4\n\
+	ldmia	%0!, {r2, r3, ip, lr}		@ 4\n\
+	subs	r4, r4, #1			@ 1\n\
+	stmia	%1!, {r2, r3, ip, lr}		@ 4\n\
+	ldmneia	%0!, {r2, r3, ip, lr}		@ 4\n\
+	bne	1b				@ 1\n\
+	ldmfd	sp!, {r4, pc}			@ 3"
+	:
+	: "r" (from), "r" (to), "I" (PAGE_SIZE / 64));
+}
+
+void v4_mc_copy_user_page(void *kto, const void *kfrom, unsigned long vaddr)
+{
+	spin_lock(&minicache_lock);
+
+	set_pte(TOP_PTE(0xffff8000), pfn_pte(__pa(kfrom) >> PAGE_SHIFT, minicache_pgprot));
+	flush_tlb_kernel_page(0xffff8000);
+
+	mc_copy_user_page((void *)0xffff8000, kto);
+
+	spin_unlock(&minicache_lock);
+}
+
+/*
+ * ARMv4 optimised clear_user_page
+ */
+void __attribute__((naked))
+v4_mc_clear_user_page(void *kaddr, unsigned long vaddr)
+{
+	asm volatile(
+	"str	lr, [sp, #-4]!\n\
+	mov	r1, %0				@ 1\n\
+	mov	r2, #0				@ 1\n\
+	mov	r3, #0				@ 1\n\
+	mov	ip, #0				@ 1\n\
+	mov	lr, #0				@ 1\n\
+1:	mcr	p15, 0, r0, c7, c6, 1		@ 1   invalidate D line\n\
+	stmia	r0!, {r2, r3, ip, lr}		@ 4\n\
+	stmia	r0!, {r2, r3, ip, lr}		@ 4\n\
+	mcr	p15, 0, r0, c7, c6, 1		@ 1   invalidate D line\n\
+	stmia	r0!, {r2, r3, ip, lr}		@ 4\n\
+	stmia	r0!, {r2, r3, ip, lr}		@ 4\n\
+	subs	r1, r1, #1			@ 1\n\
+	bne	1b				@ 1\n\
+	ldr	pc, [sp], #4"
+	:
+	: "I" (PAGE_SIZE / 64));
+}
+
+struct cpu_user_fns v4_mc_user_fns __initdata = {
+	.cpu_clear_user_page	= v4_mc_clear_user_page, 
+	.cpu_copy_user_page	= v4_mc_copy_user_page,
+};
diff --git a/arch/arm/mm/copypage-v6.c b/arch/arm/mm/copypage-v6.c
index 694ac820885..a8c00236bd3 100644
--- a/arch/arm/mm/copypage-v6.c
+++ b/arch/arm/mm/copypage-v6.c
@@ -26,8 +26,8 @@
 #define to_address	(0xffffc000)
 #define to_pgprot	PAGE_KERNEL
 
-static pte_t *from_pte;
-static pte_t *to_pte;
+#define TOP_PTE(x)	pte_offset_kernel(top_pmd, x)
+
 static DEFINE_SPINLOCK(v6_lock);
 
 #define DCACHE_COLOUR(vaddr) ((vaddr & (SHMLBA - 1)) >> PAGE_SHIFT)
@@ -74,8 +74,8 @@ void v6_copy_user_page_aliasing(void *kto, const void *kfrom, unsigned long vadd
 	 */
 	spin_lock(&v6_lock);
 
-	set_pte(from_pte + offset, pfn_pte(__pa(kfrom) >> PAGE_SHIFT, from_pgprot));
-	set_pte(to_pte + offset, pfn_pte(__pa(kto) >> PAGE_SHIFT, to_pgprot));
+	set_pte(TOP_PTE(from_address) + offset, pfn_pte(__pa(kfrom) >> PAGE_SHIFT, from_pgprot));
+	set_pte(TOP_PTE(to_address) + offset, pfn_pte(__pa(kto) >> PAGE_SHIFT, to_pgprot));
 
 	from = from_address + (offset << PAGE_SHIFT);
 	to   = to_address + (offset << PAGE_SHIFT);
@@ -114,7 +114,7 @@ void v6_clear_user_page_aliasing(void *kaddr, unsigned long vaddr)
 	 */
 	spin_lock(&v6_lock);
 
-	set_pte(to_pte + offset, pfn_pte(__pa(kaddr) >> PAGE_SHIFT, to_pgprot));
+	set_pte(TOP_PTE(to_address) + offset, pfn_pte(__pa(kaddr) >> PAGE_SHIFT, to_pgprot));
 	flush_tlb_kernel_page(to);
 	clear_page((void *)to);
 
@@ -129,21 +129,6 @@ struct cpu_user_fns v6_user_fns __initdata = {
 static int __init v6_userpage_init(void)
 {
 	if (cache_is_vipt_aliasing()) {
-		pgd_t *pgd;
-		pmd_t *pmd;
-
-		pgd = pgd_offset_k(from_address);
-		pmd = pmd_alloc(&init_mm, pgd, from_address);
-		if (!pmd)
-			BUG();
-		from_pte = pte_alloc_kernel(&init_mm, pmd, from_address);
-		if (!from_pte)
-			BUG();
-
-		to_pte = pte_alloc_kernel(&init_mm, pmd, to_address);
-		if (!to_pte)
-			BUG();
-
 		cpu_user.cpu_clear_user_page = v6_clear_user_page_aliasing;
 		cpu_user.cpu_copy_user_page = v6_copy_user_page_aliasing;
 	}
@@ -151,5 +136,4 @@ static int __init v6_userpage_init(void)
 	return 0;
 }
 
-__initcall(v6_userpage_init);
-
+core_initcall(v6_userpage_init);
diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c
index c6de48d8950..4085ed983e4 100644
--- a/arch/arm/mm/flush.c
+++ b/arch/arm/mm/flush.c
@@ -13,6 +13,29 @@
 
 #include <asm/cacheflush.h>
 #include <asm/system.h>
+#include <asm/tlbflush.h>
+
+#ifdef CONFIG_CPU_CACHE_VIPT
+#define ALIAS_FLUSH_START	0xffff4000
+
+#define TOP_PTE(x)	pte_offset_kernel(top_pmd, x)
+
+static void flush_pfn_alias(unsigned long pfn, unsigned long vaddr)
+{
+	unsigned long to = ALIAS_FLUSH_START + (CACHE_COLOUR(vaddr) << PAGE_SHIFT);
+
+	set_pte(TOP_PTE(to), pfn_pte(pfn, PAGE_KERNEL));
+	flush_tlb_kernel_page(to);
+
+	asm(	"mcrr	p15, 0, %1, %0, c14\n"
+	"	mcrr	p15, 0, %1, %0, c5\n"
+	    :
+	    : "r" (to), "r" (to + PAGE_SIZE - L1_CACHE_BYTES)
+	    : "cc");
+}
+#else
+#define flush_pfn_alias(pfn,vaddr)	do { } while (0)
+#endif
 
 static void __flush_dcache_page(struct address_space *mapping, struct page *page)
 {
@@ -37,6 +60,18 @@ static void __flush_dcache_page(struct address_space *mapping, struct page *page
 		return;
 
 	/*
+	 * This is a page cache page.  If we have a VIPT cache, we
+	 * only need to do one flush - which would be at the relevant
+	 * userspace colour, which is congruent with page->index.
+	 */
+	if (cache_is_vipt()) {
+		if (cache_is_vipt_aliasing())
+			flush_pfn_alias(page_to_pfn(page),
+					page->index << PAGE_CACHE_SHIFT);
+		return;
+	}
+
+	/*
 	 * There are possible user space mappings of this page:
 	 * - VIVT cache: we need to also write back and invalidate all user
 	 *   data in the current VM view associated with this page.
@@ -57,8 +92,6 @@ static void __flush_dcache_page(struct address_space *mapping, struct page *page
 			continue;
 		offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT;
 		flush_cache_page(mpnt, mpnt->vm_start + offset, page_to_pfn(page));
-		if (cache_is_vipt())
-			break;
 	}
 	flush_dcache_mmap_unlock(mapping);
 }
diff --git a/arch/arm/mm/mm-armv.c b/arch/arm/mm/mm-armv.c
index 585dfb8e20b..2c2b93d77d4 100644
--- a/arch/arm/mm/mm-armv.c
+++ b/arch/arm/mm/mm-armv.c
@@ -37,6 +37,8 @@ pgprot_t pgprot_kernel;
 
 EXPORT_SYMBOL(pgprot_kernel);
 
+pmd_t *top_pmd;
+
 struct cachepolicy {
 	const char	policy[16];
 	unsigned int	cr_mask;
@@ -142,6 +144,16 @@ __setup("noalign", noalign_setup);
 
 #define FIRST_KERNEL_PGD_NR	(FIRST_USER_PGD_NR + USER_PTRS_PER_PGD)
 
+static inline pmd_t *pmd_off(pgd_t *pgd, unsigned long virt)
+{
+	return pmd_offset(pgd, virt);
+}
+
+static inline pmd_t *pmd_off_k(unsigned long virt)
+{
+	return pmd_off(pgd_offset_k(virt), virt);
+}
+
 /*
  * need to get a 16k page for level 1
  */
@@ -220,7 +232,7 @@ void free_pgd_slow(pgd_t *pgd)
 		return;
 
 	/* pgd is always present and good */
-	pmd = (pmd_t *)pgd;
+	pmd = pmd_off(pgd, 0);
 	if (pmd_none(*pmd))
 		goto free;
 	if (pmd_bad(*pmd)) {
@@ -246,9 +258,8 @@ free:
 static inline void
 alloc_init_section(unsigned long virt, unsigned long phys, int prot)
 {
-	pmd_t *pmdp;
+	pmd_t *pmdp = pmd_off_k(virt);
 
-	pmdp = pmd_offset(pgd_offset_k(virt), virt);
 	if (virt & (1 << 20))
 		pmdp++;
 
@@ -283,11 +294,9 @@ alloc_init_supersection(unsigned long virt, unsigned long phys, int prot)
 static inline void
 alloc_init_page(unsigned long virt, unsigned long phys, unsigned int prot_l1, pgprot_t prot)
 {
-	pmd_t *pmdp;
+	pmd_t *pmdp = pmd_off_k(virt);
 	pte_t *ptep;
 
-	pmdp = pmd_offset(pgd_offset_k(virt), virt);
-
 	if (pmd_none(*pmdp)) {
 		unsigned long pmdval;
 		ptep = alloc_bootmem_low_pages(2 * PTRS_PER_PTE *
@@ -310,7 +319,7 @@ alloc_init_page(unsigned long virt, unsigned long phys, unsigned int prot_l1, pg
  */
 static inline void clear_mapping(unsigned long virt)
 {
-	pmd_clear(pmd_offset(pgd_offset_k(virt), virt));
+	pmd_clear(pmd_off_k(virt));
 }
 
 struct mem_types {
@@ -578,7 +587,7 @@ void setup_mm_for_reboot(char mode)
 			 PMD_TYPE_SECT;
 		if (cpu_arch <= CPU_ARCH_ARMv5)
 			pmdval |= PMD_BIT4;
-		pmd = pmd_offset(pgd + i, i << PGDIR_SHIFT);
+		pmd = pmd_off(pgd, i << PGDIR_SHIFT);
 		pmd[0] = __pmd(pmdval);
 		pmd[1] = __pmd(pmdval + (1 << (PGDIR_SHIFT - 1)));
 		flush_pmd_entry(pmd);
@@ -675,6 +684,8 @@ void __init memtable_init(struct meminfo *mi)
 
 	flush_cache_all();
 	flush_tlb_all();
+
+	top_pmd = pmd_off_k(0xffff0000);
 }
 
 /*
diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c
index 16dbc4151be..fa34a06c0d7 100644
--- a/arch/i386/kernel/cpu/amd.c
+++ b/arch/i386/kernel/cpu/amd.c
@@ -24,9 +24,6 @@ __asm__(".align 4\nvide: ret");
 
 static void __init init_amd(struct cpuinfo_x86 *c)
 {
-#ifdef CONFIG_X86_SMP
-	int cpu = c == &boot_cpu_data ? 0 : c - cpu_data;
-#endif
 	u32 l, h;
 	int mbytes = num_physpages >> (20-PAGE_SHIFT);
 	int r;
@@ -205,7 +202,9 @@ static void __init init_amd(struct cpuinfo_x86 *c)
 	 * of two.
 	 */
 	if (c->x86_num_cores > 1) {
-		cpu_core_id[cpu] = cpu >> hweight32(c->x86_num_cores - 1);
+		int cpu = smp_processor_id();
+		/* Fix up the APIC ID following AMD specifications. */
+		cpu_core_id[cpu] >>= hweight32(c->x86_num_cores - 1);
 		printk(KERN_INFO "CPU %d(%d) -> Core %d\n",
 		       cpu, c->x86_num_cores, cpu_core_id[cpu]);
 	}
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index 6be0310e3cd..11e6e6f23fa 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -243,6 +243,13 @@ static void __init early_cpu_detect(void)
 	}
 
 	early_intel_workaround(c);
+
+#ifdef CONFIG_SMP
+#ifdef CONFIG_X86_HT
+	phys_proc_id[smp_processor_id()] =
+#endif
+	cpu_core_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff;
+#endif
 }
 
 void __init generic_identify(struct cpuinfo_x86 * c)
diff --git a/arch/i386/pci/fixup.c b/arch/i386/pci/fixup.c
index be52c5ac4e0..8e8e895e1b5 100644
--- a/arch/i386/pci/fixup.c
+++ b/arch/i386/pci/fixup.c
@@ -253,7 +253,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE2, pci
 #define MAX_PCIEROOT	6
 static int quirk_aspm_offset[MAX_PCIEROOT << 3];
 
-#define GET_INDEX(a, b) (((a - PCI_DEVICE_ID_INTEL_MCH_PA) << 3) + b)
+#define GET_INDEX(a, b) ((((a) - PCI_DEVICE_ID_INTEL_MCH_PA) << 3) + ((b) & 7))
 
 static int quirk_pcie_aspm_read(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *value)
 {
diff --git a/arch/ia64/ia32/ia32_ioctl.c b/arch/ia64/ia32/ia32_ioctl.c
index 9845dabe261..164b211f417 100644
--- a/arch/ia64/ia32/ia32_ioctl.c
+++ b/arch/ia64/ia32/ia32_ioctl.c
@@ -13,7 +13,6 @@
   
 #define	INCLUDES
 #include "compat_ioctl.c"
-#include <asm/ioctl32.h>
 
 #define IOCTL_NR(a)	((a) & ~(_IOC_SIZEMASK << _IOC_SIZESHIFT))
 
diff --git a/arch/mips/vr41xx/common/pmu.c b/arch/mips/vr41xx/common/pmu.c
index c5f1043de93..53166f3598b 100644
--- a/arch/mips/vr41xx/common/pmu.c
+++ b/arch/mips/vr41xx/common/pmu.c
@@ -1,7 +1,7 @@
 /*
  *  pmu.c, Power Management Unit routines for NEC VR4100 series.
  *
- *  Copyright (C) 2003-2004  Yoichi Yuasa <yuasa@hh.iij4u.or.jp>
+ *  Copyright (C) 2003-2005  Yoichi Yuasa <yuasa@hh.iij4u.or.jp>
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -17,7 +17,9 @@
  *  along with this program; if not, write to the Free Software
  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
+#include <linux/errno.h>
 #include <linux/init.h>
+#include <linux/ioport.h>
 #include <linux/kernel.h>
 #include <linux/smp.h>
 #include <linux/types.h>
@@ -27,20 +29,31 @@
 #include <asm/reboot.h>
 #include <asm/system.h>
 
-#define PMUCNT2REG	KSEG1ADDR(0x0f0000c6)
+#define PMU_TYPE1_BASE	0x0b0000a0UL
+#define PMU_TYPE1_SIZE	0x0eUL
+
+#define PMU_TYPE2_BASE	0x0f0000c0UL
+#define PMU_TYPE2_SIZE	0x10UL
+
+#define PMUCNT2REG	0x06
  #define SOFTRST	0x0010
 
+static void __iomem *pmu_base;
+
+#define pmu_read(offset)		readw(pmu_base + (offset))
+#define pmu_write(offset, value)	writew((value), pmu_base + (offset))
+
 static inline void software_reset(void)
 {
-	uint16_t val;
+	uint16_t pmucnt2;
 
 	switch (current_cpu_data.cputype) {
 	case CPU_VR4122:
 	case CPU_VR4131:
 	case CPU_VR4133:
-		val = readw(PMUCNT2REG);
-		val |= SOFTRST;
-		writew(val, PMUCNT2REG);
+		pmucnt2 = pmu_read(PMUCNT2REG);
+		pmucnt2 |= SOFTRST;
+		pmu_write(PMUCNT2REG, pmucnt2);
 		break;
 	default:
 		break;
@@ -71,6 +84,34 @@ static void vr41xx_power_off(void)
 
 static int __init vr41xx_pmu_init(void)
 {
+	unsigned long start, size;
+
+	switch (current_cpu_data.cputype) {
+	case CPU_VR4111:
+	case CPU_VR4121:
+		start = PMU_TYPE1_BASE;
+		size = PMU_TYPE1_SIZE;
+		break;
+	case CPU_VR4122:
+	case CPU_VR4131:
+	case CPU_VR4133:
+		start = PMU_TYPE2_BASE;
+		size = PMU_TYPE2_SIZE;
+		break;
+	default:
+		printk("Unexpected CPU of NEC VR4100 series\n");
+		return -ENODEV;
+	}
+
+	if (request_mem_region(start, size, "PMU") == NULL)
+		return -EBUSY;
+
+	pmu_base = ioremap(start, size);
+	if (pmu_base == NULL) {
+		release_mem_region(start, size);
+		return -EBUSY;
+	}
+
 	_machine_restart = vr41xx_restart;
 	_machine_halt = vr41xx_halt;
 	_machine_power_off = vr41xx_power_off;
@@ -78,4 +119,4 @@ static int __init vr41xx_pmu_init(void)
 	return 0;
 }
 
-early_initcall(vr41xx_pmu_init);
+core_initcall(vr41xx_pmu_init);
diff --git a/arch/ppc/kernel/setup.c b/arch/ppc/kernel/setup.c
index 5dfb42f1a15..309797d7f96 100644
--- a/arch/ppc/kernel/setup.c
+++ b/arch/ppc/kernel/setup.c
@@ -753,6 +753,8 @@ void __init setup_arch(char **cmdline_p)
 	strlcpy(saved_command_line, cmd_line, COMMAND_LINE_SIZE);
 	*cmdline_p = cmd_line;
 
+	parse_early_param();
+
 	/* set up the bootmem stuff with available memory */
 	do_init_bootmem();
 	if ( ppc_md.progress ) ppc_md.progress("setup_arch: bootmem", 0x3eab);
diff --git a/arch/um/kernel/irq_user.c b/arch/um/kernel/irq_user.c
index 6d6f9484b88..b3074cbaa47 100644
--- a/arch/um/kernel/irq_user.c
+++ b/arch/um/kernel/irq_user.c
@@ -236,9 +236,15 @@ static void free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg)
 				       (*prev)->fd, pollfds[i].fd);
 				goto out;
 			}
-			memcpy(&pollfds[i], &pollfds[i + 1],
-			       (pollfds_num - i - 1) * sizeof(pollfds[0]));
+
 			pollfds_num--;
+
+			/* This moves the *whole* array after pollfds[i] (though
+			 * it doesn't spot as such)! */
+
+			memmove(&pollfds[i], &pollfds[i + 1],
+			       (pollfds_num - i) * sizeof(pollfds[0]));
+
 			if(last_irq_ptr == &old_fd->next) 
 				last_irq_ptr = prev;
 			*prev = (*prev)->next;
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 44ee7f6acf7..82cb2a3f127 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -303,6 +303,20 @@ config HPET_TIMER
 	  as it is off-chip.  You can find the HPET spec at
 	  <http://www.intel.com/labs/platcomp/hpet/hpetspec.htm>.
 
+config X86_PM_TIMER
+	bool "PM timer"
+	default y
+	help
+	  Support the ACPI PM timer for time keeping. This is slow,
+	  but is useful on some chipsets without HPET on systems with more
+	  than one CPU. On a single processor or single socket multi core
+	  system it is normally not required.
+	  When the PM timer is active 64bit vsyscalls are disabled
+	  and should not be enabled (/proc/sys/kernel/vsyscall64 should
+	  not be changed).
+	  The kernel selects the PM timer only as a last resort, so it is
+	  useful to enable just in case.
+
 config HPET_EMULATE_RTC
 	bool "Provide RTC interrupt"
 	depends on HPET_TIMER && RTC=y
diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig
index 9ce51dee30b..569595b74c7 100644
--- a/arch/x86_64/defconfig
+++ b/arch/x86_64/defconfig
@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.11-bk7
-# Sat Mar 12 23:43:44 2005
+# Linux kernel version: 2.6.12-rc4
+# Fri May 13 06:39:11 2005
 #
 CONFIG_X86_64=y
 CONFIG_64BIT=y
@@ -11,8 +11,6 @@ CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_CALIBRATE_DELAY=y
 CONFIG_X86_CMPXCHG=y
 CONFIG_EARLY_PRINTK=y
-CONFIG_HPET_TIMER=y
-CONFIG_HPET_EMULATE_RTC=y
 CONFIG_GENERIC_ISA_DMA=y
 CONFIG_GENERIC_IOMAP=y
 
@@ -22,6 +20,7 @@ CONFIG_GENERIC_IOMAP=y
 CONFIG_EXPERIMENTAL=y
 CONFIG_CLEAN_COMPILE=y
 CONFIG_LOCK_KERNEL=y
+CONFIG_INIT_ENV_ARG_LIMIT=32
 
 #
 # General setup
@@ -33,7 +32,6 @@ CONFIG_POSIX_MQUEUE=y
 # CONFIG_BSD_PROCESS_ACCT is not set
 CONFIG_SYSCTL=y
 # CONFIG_AUDIT is not set
-CONFIG_LOG_BUF_SHIFT=18
 # CONFIG_HOTPLUG is not set
 CONFIG_KOBJECT_UEVENT=y
 CONFIG_IKCONFIG=y
@@ -43,10 +41,11 @@ CONFIG_IKCONFIG_PROC=y
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_ALL=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+CONFIG_PRINTK=y
+CONFIG_BUG=y
 CONFIG_BASE_FULL=y
 CONFIG_FUTEX=y
 CONFIG_EPOLL=y
-# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
 CONFIG_SHMEM=y
 CONFIG_CC_ALIGN_FUNCTIONS=0
 CONFIG_CC_ALIGN_LABELS=0
@@ -93,6 +92,9 @@ CONFIG_DISCONTIGMEM=y
 CONFIG_NUMA=y
 CONFIG_HAVE_DEC_LOCK=y
 CONFIG_NR_CPUS=8
+CONFIG_HPET_TIMER=y
+CONFIG_X86_PM_TIMER=y
+CONFIG_HPET_EMULATE_RTC=y
 CONFIG_GART_IOMMU=y
 CONFIG_SWIOTLB=y
 CONFIG_X86_MCE=y
@@ -100,6 +102,7 @@ CONFIG_X86_MCE_INTEL=y
 CONFIG_SECCOMP=y
 CONFIG_GENERIC_HARDIRQS=y
 CONFIG_GENERIC_IRQ_PROBE=y
+CONFIG_ISA_DMA_API=y
 
 #
 # Power management options
@@ -129,7 +132,7 @@ CONFIG_ACPI_NUMA=y
 # CONFIG_ACPI_IBM is not set
 CONFIG_ACPI_TOSHIBA=y
 CONFIG_ACPI_BLACKLIST_YEAR=2001
-CONFIG_ACPI_DEBUG=y
+# CONFIG_ACPI_DEBUG is not set
 CONFIG_ACPI_BUS=y
 CONFIG_ACPI_EC=y
 CONFIG_ACPI_POWER=y
@@ -141,6 +144,7 @@ CONFIG_ACPI_SYSTEM=y
 # CPU Frequency scaling
 #
 CONFIG_CPU_FREQ=y
+CONFIG_CPU_FREQ_TABLE=y
 # CONFIG_CPU_FREQ_DEBUG is not set
 CONFIG_CPU_FREQ_STAT=y
 # CONFIG_CPU_FREQ_STAT_DETAILS is not set
@@ -150,7 +154,6 @@ CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
 # CONFIG_CPU_FREQ_GOV_POWERSAVE is not set
 CONFIG_CPU_FREQ_GOV_USERSPACE=y
 CONFIG_CPU_FREQ_GOV_ONDEMAND=y
-CONFIG_CPU_FREQ_TABLE=y
 
 #
 # CPUFreq processor drivers
@@ -164,6 +167,7 @@ CONFIG_X86_ACPI_CPUFREQ=y
 # shared options
 #
 CONFIG_X86_ACPI_CPUFREQ_PROC_INTF=y
+# CONFIG_X86_SPEEDSTEP_LIB is not set
 
 #
 # Bus options (PCI etc.)
@@ -172,9 +176,11 @@ CONFIG_PCI=y
 CONFIG_PCI_DIRECT=y
 CONFIG_PCI_MMCONFIG=y
 CONFIG_UNORDERED_IO=y
+# CONFIG_PCIEPORTBUS is not set
 CONFIG_PCI_MSI=y
 # CONFIG_PCI_LEGACY_PROC is not set
 # CONFIG_PCI_NAMES is not set
+# CONFIG_PCI_DEBUG is not set
 
 #
 # PCCARD (PCMCIA/CardBus) support
@@ -182,10 +188,6 @@ CONFIG_PCI_MSI=y
 # CONFIG_PCCARD is not set
 
 #
-# PC-card bridges
-#
-
-#
 # PCI Hotplug Support
 #
 # CONFIG_HOTPLUG_PCI is not set
@@ -254,7 +256,7 @@ CONFIG_LBD=y
 # IO Schedulers
 #
 CONFIG_IOSCHED_NOOP=y
-CONFIG_IOSCHED_AS=y
+# CONFIG_IOSCHED_AS is not set
 CONFIG_IOSCHED_DEADLINE=y
 CONFIG_IOSCHED_CFQ=y
 # CONFIG_ATA_OVER_ETH is not set
@@ -308,7 +310,8 @@ CONFIG_BLK_DEV_AMD74XX=y
 CONFIG_BLK_DEV_PIIX=y
 # CONFIG_BLK_DEV_NS87415 is not set
 # CONFIG_BLK_DEV_PDC202XX_OLD is not set
-# CONFIG_BLK_DEV_PDC202XX_NEW is not set
+CONFIG_BLK_DEV_PDC202XX_NEW=y
+# CONFIG_PDC202XX_FORCE is not set
 # CONFIG_BLK_DEV_SVWKS is not set
 # CONFIG_BLK_DEV_SIIMAGE is not set
 # CONFIG_BLK_DEV_SIS5513 is not set
@@ -353,7 +356,7 @@ CONFIG_BLK_DEV_SD=y
 #
 # SCSI low-level drivers
 #
-CONFIG_BLK_DEV_3W_XXXX_RAID=y
+# CONFIG_BLK_DEV_3W_XXXX_RAID is not set
 # CONFIG_SCSI_3W_9XXX is not set
 # CONFIG_SCSI_ACARD is not set
 # CONFIG_SCSI_AACRAID is not set
@@ -384,7 +387,6 @@ CONFIG_SCSI_SATA_VIA=y
 # CONFIG_SCSI_BUSLOGIC is not set
 # CONFIG_SCSI_DMX3191D is not set
 # CONFIG_SCSI_EATA is not set
-# CONFIG_SCSI_EATA_PIO is not set
 # CONFIG_SCSI_FUTURE_DOMAIN is not set
 # CONFIG_SCSI_GDTH is not set
 # CONFIG_SCSI_IPS is not set
@@ -392,7 +394,6 @@ CONFIG_SCSI_SATA_VIA=y
 # CONFIG_SCSI_INIA100 is not set
 # CONFIG_SCSI_SYM53C8XX_2 is not set
 # CONFIG_SCSI_IPR is not set
-# CONFIG_SCSI_QLOGIC_ISP is not set
 # CONFIG_SCSI_QLOGIC_FC is not set
 # CONFIG_SCSI_QLOGIC_1280 is not set
 CONFIG_SCSI_QLA2XXX=y
@@ -401,6 +402,7 @@ CONFIG_SCSI_QLA2XXX=y
 # CONFIG_SCSI_QLA2300 is not set
 # CONFIG_SCSI_QLA2322 is not set
 # CONFIG_SCSI_QLA6312 is not set
+# CONFIG_SCSI_LPFC is not set
 # CONFIG_SCSI_DC395x is not set
 # CONFIG_SCSI_DC390T is not set
 # CONFIG_SCSI_DEBUG is not set
@@ -437,7 +439,6 @@ CONFIG_NET=y
 #
 CONFIG_PACKET=y
 # CONFIG_PACKET_MMAP is not set
-# CONFIG_NETLINK_DEV is not set
 CONFIG_UNIX=y
 # CONFIG_NET_KEY is not set
 CONFIG_INET=y
@@ -502,7 +503,7 @@ CONFIG_NETDEVICES=y
 # CONFIG_DUMMY is not set
 # CONFIG_BONDING is not set
 # CONFIG_EQUALIZER is not set
-# CONFIG_TUN is not set
+CONFIG_TUN=y
 
 #
 # ARCnet devices
@@ -525,8 +526,7 @@ CONFIG_MII=y
 # CONFIG_HP100 is not set
 CONFIG_NET_PCI=y
 # CONFIG_PCNET32 is not set
-CONFIG_AMD8111_ETH=y
-# CONFIG_AMD8111E_NAPI is not set
+# CONFIG_AMD8111_ETH is not set
 # CONFIG_ADAPTEC_STARFIRE is not set
 # CONFIG_B44 is not set
 CONFIG_FORCEDETH=y
@@ -536,7 +536,7 @@ CONFIG_FORCEDETH=y
 # CONFIG_FEALNX is not set
 # CONFIG_NATSEMI is not set
 # CONFIG_NE2K_PCI is not set
-CONFIG_8139CP=m
+CONFIG_8139CP=y
 CONFIG_8139TOO=y
 # CONFIG_8139TOO_PIO is not set
 # CONFIG_8139TOO_TUNE_TWISTER is not set
@@ -671,6 +671,7 @@ CONFIG_SERIAL_8250_NR_UARTS=4
 #
 CONFIG_SERIAL_CORE=y
 CONFIG_SERIAL_CORE_CONSOLE=y
+# CONFIG_SERIAL_JSM is not set
 CONFIG_UNIX98_PTYS=y
 CONFIG_LEGACY_PTYS=y
 CONFIG_LEGACY_PTY_COUNT=256
@@ -696,6 +697,7 @@ CONFIG_RTC=y
 #
 CONFIG_AGP=y
 CONFIG_AGP_AMD64=y
+CONFIG_AGP_INTEL=y
 # CONFIG_DRM is not set
 # CONFIG_MWAVE is not set
 CONFIG_RAW_DRIVER=y
@@ -703,7 +705,7 @@ CONFIG_HPET=y
 # CONFIG_HPET_RTC_IRQ is not set
 CONFIG_HPET_MMAP=y
 CONFIG_MAX_RAW_DEVS=256
-CONFIG_HANGCHECK_TIMER=y
+# CONFIG_HANGCHECK_TIMER is not set
 
 #
 # TPM devices
@@ -786,6 +788,8 @@ CONFIG_SOUND_ICH=y
 #
 # USB support
 #
+CONFIG_USB_ARCH_HAS_HCD=y
+CONFIG_USB_ARCH_HAS_OHCI=y
 CONFIG_USB=y
 # CONFIG_USB_DEBUG is not set
 
@@ -797,8 +801,6 @@ CONFIG_USB_DEVICEFS=y
 # CONFIG_USB_DYNAMIC_MINORS is not set
 # CONFIG_USB_SUSPEND is not set
 # CONFIG_USB_OTG is not set
-CONFIG_USB_ARCH_HAS_HCD=y
-CONFIG_USB_ARCH_HAS_OHCI=y
 
 #
 # USB Host Controller Drivers
@@ -826,7 +828,6 @@ CONFIG_USB_PRINTER=y
 #
 CONFIG_USB_STORAGE=y
 # CONFIG_USB_STORAGE_DEBUG is not set
-# CONFIG_USB_STORAGE_RW_DETECT is not set
 # CONFIG_USB_STORAGE_DATAFAB is not set
 # CONFIG_USB_STORAGE_FREECOM is not set
 # CONFIG_USB_STORAGE_ISD200 is not set
@@ -965,7 +966,7 @@ CONFIG_AUTOFS_FS=y
 # CD-ROM/DVD Filesystems
 #
 CONFIG_ISO9660_FS=y
-# CONFIG_JOLIET is not set
+CONFIG_JOLIET=y
 # CONFIG_ZISOFS is not set
 # CONFIG_UDF_FS is not set
 
@@ -1092,9 +1093,10 @@ CONFIG_OPROFILE=y
 #
 # Kernel hacking
 #
+# CONFIG_PRINTK_TIME is not set
 CONFIG_DEBUG_KERNEL=y
 CONFIG_MAGIC_SYSRQ=y
-# CONFIG_PRINTK_TIME is not set
+CONFIG_LOG_BUF_SHIFT=18
 # CONFIG_SCHEDSTATS is not set
 # CONFIG_DEBUG_SLAB is not set
 # CONFIG_DEBUG_SPINLOCK is not set
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index 0a3318e08ab..5ca4a4598fd 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_GART_IOMMU)	+= pci-gart.o aperture.o
 obj-$(CONFIG_DUMMY_IOMMU)	+= pci-nommu.o pci-dma.o
 obj-$(CONFIG_SWIOTLB)		+= swiotlb.o
 obj-$(CONFIG_KPROBES)		+= kprobes.o
+obj-$(CONFIG_X86_PM_TIMER)	+= pmtimer.o
 
 obj-$(CONFIG_MODULES)		+= module.o
 
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index 7e13545748e..f8e6cc4fecd 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -33,6 +33,7 @@
 #include <asm/mpspec.h>
 #include <asm/pgalloc.h>
 #include <asm/mach_apic.h>
+#include <asm/nmi.h>
 
 int apic_verbosity;
 
@@ -925,7 +926,7 @@ __init int oem_force_hpet_timer(void)
 	unsigned id;
 	DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
 
-	bitmap_empty(clustermap, NUM_APIC_CLUSTERS);
+	bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
 
 	for (i = 0; i < NR_CPUS; i++) {
 		id = bios_cpu_apicid[i];
@@ -1056,7 +1057,7 @@ int __init APIC_init_uniprocessor (void)
 		nr_ioapics = 0;
 #endif
 	setup_boot_APIC_clock();
-
+	check_nmi_watchdog();
 	return 0;
 }
 
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index 1086b5fcac2..28817490fdc 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -220,13 +220,18 @@ sysret_careful:
 	jmp sysret_check
 
 	/* Handle a signal */ 
-	/* edx:	work flags (arg3) */
 sysret_signal:
 	sti
+	testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
+	jz    1f
+
+	/* Really a signal */
+	/* edx:	work flags (arg3) */
 	leaq do_notify_resume(%rip),%rax
 	leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
 	xorl %esi,%esi # oldset -> arg2
 	call ptregscall_common
+1:	movl $_TIF_NEED_RESCHED,%edi
 	jmp sysret_check
 	
 	/* Do syscall tracing */
@@ -484,6 +489,8 @@ retint_careful:
 	jmp retint_check
 	
 retint_signal:
+	testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
+	jz    retint_swapgs
 	sti
 	SAVE_REST
 	movq $-1,ORIG_RAX(%rsp) 			
@@ -492,8 +499,8 @@ retint_signal:
 	call do_notify_resume
 	RESTORE_REST
 	cli
+	movl $_TIF_NEED_RESCHED,%edi
 	GET_THREAD_INFO(%rcx)
-	movl $_TIF_WORK_MASK,%edi
 	jmp retint_check
 
 #ifdef CONFIG_PREEMPT
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 60be58617eb..ac768432495 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -1804,76 +1804,6 @@ device_initcall(ioapic_init_sysfs);
 
 #define IO_APIC_MAX_ID		0xFE
 
-int __init io_apic_get_unique_id (int ioapic, int apic_id)
-{
-	union IO_APIC_reg_00 reg_00;
-	static physid_mask_t apic_id_map;
-	unsigned long flags;
-	int i = 0;
-
-	/*
-	 * The P4 platform supports up to 256 APIC IDs on two separate APIC 
-	 * buses (one for LAPICs, one for IOAPICs), where predecessors only 
-	 * supports up to 16 on one shared APIC bus.
-	 * 
-	 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
-	 *      advantage of new APIC bus architecture.
-	 */
-
-	if (physids_empty(apic_id_map))
-		apic_id_map = phys_cpu_present_map;
-
-	spin_lock_irqsave(&ioapic_lock, flags);
-	reg_00.raw = io_apic_read(ioapic, 0);
-	spin_unlock_irqrestore(&ioapic_lock, flags);
-
-	if (apic_id >= IO_APIC_MAX_ID) {
-		apic_printk(APIC_QUIET, KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
-			"%d\n", ioapic, apic_id, reg_00.bits.ID);
-		apic_id = reg_00.bits.ID;
-	}
-
-	/*
-	 * Every APIC in a system must have a unique ID or we get lots of nice 
-	 * 'stuck on smp_invalidate_needed IPI wait' messages.
-	 */
-	if (physid_isset(apic_id, apic_id_map)) {
-
-		for (i = 0; i < IO_APIC_MAX_ID; i++) {
-			if (!physid_isset(i, apic_id_map))
-				break;
-		}
-
-		if (i == IO_APIC_MAX_ID)
-			panic("Max apic_id exceeded!\n");
-
-		apic_printk(APIC_VERBOSE, KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
-			"trying %d\n", ioapic, apic_id, i);
-
-		apic_id = i;
-	} 
-
-	physid_set(apic_id, apic_id_map);
-
-	if (reg_00.bits.ID != apic_id) {
-		reg_00.bits.ID = apic_id;
-
-		spin_lock_irqsave(&ioapic_lock, flags);
-		io_apic_write(ioapic, 0, reg_00.raw);
-		reg_00.raw = io_apic_read(ioapic, 0);
-		spin_unlock_irqrestore(&ioapic_lock, flags);
-
-		/* Sanity check */
-		if (reg_00.bits.ID != apic_id)
-			panic("IOAPIC[%d]: Unable change apic_id!\n", ioapic);
-	}
-
-	apic_printk(APIC_VERBOSE,KERN_INFO "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
-
-	return apic_id;
-}
-
-
 int __init io_apic_get_version (int ioapic)
 {
 	union IO_APIC_reg_01	reg_01;
diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
index 7ec031c6ca1..f86d9db94bf 100644
--- a/arch/x86_64/kernel/mpparse.c
+++ b/arch/x86_64/kernel/mpparse.c
@@ -107,6 +107,7 @@ static int __init mpf_checksum(unsigned char *mp, int len)
 static void __init MP_processor_info (struct mpc_config_processor *m)
 {
 	int ver;
+	static int found_bsp=0;
 
 	if (!(m->mpc_cpuflag & CPU_ENABLED))
 		return;
@@ -126,11 +127,6 @@ static void __init MP_processor_info (struct mpc_config_processor *m)
 			" Processor ignored.\n", NR_CPUS);
 		return;
 	}
-	if (num_processors >= maxcpus) {
-		printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
-			" Processor ignored.\n", maxcpus);
-		return;
-	}
 
 	num_processors++;
 
@@ -150,7 +146,19 @@ static void __init MP_processor_info (struct mpc_config_processor *m)
 		ver = 0x10;
 	}
 	apic_version[m->mpc_apicid] = ver;
-	bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
+ 	if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
+ 		/*
+ 		 * bios_cpu_apicid is required to have processors listed
+ 		 * in same order as logical cpu numbers. Hence the first
+ 		 * entry is BSP, and so on.
+ 		 */
+ 		bios_cpu_apicid[0] = m->mpc_apicid;
+ 		x86_cpu_to_apicid[0] = m->mpc_apicid;
+ 		found_bsp = 1;
+ 	} else {
+ 		bios_cpu_apicid[num_processors - found_bsp] = m->mpc_apicid;
+ 		x86_cpu_to_apicid[num_processors - found_bsp] = m->mpc_apicid;
+ 	}
 }
 
 static void __init MP_bus_info (struct mpc_config_bus *m)
@@ -759,7 +767,7 @@ void __init mp_register_ioapic (
 	mp_ioapics[idx].mpc_apicaddr = address;
 
 	set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
-	mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id);
+	mp_ioapics[idx].mpc_apicid = id;
 	mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
 	
 	/* 
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 61de0b34a01..31c0f2e6ac9 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -33,6 +33,7 @@
 #include <asm/msr.h>
 #include <asm/proto.h>
 #include <asm/kdebug.h>
+#include <asm/local.h>
 
 /*
  * lapic_nmi_owner tracks the ownership of the lapic NMI hardware:
@@ -59,7 +60,8 @@ int panic_on_timeout;
 
 unsigned int nmi_watchdog = NMI_DEFAULT;
 static unsigned int nmi_hz = HZ;
-unsigned int nmi_perfctr_msr;	/* the MSR to reset in NMI handler */
+static unsigned int nmi_perfctr_msr;	/* the MSR to reset in NMI handler */
+static unsigned int nmi_p4_cccr_val;
 
 /* Note that these events don't tick when the CPU idles. This means
    the frequency varies with CPU load. */
@@ -71,61 +73,87 @@ unsigned int nmi_perfctr_msr;	/* the MSR to reset in NMI handler */
 #define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING	0x76
 #define K7_NMI_EVENT		K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
 
-#define P6_EVNTSEL0_ENABLE	(1 << 22)
-#define P6_EVNTSEL_INT		(1 << 20)
-#define P6_EVNTSEL_OS		(1 << 17)
-#define P6_EVNTSEL_USR		(1 << 16)
-#define P6_EVENT_CPU_CLOCKS_NOT_HALTED	0x79
-#define P6_NMI_EVENT		P6_EVENT_CPU_CLOCKS_NOT_HALTED
+#define MSR_P4_MISC_ENABLE	0x1A0
+#define MSR_P4_MISC_ENABLE_PERF_AVAIL	(1<<7)
+#define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL	(1<<12)
+#define MSR_P4_PERFCTR0		0x300
+#define MSR_P4_CCCR0		0x360
+#define P4_ESCR_EVENT_SELECT(N)	((N)<<25)
+#define P4_ESCR_OS		(1<<3)
+#define P4_ESCR_USR		(1<<2)
+#define P4_CCCR_OVF_PMI0	(1<<26)
+#define P4_CCCR_OVF_PMI1	(1<<27)
+#define P4_CCCR_THRESHOLD(N)	((N)<<20)
+#define P4_CCCR_COMPLEMENT	(1<<19)
+#define P4_CCCR_COMPARE		(1<<18)
+#define P4_CCCR_REQUIRED	(3<<16)
+#define P4_CCCR_ESCR_SELECT(N)	((N)<<13)
+#define P4_CCCR_ENABLE		(1<<12)
+/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
+   CRU_ESCR0 (with any non-null event selector) through a complemented
+   max threshold. [IA32-Vol3, Section 14.9.9] */
+#define MSR_P4_IQ_COUNTER0	0x30C
+#define P4_NMI_CRU_ESCR0	(P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR)
+#define P4_NMI_IQ_CCCR0	\
+	(P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT|	\
+	 P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)
+
+static __init inline int nmi_known_cpu(void)
+{
+	switch (boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_AMD:
+		return boot_cpu_data.x86 == 15;
+	case X86_VENDOR_INTEL:
+		return boot_cpu_data.x86 == 15;
+	}
+	return 0;
+}
 
 /* Run after command line and cpu_init init, but before all other checks */
 void __init nmi_watchdog_default(void)
 {
 	if (nmi_watchdog != NMI_DEFAULT)
 		return;
-
-	/* For some reason the IO APIC watchdog doesn't work on the AMD
-	   8111 chipset. For now switch to local APIC mode using
-	   perfctr0 there.  On Intel CPUs we don't have code to handle
-	   the perfctr and the IO-APIC seems to work, so use that.  */
-
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
-		nmi_watchdog = NMI_LOCAL_APIC; 
-		printk(KERN_INFO 
-              "Using local APIC NMI watchdog using perfctr0\n");
-	} else {
-		printk(KERN_INFO "Using IO APIC NMI watchdog\n");
+	if (nmi_known_cpu())
+		nmi_watchdog = NMI_LOCAL_APIC;
+	else
 		nmi_watchdog = NMI_IO_APIC;
-	}
 }
 
-/* Why is there no CPUID flag for this? */
-static __init int cpu_has_lapic(void)
+#ifdef CONFIG_SMP
+/* The performance counters used by NMI_LOCAL_APIC don't trigger when
+ * the CPU is idle. To make sure the NMI watchdog really ticks on all
+ * CPUs during the test make them busy.
+ */
+static __init void nmi_cpu_busy(void *data)
 {
-	switch (boot_cpu_data.x86_vendor) { 
-	case X86_VENDOR_INTEL:
-	case X86_VENDOR_AMD: 
-		return boot_cpu_data.x86 >= 6; 
-	/* .... add more cpus here or find a different way to figure this out. */	
-	default:
-		return 0;
-	} 	
+	volatile int *endflag = data;
+	local_irq_enable();
+	/* Intentionally don't use cpu_relax here. This is
+	   to make sure that the performance counter really ticks,
+	   even if there is a simulator or similar that catches the
+	   pause instruction. On a real HT machine this is fine because
+	   all other CPUs are busy with "useless" delay loops and don't
+	   care if they get somewhat less cycles. */
+	while (*endflag == 0)
+		barrier();
 }
+#endif
 
-static int __init check_nmi_watchdog (void)
+int __init check_nmi_watchdog (void)
 {
-	int counts[NR_CPUS];
+	volatile int endflag = 0;
+	int *counts;
 	int cpu;
 
-	if (nmi_watchdog == NMI_NONE)
-		return 0;
+	counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
+	if (!counts)
+		return -1;
 
-	if (nmi_watchdog == NMI_LOCAL_APIC && !cpu_has_lapic())  {
-		nmi_watchdog = NMI_NONE;
-		return -1; 
-	}	
+	printk(KERN_INFO "testing NMI watchdog ... ");
 
-	printk(KERN_INFO "Testing NMI watchdog ... ");
+	if (nmi_watchdog == NMI_LOCAL_APIC)
+		smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
 
 	for (cpu = 0; cpu < NR_CPUS; cpu++)
 		counts[cpu] = cpu_pda[cpu].__nmi_count; 
@@ -133,15 +161,22 @@ static int __init check_nmi_watchdog (void)
 	mdelay((10*1000)/nmi_hz); // wait 10 ticks
 
 	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		if (!cpu_online(cpu))
+			continue;
 		if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) {
-			printk("CPU#%d: NMI appears to be stuck (%d)!\n", 
+			endflag = 1;
+			printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
 			       cpu,
+			       counts[cpu],
 			       cpu_pda[cpu].__nmi_count);
 			nmi_active = 0;
 			lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG;
+			nmi_perfctr_msr = 0;
+			kfree(counts);
 			return -1;
 		}
 	}
+	endflag = 1;
 	printk("OK.\n");
 
 	/* now that we know it works we can reduce NMI frequency to
@@ -149,10 +184,9 @@ static int __init check_nmi_watchdog (void)
 	if (nmi_watchdog == NMI_LOCAL_APIC)
 		nmi_hz = 1;
 
+	kfree(counts);
 	return 0;
 }
-/* Have this called later during boot so counters are updating */
-late_initcall(check_nmi_watchdog);
 
 int __init setup_nmi_watchdog(char *str)
 {
@@ -170,7 +204,7 @@ int __init setup_nmi_watchdog(char *str)
 
 	if (nmi >= NMI_INVALID)
 		return 0;
-		nmi_watchdog = nmi;
+	nmi_watchdog = nmi;
 	return 1;
 }
 
@@ -185,7 +219,10 @@ static void disable_lapic_nmi_watchdog(void)
 		wrmsr(MSR_K7_EVNTSEL0, 0, 0);
 		break;
 	case X86_VENDOR_INTEL:
-		wrmsr(MSR_IA32_EVNTSEL0, 0, 0);
+		if (boot_cpu_data.x86 == 15) {
+			wrmsr(MSR_P4_IQ_CCCR0, 0, 0);
+			wrmsr(MSR_P4_CRU_ESCR0, 0, 0);
+		}
 		break;
 	}
 	nmi_active = -1;
@@ -253,7 +290,7 @@ void enable_timer_nmi_watchdog(void)
 
 static int nmi_pm_active; /* nmi_active before suspend */
 
-static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
+static int lapic_nmi_suspend(struct sys_device *dev, u32 state)
 {
 	nmi_pm_active = nmi_active;
 	disable_lapic_nmi_watchdog();
@@ -300,22 +337,27 @@ late_initcall(init_lapic_nmi_sysfs);
  * Original code written by Keith Owens.
  */
 
+static void clear_msr_range(unsigned int base, unsigned int n)
+{
+	unsigned int i;
+
+	for(i = 0; i < n; ++i)
+		wrmsr(base+i, 0, 0);
+}
+
 static void setup_k7_watchdog(void)
 {
 	int i;
 	unsigned int evntsel;
 
-	/* No check, so can start with slow frequency */
-	nmi_hz = 1; 
-
-	/* XXX should check these in EFER */
-
 	nmi_perfctr_msr = MSR_K7_PERFCTR0;
 
 	for(i = 0; i < 4; ++i) {
 		/* Simulator may not support it */
-		if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL))
+		if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) {
+			nmi_perfctr_msr = 0;
 			return;
+		}
 		wrmsrl(MSR_K7_PERFCTR0+i, 0UL);
 	}
 
@@ -325,12 +367,54 @@ static void setup_k7_watchdog(void)
 		| K7_NMI_EVENT;
 
 	wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
-	wrmsrl(MSR_K7_PERFCTR0, -((u64)cpu_khz*1000) / nmi_hz);
+	wrmsr(MSR_K7_PERFCTR0, -(cpu_khz/nmi_hz*1000), -1);
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
 	evntsel |= K7_EVNTSEL_ENABLE;
 	wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
 }
 
+
+static int setup_p4_watchdog(void)
+{
+	unsigned int misc_enable, dummy;
+
+	rdmsr(MSR_P4_MISC_ENABLE, misc_enable, dummy);
+	if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
+		return 0;
+
+	nmi_perfctr_msr = MSR_P4_IQ_COUNTER0;
+	nmi_p4_cccr_val = P4_NMI_IQ_CCCR0;
+#ifdef CONFIG_SMP
+	if (smp_num_siblings == 2)
+		nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1;
+#endif
+
+	if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL))
+		clear_msr_range(0x3F1, 2);
+	/* MSR 0x3F0 seems to have a default value of 0xFC00, but current
+	   docs doesn't fully define it, so leave it alone for now. */
+	if (boot_cpu_data.x86_model >= 0x3) {
+		/* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */
+		clear_msr_range(0x3A0, 26);
+		clear_msr_range(0x3BC, 3);
+	} else {
+		clear_msr_range(0x3A0, 31);
+	}
+	clear_msr_range(0x3C0, 6);
+	clear_msr_range(0x3C8, 6);
+	clear_msr_range(0x3E0, 2);
+	clear_msr_range(MSR_P4_CCCR0, 18);
+	clear_msr_range(MSR_P4_PERFCTR0, 18);
+
+	wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0);
+	wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0);
+	Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz/nmi_hz*1000));
+	wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz/nmi_hz*1000), -1);
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+	wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
+	return 1;
+}
+
 void setup_apic_nmi_watchdog(void)
 {
 	switch (boot_cpu_data.x86_vendor) {
@@ -341,6 +425,13 @@ void setup_apic_nmi_watchdog(void)
 			return;
 		setup_k7_watchdog();
 		break;
+	case X86_VENDOR_INTEL:
+		if (boot_cpu_data.x86 != 15)
+			return;
+		if (!setup_p4_watchdog())
+			return;
+		break;
+
 	default:
 		return;
 	}
@@ -355,56 +446,67 @@ void setup_apic_nmi_watchdog(void)
  *
  * as these watchdog NMI IRQs are generated on every CPU, we only
  * have to check the current processor.
- *
- * since NMIs don't listen to _any_ locks, we have to be extremely
- * careful not to rely on unsafe variables. The printk might lock
- * up though, so we have to break up any console locks first ...
- * [when there will be more tty-related locks, break them up
- *  here too!]
  */
 
-static unsigned int
-	last_irq_sums [NR_CPUS],
-	alert_counter [NR_CPUS];
+static DEFINE_PER_CPU(unsigned, last_irq_sum);
+static DEFINE_PER_CPU(local_t, alert_counter);
+static DEFINE_PER_CPU(int, nmi_touch);
 
 void touch_nmi_watchdog (void)
 {
 	int i;
 
 	/*
-	 * Just reset the alert counters, (other CPUs might be
-	 * spinning on locks we hold):
+ 	 * Tell other CPUs to reset their alert counters. We cannot
+	 * do it ourselves because the alert count increase is not
+	 * atomic.
 	 */
 	for (i = 0; i < NR_CPUS; i++)
-		alert_counter[i] = 0;
+		per_cpu(nmi_touch, i) = 1;
 }
 
 void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason)
 {
-	int sum, cpu;
+	int sum;
+	int touched = 0;
 
-	cpu = safe_smp_processor_id();
 	sum = read_pda(apic_timer_irqs);
-	if (last_irq_sums[cpu] == sum) {
+	if (__get_cpu_var(nmi_touch)) {
+		__get_cpu_var(nmi_touch) = 0;
+		touched = 1;
+	}
+	if (!touched && __get_cpu_var(last_irq_sum) == sum) {
 		/*
 		 * Ayiee, looks like this CPU is stuck ...
 		 * wait a few IRQs (5 seconds) before doing the oops ...
 		 */
-		alert_counter[cpu]++;
-		if (alert_counter[cpu] == 5*nmi_hz) {
+		local_inc(&__get_cpu_var(alert_counter));
+		if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) {
 			if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
 							== NOTIFY_STOP) {
-				alert_counter[cpu] = 0; 
+				local_set(&__get_cpu_var(alert_counter), 0);
 				return;
 			} 
 			die_nmi("NMI Watchdog detected LOCKUP on CPU%d", regs);
 		}
 	} else {
-		last_irq_sums[cpu] = sum;
-		alert_counter[cpu] = 0;
+		__get_cpu_var(last_irq_sum) = sum;
+		local_set(&__get_cpu_var(alert_counter), 0);
 	}
-	if (nmi_perfctr_msr)
+	if (nmi_perfctr_msr) {
+ 		if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) {
+ 			/*
+ 			 * P4 quirks:
+ 			 * - An overflown perfctr will assert its interrupt
+ 			 *   until the OVF flag in its CCCR is cleared.
+ 			 * - LVTPC is masked on interrupt and must be
+ 			 *   unmasked by the LVTPC handler.
+ 			 */
+ 			wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
+ 			apic_write(APIC_LVTPC, APIC_DM_NMI);
+ 		}
 		wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1);
+	}
 }
 
 static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
diff --git a/arch/x86_64/kernel/pmtimer.c b/arch/x86_64/kernel/pmtimer.c
new file mode 100644
index 00000000000..feb5f108dd2
--- /dev/null
+++ b/arch/x86_64/kernel/pmtimer.c
@@ -0,0 +1,101 @@
+/* Ported over from i386 by AK, original copyright was:
+ *
+ * (C) Dominik Brodowski <linux@brodo.de> 2003
+ *
+ * Driver to use the Power Management Timer (PMTMR) available in some
+ * southbridges as primary timing source for the Linux kernel.
+ *
+ * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c,
+ * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4.
+ *
+ * This file is licensed under the GPL v2.
+ *
+ * Dropped all the hardware bug workarounds for now. Hopefully they
+ * are not needed on 64bit chipsets.
+ */
+
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/cpumask.h>
+#include <asm/io.h>
+#include <asm/proto.h>
+#include <asm/msr.h>
+#include <asm/vsyscall.h>
+
+/* The I/O port the PMTMR resides at.
+ * The location is detected during setup_arch(),
+ * in arch/i386/kernel/acpi/boot.c */
+u32 pmtmr_ioport;
+
+/* value of the Power timer at last timer interrupt */
+static u32 offset_delay;
+static u32 last_pmtmr_tick;
+
+#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */
+
+static inline u32 cyc2us(u32 cycles)
+{
+	/* The Power Management Timer ticks at 3.579545 ticks per microsecond.
+	 * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%]
+	 *
+	 * Even with HZ = 100, delta is at maximum 35796 ticks, so it can
+	 * easily be multiplied with 286 (=0x11E) without having to fear
+	 * u32 overflows.
+	 */
+	cycles *= 286;
+	return (cycles >> 10);
+}
+
+int pmtimer_mark_offset(void)
+{
+	static int first_run = 1;
+	unsigned long tsc;
+	u32 lost;
+
+	u32 tick = inl(pmtmr_ioport);
+	u32 delta;
+
+	delta = cyc2us((tick - last_pmtmr_tick) & ACPI_PM_MASK);
+
+	last_pmtmr_tick = tick;
+	monotonic_base += delta * NSEC_PER_USEC;
+
+	delta += offset_delay;
+
+	lost = delta / (USEC_PER_SEC / HZ);
+	offset_delay = delta % (USEC_PER_SEC / HZ);
+
+	rdtscll(tsc);
+	vxtime.last_tsc = tsc - offset_delay * cpu_khz;
+
+	/* don't calculate delay for first run,
+	   or if we've got less then a tick */
+	if (first_run || (lost < 1)) {
+		first_run = 0;
+		offset_delay = 0;
+	}
+
+	return lost - 1;
+}
+
+unsigned int do_gettimeoffset_pm(void)
+{
+	u32 now, offset, delta = 0;
+
+	offset = last_pmtmr_tick;
+	now = inl(pmtmr_ioport);
+	delta = (now - offset) & ACPI_PM_MASK;
+
+	return offset_delay + cyc2us(delta);
+}
+
+
+static int __init nopmtimer_setup(char *s)
+{
+	pmtmr_ioport = 0;
+	return 0;
+}
+
+__setup("nopmtimer", nopmtimer_setup);
diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c
index e26e86bb56f..60dc9b98951 100644
--- a/arch/x86_64/kernel/ptrace.c
+++ b/arch/x86_64/kernel/ptrace.c
@@ -257,13 +257,13 @@ static int putreg(struct task_struct *child,
 			value &= 0xffff;
 			return 0;
 		case offsetof(struct user_regs_struct,fs_base):
-			if (!((value >> 48) == 0 || (value >> 48) == 0xffff))
-				return -EIO; 
+			if (value >= TASK_SIZE)
+				return -EIO;
 			child->thread.fs = value;
 			return 0;
 		case offsetof(struct user_regs_struct,gs_base):
-			if (!((value >> 48) == 0 || (value >> 48) == 0xffff))
-				return -EIO; 
+			if (value >= TASK_SIZE)
+				return -EIO;
 			child->thread.gs = value;
 			return 0;
 		case offsetof(struct user_regs_struct, eflags):
@@ -277,6 +277,11 @@ static int putreg(struct task_struct *child,
 				return -EIO;
 			value &= 0xffff;
 			break;
+		case offsetof(struct user_regs_struct, rip):
+			/* Check if the new RIP address is canonical */
+			if (value >= TASK_SIZE)
+				return -EIO;
+			break;
 	}
 	put_stack_long(child, regno - sizeof(struct pt_regs), value);
 	return 0;
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 2129cf9ba6b..b9fd0252c27 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -727,11 +727,12 @@ static void __init display_cacheinfo(struct cpuinfo_x86 *c)
 static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_SMP
-	int cpu = c->x86_apicid;
+	int cpu = smp_processor_id();
 	int node = 0;
 	if (c->x86_num_cores == 1)
 		return;
-	cpu_core_id[cpu] = cpu >> hweight32(c->x86_num_cores - 1);
+	/* Fix up the APIC ID following the AMD specification. */
+ 	cpu_core_id[cpu] >>= hweight32(c->x86_num_cores - 1);
 
 #ifdef CONFIG_NUMA
 	/* When an ACPI SRAT table is available use the mappings from SRAT
@@ -745,6 +746,9 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
 		node = cpu_to_node[cpu];
 	}
 #endif
+	/* For now: - better than BAD_APIC_ID at least*/
+	phys_proc_id[cpu] = cpu_core_id[cpu];
+
 	printk(KERN_INFO "CPU %d(%d) -> Node %d -> Core %d\n",
 			cpu, c->x86_num_cores, node, cpu_core_id[cpu]);
 #endif
@@ -925,7 +929,6 @@ void __init early_identify_cpu(struct cpuinfo_x86 *c)
 	c->x86_clflush_size = 64;
 	c->x86_cache_alignment = c->x86_clflush_size;
 	c->x86_num_cores = 1;
-	c->x86_apicid = c == &boot_cpu_data ? 0 : c - cpu_data;
 	c->extended_cpuid_level = 0;
 	memset(&c->x86_capability, 0, sizeof c->x86_capability);
 
@@ -954,11 +957,15 @@ void __init early_identify_cpu(struct cpuinfo_x86 *c)
 		} 
 		if (c->x86_capability[0] & (1<<19)) 
 			c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
-		c->x86_apicid = misc >> 24;
 	} else {
 		/* Have CPUID level 0 only - unheard of */
 		c->x86 = 4;
 	}
+
+#ifdef CONFIG_SMP
+	phys_proc_id[smp_processor_id()] =
+	cpu_core_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff;
+#endif
 }
 
 /*
@@ -1088,7 +1095,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 
 		/* Other (Linux-defined) */
-		"cxmmx", NULL, "cyrix_arr", "centaur_mcr", "k8c+",
+		"cxmmx", NULL, "cyrix_arr", "centaur_mcr", NULL,
 		"constant_tsc", NULL, NULL,
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 73f7e8b9543..5abdee1e16a 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -56,6 +56,7 @@
 #include <asm/kdebug.h>
 #include <asm/tlbflush.h>
 #include <asm/proto.h>
+#include <asm/nmi.h>
 
 /* Change for real CPU hotplug. Note other files need to be fixed
    first too. */
@@ -125,96 +126,210 @@ static void __cpuinit smp_store_cpu_info(int id)
 
 	*c = boot_cpu_data;
 	identify_cpu(c);
+	print_cpu_info(c);
 }
 
 /*
- * Synchronize TSCs of CPUs
+ * New Funky TSC sync algorithm borrowed from IA64.
+ * Main advantage is that it doesn't reset the TSCs fully and
+ * in general looks more robust and it works better than my earlier
+ * attempts. I believe it was written by David Mosberger. Some minor
+ * adjustments for x86-64 by me -AK
  *
- * This new algorithm is less accurate than the old "zero TSCs"
- * one, but we cannot zero TSCs anymore in the new hotplug CPU
- * model.
+ * Original comment reproduced below.
+ *
+ * Synchronize TSC of the current (slave) CPU with the TSC of the
+ * MASTER CPU (normally the time-keeper CPU).  We use a closed loop to
+ * eliminate the possibility of unaccounted-for errors (such as
+ * getting a machine check in the middle of a calibration step).  The
+ * basic idea is for the slave to ask the master what itc value it has
+ * and to read its own itc before and after the master responds.  Each
+ * iteration gives us three timestamps:
+ *
+ *	slave		master
+ *
+ *	t0 ---\
+ *             ---\
+ *		   --->
+ *			tm
+ *		   /---
+ *	       /---
+ *	t1 <---
+ *
+ *
+ * The goal is to adjust the slave's TSC such that tm falls exactly
+ * half-way between t0 and t1.  If we achieve this, the clocks are
+ * synchronized provided the interconnect between the slave and the
+ * master is symmetric.  Even if the interconnect were asymmetric, we
+ * would still know that the synchronization error is smaller than the
+ * roundtrip latency (t0 - t1).
+ *
+ * When the interconnect is quiet and symmetric, this lets us
+ * synchronize the TSC to within one or two cycles.  However, we can
+ * only *guarantee* that the synchronization is accurate to within a
+ * round-trip time, which is typically in the range of several hundred
+ * cycles (e.g., ~500 cycles).  In practice, this means that the TSCs
+ * are usually almost perfectly synchronized, but we shouldn't assume
+ * that the accuracy is much better than half a micro second or so.
+ *
+ * [there are other errors like the latency of RDTSC and of the
+ * WRMSR. These can also account to hundreds of cycles. So it's
+ * probably worse. It claims 153 cycles error on a dual Opteron,
+ * but I suspect the numbers are actually somewhat worse -AK]
  */
 
-static atomic_t __cpuinitdata tsc_flag;
+#define MASTER	0
+#define SLAVE	(SMP_CACHE_BYTES/8)
+
+/* Intentionally don't use cpu_relax() while TSC synchronization
+   because we don't want to go into funky power save modi or cause
+   hypervisors to schedule us away.  Going to sleep would likely affect
+   latency and low latency is the primary objective here. -AK */
+#define no_cpu_relax() barrier()
+
 static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock);
-static unsigned long long __cpuinitdata bp_tsc, ap_tsc;
+static volatile __cpuinitdata unsigned long go[SLAVE + 1];
+static int notscsync __cpuinitdata;
+
+#undef DEBUG_TSC_SYNC
 
-#define NR_LOOPS 5
+#define NUM_ROUNDS	64	/* magic value */
+#define NUM_ITERS	5	/* likewise */
 
-static void __cpuinit sync_tsc_bp_init(int init)
+/* Callback on boot CPU */
+static __cpuinit void sync_master(void *arg)
 {
-	if (init)
-		_raw_spin_lock(&tsc_sync_lock);
-	else
-		_raw_spin_unlock(&tsc_sync_lock);
-	atomic_set(&tsc_flag, 0);
+	unsigned long flags, i;
+
+	if (smp_processor_id() != boot_cpu_id)
+		return;
+
+	go[MASTER] = 0;
+
+	local_irq_save(flags);
+	{
+		for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) {
+			while (!go[MASTER])
+				no_cpu_relax();
+			go[MASTER] = 0;
+			rdtscll(go[SLAVE]);
+		}
+	}
+	local_irq_restore(flags);
 }
 
 /*
- * Synchronize TSC on AP with BP.
+ * Return the number of cycles by which our tsc differs from the tsc
+ * on the master (time-keeper) CPU.  A positive number indicates our
+ * tsc is ahead of the master, negative that it is behind.
  */
-static void __cpuinit __sync_tsc_ap(void)
+static inline long
+get_delta(long *rt, long *master)
 {
-	if (!cpu_has_tsc)
-		return;
-	Dprintk("AP %d syncing TSC\n", smp_processor_id());
+	unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0;
+	unsigned long tcenter, t0, t1, tm;
+	int i;
 
-	while (atomic_read(&tsc_flag) != 0)
-		cpu_relax();
-	atomic_inc(&tsc_flag);
-	mb();
-	_raw_spin_lock(&tsc_sync_lock);
-	wrmsrl(MSR_IA32_TSC, bp_tsc);
-	_raw_spin_unlock(&tsc_sync_lock);
-	rdtscll(ap_tsc);
-	mb();
-	atomic_inc(&tsc_flag);
-	mb();
+	for (i = 0; i < NUM_ITERS; ++i) {
+		rdtscll(t0);
+		go[MASTER] = 1;
+		while (!(tm = go[SLAVE]))
+			no_cpu_relax();
+		go[SLAVE] = 0;
+		rdtscll(t1);
+
+		if (t1 - t0 < best_t1 - best_t0)
+			best_t0 = t0, best_t1 = t1, best_tm = tm;
+	}
+
+	*rt = best_t1 - best_t0;
+	*master = best_tm - best_t0;
+
+	/* average best_t0 and best_t1 without overflow: */
+	tcenter = (best_t0/2 + best_t1/2);
+	if (best_t0 % 2 + best_t1 % 2 == 2)
+		++tcenter;
+	return tcenter - best_tm;
 }
 
-static void __cpuinit sync_tsc_ap(void)
+static __cpuinit void sync_tsc(void)
 {
-	int i;
-	for (i = 0; i < NR_LOOPS; i++)
-		__sync_tsc_ap();
+	int i, done = 0;
+	long delta, adj, adjust_latency = 0;
+	unsigned long flags, rt, master_time_stamp, bound;
+#if DEBUG_TSC_SYNC
+	static struct syncdebug {
+		long rt;	/* roundtrip time */
+		long master;	/* master's timestamp */
+		long diff;	/* difference between midpoint and master's timestamp */
+		long lat;	/* estimate of tsc adjustment latency */
+	} t[NUM_ROUNDS] __cpuinitdata;
+#endif
+
+	go[MASTER] = 1;
+
+	smp_call_function(sync_master, NULL, 1, 0);
+
+	while (go[MASTER])	/* wait for master to be ready */
+		no_cpu_relax();
+
+	spin_lock_irqsave(&tsc_sync_lock, flags);
+	{
+		for (i = 0; i < NUM_ROUNDS; ++i) {
+			delta = get_delta(&rt, &master_time_stamp);
+			if (delta == 0) {
+				done = 1;	/* let's lock on to this... */
+				bound = rt;
+			}
+
+			if (!done) {
+				unsigned long t;
+				if (i > 0) {
+					adjust_latency += -delta;
+					adj = -delta + adjust_latency/4;
+				} else
+					adj = -delta;
+
+				rdtscll(t);
+				wrmsrl(MSR_IA32_TSC, t + adj);
+			}
+#if DEBUG_TSC_SYNC
+			t[i].rt = rt;
+			t[i].master = master_time_stamp;
+			t[i].diff = delta;
+			t[i].lat = adjust_latency/4;
+#endif
+		}
+	}
+	spin_unlock_irqrestore(&tsc_sync_lock, flags);
+
+#if DEBUG_TSC_SYNC
+	for (i = 0; i < NUM_ROUNDS; ++i)
+		printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
+		       t[i].rt, t[i].master, t[i].diff, t[i].lat);
+#endif
+
+	printk(KERN_INFO
+	       "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, "
+	       "maxerr %lu cycles)\n",
+	       smp_processor_id(), boot_cpu_id, delta, rt);
 }
 
-/*
- * Synchronize TSC from BP to AP.
- */
-static void __cpuinit __sync_tsc_bp(int cpu)
+static void __cpuinit tsc_sync_wait(void)
 {
-	if (!cpu_has_tsc)
+	if (notscsync || !cpu_has_tsc)
 		return;
-
-	/* Wait for AP */
-	while (atomic_read(&tsc_flag) == 0)
-		cpu_relax();
-	/* Save BPs TSC */
-	sync_core();
-	rdtscll(bp_tsc);
-	/* Don't do the sync core here to avoid too much latency. */
-	mb();
-	/* Start the AP */
-	_raw_spin_unlock(&tsc_sync_lock);
-	/* Wait for AP again */
-	while (atomic_read(&tsc_flag) < 2)
-		cpu_relax();
-	rdtscl(bp_tsc);
-	barrier();
+	printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", smp_processor_id(),
+			boot_cpu_id);
+	sync_tsc();
 }
 
-static void __cpuinit sync_tsc_bp(int cpu)
+static __init int notscsync_setup(char *s)
 {
-	int i;
-	for (i = 0; i < NR_LOOPS - 1; i++) {
-		__sync_tsc_bp(cpu);
-		sync_tsc_bp_init(1);
-	}
-	__sync_tsc_bp(cpu);
-	printk(KERN_INFO "Synced TSC of CPU %d difference %Ld\n",
-	       cpu, ap_tsc - bp_tsc);
+	notscsync = 1;
+	return 0;
 }
+__setup("notscsync", notscsync_setup);
 
 static atomic_t init_deasserted __cpuinitdata;
 
@@ -315,11 +430,6 @@ void __cpuinit start_secondary(void)
 	cpu_init();
 	smp_callin();
 
-	/*
-	 * Synchronize the TSC with the BP
-	 */
-	sync_tsc_ap();
-
 	/* otherwise gcc will move up the smp_processor_id before the cpu_init */
 	barrier();
 
@@ -334,7 +444,6 @@ void __cpuinit start_secondary(void)
 		enable_8259A_irq(0);
 	}
 
-
 	enable_APIC_timer();
 
 	/*
@@ -343,6 +452,11 @@ void __cpuinit start_secondary(void)
 	cpu_set(smp_processor_id(), cpu_online_map);
 	mb();
 
+	/* Wait for TSC sync to not schedule things before.
+	   We still process interrupts, which could see an inconsistent
+	   time in that window unfortunately. */
+	tsc_sync_wait();
+
 	cpu_idle();
 }
 
@@ -531,7 +645,6 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid)
 		printk("failed fork for CPU %d\n", cpu);
 		return PTR_ERR(idle);
 	}
-	x86_cpu_to_apicid[cpu] = apicid;
 
 	cpu_pda[cpu].pcurrent = idle;
 
@@ -600,8 +713,6 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid)
 
 		if (cpu_isset(cpu, cpu_callin_map)) {
 			/* number CPUs logically, starting from 1 (BSP is 0) */
-			Dprintk("OK.\n");
-			print_cpu_info(&cpu_data[cpu]);
 			Dprintk("CPU has booted.\n");
 		} else {
 			boot_error = 1;
@@ -842,7 +953,6 @@ void __cpuinit smp_prepare_cpus(unsigned int max_cpus)
 		      GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id);
 		/* Or can we switch back to PIC here? */
 	}
-	x86_cpu_to_apicid[0] = boot_cpu_id;
 
 	/*
 	 * Now start the IO-APICs
@@ -889,18 +999,14 @@ int __cpuinit __cpu_up(unsigned int cpu)
 		printk("__cpu_up: bad cpu %d\n", cpu);
 		return -EINVAL;
 	}
-	sync_tsc_bp_init(1);
 
 	/* Boot it! */
 	err = do_boot_cpu(cpu, apicid);
 	if (err < 0) {
-		sync_tsc_bp_init(0);
 		Dprintk("do_boot_cpu failed %d\n", err);
 		return err;
 	}
 
-	sync_tsc_bp(cpu);
-
 	/* Unleash the CPU! */
 	Dprintk("waiting for cpu %d\n", cpu);
 
@@ -923,4 +1029,6 @@ void __cpuinit smp_cpus_done(unsigned int max_cpus)
 
 	detect_siblings();
 	time_init_gtod();
+
+	check_nmi_watchdog();
 }
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 0e3b60e1ac5..735b6767c8e 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -26,6 +26,8 @@
 #include <linux/sysdev.h>
 #include <linux/bcd.h>
 #include <linux/kallsyms.h>
+#include <linux/acpi.h>
+#include <acpi/achware.h>	/* for PM timer frequency */
 #include <asm/8253pit.h>
 #include <asm/pgtable.h>
 #include <asm/vsyscall.h>
@@ -396,6 +398,10 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 			(offset - vxtime.last)*(NSEC_PER_SEC/HZ) / hpet_tick;
 
 		vxtime.last = offset;
+#ifdef CONFIG_X86_PM_TIMER
+	} else if (vxtime.mode == VXTIME_PMTMR) {
+		lost = pmtimer_mark_offset();
+#endif
 	} else {
 		offset = (((tsc - vxtime.last_tsc) *
 			   vxtime.tsc_quot) >> 32) - (USEC_PER_SEC / HZ);
@@ -898,6 +904,13 @@ void __init time_init(void)
 			hpet_period;
 		cpu_khz = hpet_calibrate_tsc();
 		timename = "HPET";
+#ifdef CONFIG_X86_PM_TIMER
+	} else if (pmtmr_ioport) {
+		vxtime_hz = PM_TIMER_FREQUENCY;
+		timename = "PM";
+		pit_init();
+		cpu_khz = pit_calibrate_tsc();
+#endif
 	} else {
 		pit_init();
 		cpu_khz = pit_calibrate_tsc();
@@ -923,35 +936,50 @@ void __init time_init(void)
 }
 
 /*
+ * Make an educated guess if the TSC is trustworthy and synchronized
+ * over all CPUs.
+ */
+static __init int unsynchronized_tsc(void)
+{
+#ifdef CONFIG_SMP
+	if (oem_force_hpet_timer())
+		return 1;
+ 	/* Intel systems are normally all synchronized. Exceptions
+ 	   are handled in the OEM check above. */
+ 	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ 		return 0;
+ 	/* All in a single socket - should be synchronized */
+ 	if (cpus_weight(cpu_core_map[0]) == num_online_cpus())
+ 		return 0;
+#endif
+ 	/* Assume multi socket systems are not synchronized */
+ 	return num_online_cpus() > 1;
+}
+
+/*
  * Decide after all CPUs are booted what mode gettimeofday should use.
  */
 void __init time_init_gtod(void)
 {
 	char *timetype;
 
-	/*
-	 * AMD systems with more than one CPU don't have fully synchronized
-	 * TSCs. Always use HPET gettimeofday for these, although it is slower.
-	 * Intel SMP systems usually have synchronized TSCs, so use always
-	 * the TSC.
-	 *
-	 * Exceptions:
-	 * IBM Summit2 checked by oem_force_hpet_timer().
- 	 * AMD dual core may also not need HPET. Check me.
-	 *
-	 * Can be turned off with "notsc".
-	 */
-	if (num_online_cpus() > 1 &&
-	    boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
-		notsc = 1;
-	/* Some systems will want to disable TSC and use HPET. */
-	if (oem_force_hpet_timer())
+	if (unsynchronized_tsc())
 		notsc = 1;
 	if (vxtime.hpet_address && notsc) {
 		timetype = "HPET";
 		vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick;
 		vxtime.mode = VXTIME_HPET;
 		do_gettimeoffset = do_gettimeoffset_hpet;
+#ifdef CONFIG_X86_PM_TIMER
+	/* Using PM for gettimeofday is quite slow, but we have no other
+	   choice because the TSC is too unreliable on some systems. */
+	} else if (pmtmr_ioport && !vxtime.hpet_address && notsc) {
+		timetype = "PM";
+		do_gettimeoffset = do_gettimeoffset_pm;
+		vxtime.mode = VXTIME_PMTMR;
+		sysctl_vsyscall = 0;
+		printk(KERN_INFO "Disabling vsyscall due to use of PM timer\n");
+#endif
 	} else {
 		timetype = vxtime.hpet_address ? "HPET/TSC" : "PIT/TSC";
 		vxtime.mode = VXTIME_TSC;
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index b4b8dc59663..2e573442594 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -65,7 +65,7 @@ static force_inline void do_vgettimeofday(struct timeval * tv)
 		usec = (__xtime.tv_nsec / 1000) +
 			(__jiffies - __wall_jiffies) * (1000000 / HZ);
 
-		if (__vxtime.mode == VXTIME_TSC) {
+		if (__vxtime.mode != VXTIME_HPET) {
 			sync_core();
 			rdtscll(t);
 			if (t < __vxtime.last_tsc)
@@ -217,8 +217,9 @@ static int __init vsyscall_init(void)
 	BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
 	BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
 	map_vsyscall();
-	sysctl_vsyscall = 1;
+#ifdef CONFIG_SYSCTL
 	register_sysctl_table(kernel_root_table2, 0);
+#endif
 	return 0;
 }
 
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index e0330921676..5d6b2114cc9 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -234,6 +234,8 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
 
 /*
  * Handle a fault on the vmalloc or module mapping area
+ *
+ * This assumes no large pages in there.
  */
 static int vmalloc_fault(unsigned long address)
 {
@@ -272,7 +274,10 @@ static int vmalloc_fault(unsigned long address)
 	if (!pte_present(*pte_ref))
 		return -1;
 	pte = pte_offset_kernel(pmd, address);
-	if (!pte_present(*pte) || pte_page(*pte) != pte_page(*pte_ref))
+	/* Don't use pte_page here, because the mappings can point
+	   outside mem_map, and the NUMA hash lookup cannot handle
+	   that. */
+	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
 		BUG();
 	__flush_tlb_all();
 	return 0;
@@ -346,7 +351,9 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	 * protection error (error_code & 1) == 0.
 	 */
 	if (unlikely(address >= TASK_SIZE)) {
-		if (!(error_code & 5)) {
+		if (!(error_code & 5) &&
+		      ((address >= VMALLOC_START && address < VMALLOC_END) ||
+		       (address >= MODULES_VADDR && address < MODULES_END))) {
 			if (vmalloc_fault(address) < 0)
 				goto bad_area_nosemaphore;
 			return;
diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c
index 74ec8554b19..c6fb0cb6999 100644
--- a/arch/x86_64/mm/ioremap.c
+++ b/arch/x86_64/mm/ioremap.c
@@ -272,7 +272,7 @@ void iounmap(volatile void __iomem *addr)
 	if ((p->flags >> 20) &&
 		p->phys_addr + p->size - 1 < virt_to_phys(high_memory)) {
 		/* p->size includes the guard page, but cpa doesn't like that */
-		change_page_attr(virt_to_page(__va(p->phys_addr)),
+		change_page_attr_addr((unsigned long)__va(p->phys_addr),
 				 p->size >> PAGE_SHIFT,
 				 PAGE_KERNEL);
 		global_flush_tlb();