From 3e347261a80b57df792ab9464b5f0ed59add53a8 Mon Sep 17 00:00:00 2001 From: Bob Picco Date: Sat, 3 Sep 2005 15:54:28 -0700 Subject: [PATCH] sparsemem extreme implementation With cleanups from Dave Hansen SPARSEMEM_EXTREME makes mem_section a one dimensional array of pointers to mem_sections. This two level layout scheme is able to achieve smaller memory requirements for SPARSEMEM with the tradeoff of an additional shift and load when fetching the memory section. The current SPARSEMEM implementation is a one dimensional array of mem_sections, which is the default SPARSEMEM configuration. The patch attempts to isolate the implementation details of the physical layout of the sparsemem section array. SPARSEMEM_EXTREME requires bootmem to be functioning at the time of memory_present() calls. This is not always feasible, so architectures which do not need it may allocate everything statically by using SPARSEMEM_STATIC. Signed-off-by: Andy Whitcroft Signed-off-by: Bob Picco Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/i386') diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 619d843ba23..dcb0ad098c6 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -754,6 +754,7 @@ config NUMA depends on SMP && HIGHMEM64G && (X86_NUMAQ || X86_GENERICARCH || (X86_SUMMIT && ACPI)) default n if X86_PC default y if (X86_NUMAQ || X86_SUMMIT) + select SPARSEMEM_STATIC # Need comments to help the hapless user trying to turn on NUMA support comment "NUMA (NUMA-Q) requires SMP, 64GB highmem support" -- cgit v1.2.3-70-g09d2 From 7bf07f3d4b4358aa6d99a26d7a0165f1e91c3fcc Mon Sep 17 00:00:00 2001 From: Adam Litke Date: Sat, 3 Sep 2005 15:55:00 -0700 Subject: [PATCH] hugetlb: move stale pte check into huge_pte_alloc() Initial Post (Wed, 17 Aug 2005) This patch moves the if (! pte_none(*pte)) hugetlb_clean_stale_pgtable(pte); logic into huge_pte_alloc() so all of its callers can be immune to the bug described by Kenneth Chen at http://lkml.org/lkml/2004/6/16/246 > It turns out there is a bug in hugetlb_prefault(): with 3 level page table, > huge_pte_alloc() might return a pmd that points to a PTE page. It happens > if the virtual address for hugetlb mmap is recycled from previously used > normal page mmap. free_pgtables() might not scrub the pmd entry on > munmap and hugetlb_prefault skips on any pmd presence regardless what type > it is. Unless I am missing something, it seems more correct to place the check inside huge_pte_alloc() to prevent the same bug wherever a huge pte is allocated. It also allows checking for this condition when lazily faulting huge pages later in the series.
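For readability, the patched i386 huge_pte_alloc() reassembled from the hunks that follow (a reconstruction, not the authoritative source); on i386 a huge page is mapped at the pmd level, so the pmd slot returned by pmd_alloc() is reinterpreted as the huge pte:

pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte = NULL;

	pgd = pgd_offset(mm, addr);
	pud = pud_alloc(mm, pgd, addr);
	pmd = pmd_alloc(mm, pud, addr);

	if (!pmd)
		goto out;

	/* a present but non-huge entry is a stale normal page table
	 * left behind by a previous mapping of this address range */
	pte = (pte_t *) pmd;
	if (!pte_none(*pte) && !pte_huge(*pte))
		hugetlb_clean_stale_pgtable(pte);
out:
	return pte;
}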
Signed-off-by: Adam Litke Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/mm/hugetlbpage.c | 13 +++++++++++-- mm/hugetlb.c | 2 -- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c index 3b099f32b94..57c486f0e89 100644 --- a/arch/i386/mm/hugetlbpage.c +++ b/arch/i386/mm/hugetlbpage.c @@ -22,12 +22,21 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; pud_t *pud; - pmd_t *pmd = NULL; + pmd_t *pmd; + pte_t *pte = NULL; pgd = pgd_offset(mm, addr); pud = pud_alloc(mm, pgd, addr); pmd = pmd_alloc(mm, pud, addr); - return (pte_t *) pmd; + + if (!pmd) + goto out; + + pte = (pte_t *) pmd; + if (!pte_none(*pte) && !pte_huge(*pte)) + hugetlb_clean_stale_pgtable(pte); +out: + return pte; } pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6bf720bc662..901ac523a1c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -360,8 +360,6 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) ret = -ENOMEM; goto out; } - if (! pte_none(*pte)) - hugetlb_clean_stale_pgtable(pte); idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); -- cgit v1.2.3-70-g09d2 From 02b0ccef903e85673ead74ddb7c431f2f7ce183d Mon Sep 17 00:00:00 2001 From: Adam Litke Date: Sat, 3 Sep 2005 15:55:01 -0700 Subject: [PATCH] hugetlb: check p?d_present in huge_pte_offset() For demand faulting, we cannot assume that the page tables will be populated. Do what the rest of the architectures do and test p?d_present() while walking down the page table. Signed-off-by: Adam Litke Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/mm/hugetlbpage.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c index 57c486f0e89..24c8a536b58 100644 --- a/arch/i386/mm/hugetlbpage.c +++ b/arch/i386/mm/hugetlbpage.c @@ -46,8 +46,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) pmd_t *pmd = NULL; pgd = pgd_offset(mm, addr); - pud = pud_offset(pgd, addr); - pmd = pmd_offset(pud, addr); + if (pgd_present(*pgd)) { + pud = pud_offset(pgd, addr); + if (pud_present(*pud)) + pmd = pmd_offset(pud, addr); + } return (pte_t *) pmd; } -- cgit v1.2.3-70-g09d2 From 0e5c9f39f64d8a55c5db37a5ea43e37d3422fd92 Mon Sep 17 00:00:00 2001 From: "Chen, Kenneth W" Date: Sat, 3 Sep 2005 15:55:02 -0700 Subject: [PATCH] remove hugetlb_clean_stale_pgtable() and fix huge_pte_alloc() I don't think we need to call hugetlb_clean_stale_pgtable() anymore in 2.6.13 because of the rework with free_pgtables(). It now collects all the pte pages at the time of munmap. It used to collect page table pages only when an entire pgd could be freed, leaving stale pte pages behind. Not anymore with 2.6.13. This function will never be called, and we should turn it into a BUG_ON. I also spotted two problems here, not Adam's fault :-) (1) in huge_pte_alloc(), it looks like a bug to me that pud is not checked before calling pmd_alloc(); (2) in hugetlb_clean_stale_pgtable(), a call to pmd_free_tlb is also missing. I think a tlb flush is required to flush the mapping for the page table itself when we clear out the pmd pointing to a pte page. However, since hugetlb_clean_stale_pgtable() is never called, it won't trigger the bug.
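For comparison, the function after this change, reassembled from the diff below: the pud result is now checked, and the stale case that can no longer occur is trapped instead of cleaned:

pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	pud_t *pud;
	pte_t *pte = NULL;

	pgd = pgd_offset(mm, addr);
	pud = pud_alloc(mm, pgd, addr);
	if (pud)
		pte = (pte_t *) pmd_alloc(mm, pud, addr);
	/* free_pgtables() now reaps pte pages at munmap time, so a
	 * stale non-huge entry can no longer appear here; trap it */
	BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));

	return pte;
}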
Signed-off-by: Ken Chen Cc: Adam Litke Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/mm/hugetlbpage.c | 23 +++-------------------- include/asm-i386/page.h | 1 - include/asm-x86_64/page.h | 1 - include/linux/hugetlb.h | 6 ------ 4 files changed, 3 insertions(+), 28 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c index 24c8a536b58..d524127c9af 100644 --- a/arch/i386/mm/hugetlbpage.c +++ b/arch/i386/mm/hugetlbpage.c @@ -22,20 +22,14 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; pud_t *pud; - pmd_t *pmd; pte_t *pte = NULL; pgd = pgd_offset(mm, addr); pud = pud_alloc(mm, pgd, addr); - pmd = pmd_alloc(mm, pud, addr); + if (pud) + pte = (pte_t *) pmd_alloc(mm, pud, addr); + BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); - if (!pmd) - goto out; - - pte = (pte_t *) pmd; - if (!pte_none(*pte) && !pte_huge(*pte)) - hugetlb_clean_stale_pgtable(pte); -out: return pte; } @@ -130,17 +124,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, } #endif -void hugetlb_clean_stale_pgtable(pte_t *pte) -{ - pmd_t *pmd = (pmd_t *) pte; - struct page *page; - - page = pmd_page(*pmd); - pmd_clear(pmd); - dec_page_state(nr_page_table_pages); - page_cache_release(page); -} - /* x86_64 also uses this file */ #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h index 10045fd8210..73296d9924f 100644 --- a/include/asm-i386/page.h +++ b/include/asm-i386/page.h @@ -68,7 +68,6 @@ typedef struct { unsigned long pgprot; } pgprot_t; #define HPAGE_MASK (~(HPAGE_SIZE - 1)) #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA -#define ARCH_HAS_HUGETLB_CLEAN_STALE_PGTABLE #endif #define pgd_val(x) ((x).pgd) diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h index fcf890aa8c8..135ffaa0393 100644 --- a/include/asm-x86_64/page.h +++ b/include/asm-x86_64/page.h @@ -28,7 +28,6 @@ #define HPAGE_SIZE ((1UL) << HPAGE_SHIFT) #define HPAGE_MASK (~(HPAGE_SIZE - 1)) #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) -#define ARCH_HAS_HUGETLB_CLEAN_STALE_PGTABLE #ifdef __KERNEL__ #ifndef __ASSEMBLY__ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index f529d144281..e670b0d13fe 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -70,12 +70,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, void hugetlb_prefault_arch_hook(struct mm_struct *mm); #endif -#ifndef ARCH_HAS_HUGETLB_CLEAN_STALE_PGTABLE -#define hugetlb_clean_stale_pgtable(pte) BUG() -#else -void hugetlb_clean_stale_pgtable(pte_t *pte); -#endif - #else /* !CONFIG_HUGETLB_PAGE */ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) -- cgit v1.2.3-70-g09d2 From 869f96a00e8f53c7db8470ca9cf72e2e3fa40119 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 3 Sep 2005 15:56:26 -0700 Subject: [PATCH] x86: compress the stack layout of do_page_fault() This patch pushes the creation of a rare signal frame (SIGBUS or SIGSEGV) into a separate function, thus saving stackspace in the main do_page_fault() stackframe. The effect is 132 bytes less of stack used by the typical do_page_fault() invocation - resulting in a denser cache-layout. (Another minor effect is that in case of kernel crashes that come from a pagefault, we add less space to the already existing frame, giving the crash functions a slightly higher chance to do their stuff without overflowing the stack.) 
(The changes also result in slightly cleaner code.) argument bugfix from "Guillaume C." Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/mm/fault.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index 8e90339d6ea..61d9e34af5a 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c @@ -199,6 +199,18 @@ static inline int is_prefetch(struct pt_regs *regs, unsigned long addr, return 0; } +static noinline void force_sig_info_fault(int si_signo, int si_code, + unsigned long address, struct task_struct *tsk) +{ + siginfo_t info; + + info.si_signo = si_signo; + info.si_errno = 0; + info.si_code = si_code; + info.si_addr = (void __user *)address; + force_sig_info(si_signo, &info, tsk); +} + fastcall void do_invalid_op(struct pt_regs *, unsigned long); /* @@ -218,8 +230,7 @@ fastcall void do_page_fault(struct pt_regs *regs, unsigned long error_code) struct vm_area_struct * vma; unsigned long address; unsigned long page; - int write; - siginfo_t info; + int write, si_code; /* get the address */ __asm__("movl %%cr2,%0":"=r" (address)); @@ -233,7 +244,7 @@ fastcall void do_page_fault(struct pt_regs *regs, unsigned long error_code) tsk = current; - info.si_code = SEGV_MAPERR; + si_code = SEGV_MAPERR; /* * We fault-in kernel-space virtual memory on-demand. The @@ -313,7 +324,7 @@ fastcall void do_page_fault(struct pt_regs *regs, unsigned long error_code) * we can handle it.. */ good_area: - info.si_code = SEGV_ACCERR; + si_code = SEGV_ACCERR; write = 0; switch (error_code & 3) { default: /* 3: write, present */ @@ -387,11 +398,7 @@ bad_area_nosemaphore: /* Kernel addresses are always protection faults */ tsk->thread.error_code = error_code | (address >= TASK_SIZE); tsk->thread.trap_no = 14; - info.si_signo = SIGSEGV; - info.si_errno = 0; - /* info.si_code has been set above */ - info.si_addr = (void __user *)address; - force_sig_info(SIGSEGV, &info, tsk); + force_sig_info_fault(SIGSEGV, si_code, address, tsk); return; } @@ -500,11 +507,7 @@ do_sigbus: tsk->thread.cr2 = address; tsk->thread.error_code = error_code; tsk->thread.trap_no = 14; - info.si_signo = SIGBUS; - info.si_errno = 0; - info.si_code = BUS_ADRERR; - info.si_addr = (void __user *)address; - force_sig_info(SIGBUS, &info, tsk); + force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); return; vmalloc_fault: -- cgit v1.2.3-70-g09d2 From 4116c527ea9517623369a5b3b037aedde280d672 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Sat, 3 Sep 2005 15:56:27 -0700 Subject: [PATCH] hpet: use read_timer_tsc only when CPU has TSC Use read_timer_tsc only when the CPU has TSC. Thanks to Andrea for pointing this out. This should not be an issue on any platform, as all recent systems that have HPET also have CPUs that support TSC. The patch is still required for correctness.
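The shape of the fix, condensed from the diff below (the enclosing cpu_has_tsc branch is implied by the surrounding init_hpet() code rather than visible in the hunks themselves):

static struct timer_opts timer_hpet;	/* forward declaration */

static int __init init_hpet(char *override)
{
	/* ... */
	if (cpu_has_tsc) {
		/* ... calibrate cpu_khz against the HPET ... */
		set_cyc2ns_scale(cpu_khz/1000);
		/* bind the TSC-based read_timer only when a TSC exists */
		timer_hpet.read_timer = read_timer_tsc;
	}
	/* ... */
}

The static initializer for timer_hpet correspondingly drops its unconditional .read_timer = read_timer_tsc member.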
Signed-off-by: Venkatesh Pallipadi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/timers/timer_hpet.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/timers/timer_hpet.c b/arch/i386/kernel/timers/timer_hpet.c index ef8dac5dd33..cbb3f221ced 100644 --- a/arch/i386/kernel/timers/timer_hpet.c +++ b/arch/i386/kernel/timers/timer_hpet.c @@ -136,6 +136,8 @@ static void delay_hpet(unsigned long loops) } while ((hpet_end - hpet_start) < (loops)); } +static struct timer_opts timer_hpet; + static int __init init_hpet(char* override) { unsigned long result, remain; @@ -163,6 +165,8 @@ static int __init init_hpet(char* override) } set_cyc2ns_scale(cpu_khz/1000); } + /* set this only when cpu_has_tsc */ + timer_hpet.read_timer = read_timer_tsc; } /* @@ -186,7 +190,6 @@ static struct timer_opts timer_hpet __read_mostly = { .get_offset = get_offset_hpet, .monotonic_clock = monotonic_clock_hpet, .delay = delay_hpet, - .read_timer = read_timer_tsc, }; struct init_timer_opts __initdata timer_hpet_init = { -- cgit v1.2.3-70-g09d2 From 7ae65fd334232468a9d6b523a4fc141cd6ec5ea4 Mon Sep 17 00:00:00 2001 From: Matt Tolentino Date: Sat, 3 Sep 2005 15:56:27 -0700 Subject: [PATCH] x86: fix EFI memory map parsing The memory descriptors that comprise the EFI memory map are not fixed in stone such that the size could change in the future. This uses the memory descriptor size obtained from EFI to iterate over the memory map entries during boot. This enables the removal of an x86 specific pad (and ifdef) in the EFI header. I also couldn't stomach the broken up nature of the function to put EFI runtime calls into virtual mode any longer so I fixed that up a bit as well. For reference, this patch only impacts x86. 
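The recurring idiom the patch introduces, shown once in isolation: because the descriptor size now comes from the firmware, the map is walked by byte offset rather than by indexing an array of efi_memory_desc_t:

	efi_memory_desc_t *md;
	void *p;

	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
		md = p;
		/* use md->type, md->phys_addr, md->num_pages, ... */
	}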
Signed-off-by: Matt Tolentino Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/efi.c | 101 +++++++++++++++++++++++------------------------ arch/i386/kernel/setup.c | 14 ++++--- arch/i386/mm/init.c | 5 ++- include/asm-i386/setup.h | 2 +- include/linux/efi.h | 14 ++----- 5 files changed, 67 insertions(+), 69 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c index 385883ea8c1..850648ae830 100644 --- a/arch/i386/kernel/efi.c +++ b/arch/i386/kernel/efi.c @@ -233,22 +233,23 @@ void __init efi_map_memmap(void) { memmap.map = NULL; - memmap.map = (efi_memory_desc_t *) - bt_ioremap((unsigned long) memmap.phys_map, - (memmap.nr_map * sizeof(efi_memory_desc_t))); - + memmap.map = bt_ioremap((unsigned long) memmap.phys_map, + (memmap.nr_map * memmap.desc_size)); if (memmap.map == NULL) printk(KERN_ERR PFX "Could not remap the EFI memmap!\n"); + + memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); } #if EFI_DEBUG static void __init print_efi_memmap(void) { efi_memory_desc_t *md; + void *p; int i; - for (i = 0; i < memmap.nr_map; i++) { - md = &memmap.map[i]; + for (p = memmap.map, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) { + md = p; printk(KERN_INFO "mem%02u: type=%u, attr=0x%llx, " "range=[0x%016llx-0x%016llx) (%lluMB)\n", i, md->type, md->attribute, md->phys_addr, @@ -271,10 +272,10 @@ void efi_memmap_walk(efi_freemem_callback_t callback, void *arg) } prev, curr; efi_memory_desc_t *md; unsigned long start, end; - int i; + void *p; - for (i = 0; i < memmap.nr_map; i++) { - md = &memmap.map[i]; + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; if ((md->num_pages == 0) || (!is_available_memory(md))) continue; @@ -325,6 +326,7 @@ void __init efi_init(void) memmap.phys_map = EFI_MEMMAP; memmap.nr_map = EFI_MEMMAP_SIZE/EFI_MEMDESC_SIZE; memmap.desc_version = EFI_MEMDESC_VERSION; + memmap.desc_size = EFI_MEMDESC_SIZE; efi.systab = (efi_system_table_t *) boot_ioremap((unsigned long) efi_phys.systab, @@ -428,22 +430,30 @@ void __init efi_init(void) printk(KERN_ERR PFX "Could not map the runtime service table!\n"); /* Map the EFI memory map for use until paging_init() */ - - memmap.map = (efi_memory_desc_t *) - boot_ioremap((unsigned long) EFI_MEMMAP, EFI_MEMMAP_SIZE); - + memmap.map = boot_ioremap((unsigned long) EFI_MEMMAP, EFI_MEMMAP_SIZE); if (memmap.map == NULL) printk(KERN_ERR PFX "Could not map the EFI memory map!\n"); - if (EFI_MEMDESC_SIZE != sizeof(efi_memory_desc_t)) { - printk(KERN_WARNING PFX "Warning! Kernel-defined memdesc doesn't " - "match the one from EFI!\n"); - } + memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); + #if EFI_DEBUG print_efi_memmap(); #endif } +static inline void __init check_range_for_systab(efi_memory_desc_t *md) +{ + if (((unsigned long)md->phys_addr <= (unsigned long)efi_phys.systab) && + ((unsigned long)efi_phys.systab < md->phys_addr + + ((unsigned long)md->num_pages << EFI_PAGE_SHIFT))) { + unsigned long addr; + + addr = md->virt_addr - md->phys_addr + + (unsigned long)efi_phys.systab; + efi.systab = (efi_system_table_t *)addr; + } +} + /* * This function will switch the EFI runtime services to virtual mode. 
* Essentially, look through the EFI memmap and map every region that @@ -457,43 +467,32 @@ void __init efi_enter_virtual_mode(void) { efi_memory_desc_t *md; efi_status_t status; - int i; + void *p; efi.systab = NULL; - for (i = 0; i < memmap.nr_map; i++) { - md = &memmap.map[i]; + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; - if (md->attribute & EFI_MEMORY_RUNTIME) { - md->virt_addr = - (unsigned long)ioremap(md->phys_addr, - md->num_pages << EFI_PAGE_SHIFT); - if (!(unsigned long)md->virt_addr) { - printk(KERN_ERR PFX "ioremap of 0x%lX failed\n", - (unsigned long)md->phys_addr); - } + if (!(md->attribute & EFI_MEMORY_RUNTIME)) + continue; - if (((unsigned long)md->phys_addr <= - (unsigned long)efi_phys.systab) && - ((unsigned long)efi_phys.systab < - md->phys_addr + - ((unsigned long)md->num_pages << - EFI_PAGE_SHIFT))) { - unsigned long addr; - - addr = md->virt_addr - md->phys_addr + - (unsigned long)efi_phys.systab; - efi.systab = (efi_system_table_t *)addr; - } + md->virt_addr = (unsigned long)ioremap(md->phys_addr, + md->num_pages << EFI_PAGE_SHIFT); + if (!(unsigned long)md->virt_addr) { + printk(KERN_ERR PFX "ioremap of 0x%lX failed\n", + (unsigned long)md->phys_addr); } + /* update the virtual address of the EFI system table */ + check_range_for_systab(md); } if (!efi.systab) BUG(); status = phys_efi_set_virtual_address_map( - sizeof(efi_memory_desc_t) * memmap.nr_map, - sizeof(efi_memory_desc_t), + memmap.desc_size * memmap.nr_map, + memmap.desc_size, memmap.desc_version, memmap.phys_map); @@ -533,10 +532,10 @@ efi_initialize_iomem_resources(struct resource *code_resource, { struct resource *res; efi_memory_desc_t *md; - int i; + void *p; - for (i = 0; i < memmap.nr_map; i++) { - md = &memmap.map[i]; + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; if ((md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) > 0x100000000ULL) @@ -613,10 +612,10 @@ efi_initialize_iomem_resources(struct resource *code_resource, u32 efi_mem_type(unsigned long phys_addr) { efi_memory_desc_t *md; - int i; + void *p; - for (i = 0; i < memmap.nr_map; i++) { - md = &memmap.map[i]; + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; if ((md->phys_addr <= phys_addr) && (phys_addr < (md->phys_addr + (md-> num_pages << EFI_PAGE_SHIFT)) )) return md->type; @@ -627,10 +626,10 @@ u32 efi_mem_type(unsigned long phys_addr) u64 efi_mem_attributes(unsigned long phys_addr) { efi_memory_desc_t *md; - int i; + void *p; - for (i = 0; i < memmap.nr_map; i++) { - md = &memmap.map[i]; + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; if ((md->phys_addr <= phys_addr) && (phys_addr < (md->phys_addr + (md-> num_pages << EFI_PAGE_SHIFT)) )) return md->attribute; diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index af4de58cab5..9adbf710ec8 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -370,12 +370,16 @@ static void __init limit_regions(unsigned long long size) int i; if (efi_enabled) { - for (i = 0; i < memmap.nr_map; i++) { - current_addr = memmap.map[i].phys_addr + - (memmap.map[i].num_pages << 12); - if (memmap.map[i].type == EFI_CONVENTIONAL_MEMORY) { + efi_memory_desc_t *md; + void *p; + + for (p = memmap.map, i = 0; p < memmap.map_end; + p += memmap.desc_size, i++) { + md = p; + current_addr = md->phys_addr + (md->num_pages << 12); + if (md->type == EFI_CONVENTIONAL_MEMORY) { if (current_addr >= size) { - memmap.map[i].num_pages -= + md->num_pages -= 
(((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT); memmap.nr_map = i + 1; return; diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 12216b52e28..d8b23ab7653 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -198,9 +198,10 @@ int page_is_ram(unsigned long pagenr) if (efi_enabled) { efi_memory_desc_t *md; + void *p; - for (i = 0; i < memmap.nr_map; i++) { - md = &memmap.map[i]; + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; if (!is_available_memory(md)) continue; addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT; diff --git a/include/asm-i386/setup.h b/include/asm-i386/setup.h index 7a32184d54b..826a8ca50ac 100644 --- a/include/asm-i386/setup.h +++ b/include/asm-i386/setup.h @@ -44,7 +44,7 @@ extern unsigned char boot_params[PARAM_SIZE]; #define EFI_SYSTAB ((efi_system_table_t *) *((unsigned long *)(PARAM+0x1c4))) #define EFI_MEMDESC_SIZE (*((unsigned long *) (PARAM+0x1c8))) #define EFI_MEMDESC_VERSION (*((unsigned long *) (PARAM+0x1cc))) -#define EFI_MEMMAP ((efi_memory_desc_t *) *((unsigned long *)(PARAM+0x1d0))) +#define EFI_MEMMAP ((void *) *((unsigned long *)(PARAM+0x1d0))) #define EFI_MEMMAP_SIZE (*((unsigned long *) (PARAM+0x1d4))) #define MOUNT_ROOT_RDONLY (*(unsigned short *) (PARAM+0x1F2)) #define RAMDISK_FLAGS (*(unsigned short *) (PARAM+0x1F8)) diff --git a/include/linux/efi.h b/include/linux/efi.h index 73781ec165b..c7c5dd31618 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -91,11 +91,6 @@ typedef struct { #define EFI_PAGE_SHIFT 12 -/* - * For current x86 implementations of EFI, there is - * additional padding in the mem descriptors. This is not - * the case in ia64. Need to have this fixed in the f/w. - */ typedef struct { u32 type; u32 pad; @@ -103,9 +98,6 @@ typedef struct { u64 virt_addr; u64 num_pages; u64 attribute; -#if defined (__i386__) - u64 pad1; -#endif } efi_memory_desc_t; typedef int (*efi_freemem_callback_t) (unsigned long start, unsigned long end, void *arg); @@ -240,10 +232,12 @@ typedef struct { } efi_system_table_t; struct efi_memory_map { - efi_memory_desc_t *phys_map; - efi_memory_desc_t *map; + void *phys_map; + void *map; + void *map_end; int nr_map; unsigned long desc_version; + unsigned long desc_size; }; /* -- cgit v1.2.3-70-g09d2 From 5fd75ebb1a58c1a3c9e3d9fdf75ce7286b79bb74 Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Sat, 3 Sep 2005 15:56:28 -0700 Subject: [PATCH] vm86: Honor TF bit when emulating an instruction If the virtual 86 machine reaches an instruction which raises a General Protection Fault (such as CLI or STI), the instruction is emulated (in handle_vm86_fault). However, the emulation ignored the TF bit, so the hardware debug interrupt was not invoked after such an emulated instruction (and the DOS debugger missed it). This patch fixes the problem by emulating the hardware debug interrupt as the last action before control is returned to the VM86 program. 
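In essence (extracted from the diff below), the flags word is snapshotted before emulation can modify it, and the debug trap is injected on the way out:

	orig_flags = *(unsigned short *)&regs->eflags;	/* before emulating */
	/* ... emulate the faulting instruction ... */
	if (orig_flags & TF_MASK)		/* program was single-stepping */
		handle_vm86_trap(regs, 0, 1);	/* deliver the missed debug trap */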
Signed-off-by: Petr Tesarik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/vm86.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c index ec0f68ce688..2daa06fb4a8 100644 --- a/arch/i386/kernel/vm86.c +++ b/arch/i386/kernel/vm86.c @@ -542,7 +542,7 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code) unsigned char opcode; unsigned char __user *csp; unsigned char __user *ssp; - unsigned short ip, sp; + unsigned short ip, sp, orig_flags; int data32, pref_done; #define CHECK_IF_IN_TRAP \ @@ -551,8 +551,12 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code) #define VM86_FAULT_RETURN do { \ if (VMPI.force_return_for_pic && (VEFLAGS & (IF_MASK | VIF_MASK))) \ return_to_32bit(regs, VM86_PICRETURN); \ + if (orig_flags & TF_MASK) \ + handle_vm86_trap(regs, 0, 1); \ return; } while (0) + orig_flags = *(unsigned short *)&regs->eflags; + csp = (unsigned char __user *) (regs->cs << 4); ssp = (unsigned char __user *) (regs->ss << 4); sp = SP(regs); -- cgit v1.2.3-70-g09d2 From 484b90c4b965d54037ff99b198d84cdf144f8a35 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Sat, 3 Sep 2005 15:56:31 -0700 Subject: [PATCH] kdump: Save parameter segment in protected mode (x86) o With the introduction of kexec as a boot loader, the assumption that the parameter segment will always be loaded at a lower address than the kernel and will be addressable by the early bootup page tables is no longer valid. In the kexec-on-panic case the parameter segment might well be loaded beyond the kernel image and might not be addressable by the early boot page tables. o This case might hit in the scenario where the user has reserved a chunk of memory for the second kernel, for example 16MB to 64MB, and has also built the second kernel for physical memory location 16MB. In this case kexec has no choice but to load the parameter segment at a higher address than the new kernel image, at a safe location where the new kernel does not stomp on it. o The problem should automatically go away once a relocatable kernel for i386 is in place and kexec can determine the location of the new kernel at run time, loading the parameter segment at a lower address than the kernel image. But until then this patch can go in (assuming it does not break something else). o This patch moves up the boot parameter saving code. Now boot parameters are copied out in protected mode before page tables are initialized. This ensures that the parameter segment is always addressable irrespective of its physical location. Signed-off-by: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/head.S | 48 ++++++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 22 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index 4477bb10709..0480ca9e9e5 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -77,6 +77,32 @@ ENTRY(startup_32) subl %edi,%ecx shrl $2,%ecx rep ; stosl +/* + * Copy bootup parameters out of the way. + * Note: %esi still has the pointer to the real-mode data. + * With the kexec as boot loader, parameter segment might be loaded beyond + * kernel image and might not even be addressable by early boot page tables. + * (kexec on panic case). Hence copy out the parameters before initializing + * page tables.
+ */ movl $(boot_params - __PAGE_OFFSET),%edi movl $(PARAM_SIZE/4),%ecx cld rep movsl movl boot_params - __PAGE_OFFSET + NEW_CL_POINTER,%esi andl %esi,%esi jnz 2f # New command line protocol cmpw $(OLD_CL_MAGIC),OLD_CL_MAGIC_ADDR jne 1f movzwl OLD_CL_OFFSET,%esi addl $(OLD_CL_BASE_ADDR),%esi 2: movl $(saved_command_line - __PAGE_OFFSET),%edi movl $(COMMAND_LINE_SIZE/4),%ecx rep movsl 1: /* * Initialize page tables. This creates a PDE and a set of page @@ -214,28 +240,6 @@ ENTRY(startup_32_smp) */ call setup_idt -/* - * Copy bootup parameters out of the way. - * Note: %esi still has the pointer to the real-mode data. - */ - movl $boot_params,%edi - movl $(PARAM_SIZE/4),%ecx - cld - rep - movsl - movl boot_params+NEW_CL_POINTER,%esi - andl %esi,%esi - jnz 2f # New command line protocol - cmpw $(OLD_CL_MAGIC),OLD_CL_MAGIC_ADDR - jne 1f - movzwl OLD_CL_OFFSET,%esi - addl $(OLD_CL_BASE_ADDR),%esi -2: - movl $saved_command_line,%edi - movl $(COMMAND_LINE_SIZE/4),%ecx - rep - movsl -1: checkCPUtype: movl $-1,X86_CPUID # -1 for no CPUID initially -- cgit v1.2.3-70-g09d2 From 911a62d42365076209e2c327e7688db296e35d62 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Sat, 3 Sep 2005 15:56:31 -0700 Subject: [PATCH] x86: automatically enable bigsmp when we have more than 8 CPUs The i386 generic subarchitecture requires explicit DMI strings or a command line option to enable bigsmp mode. The patch below removes that restriction, and uses bigsmp as soon as it finds more than 8 logical CPUs, Intel processors and xAPIC support. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/acpi/boot.c | 3 +++ arch/i386/kernel/mpparse.c | 9 +++++++++ arch/i386/kernel/setup.c | 8 +++++++- arch/i386/mach-generic/bigsmp.c | 5 ++++- arch/i386/mach-generic/probe.c | 20 ++++++++++++++++++++ include/asm-i386/apicdef.h | 1 + include/asm-i386/mach-generic/mach_apic.h | 2 ++ include/asm-i386/mpspec.h | 1 + 8 files changed, 47 insertions(+), 2 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index b7808a89d94..34ee500c26e 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -833,6 +833,9 @@ acpi_process_madt(void) if (!error) { acpi_lapic = 1; +#ifdef CONFIG_X86_GENERICARCH + generic_bigsmp_probe(); +#endif /* * Parse MADT IO-APIC entries */ diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c index ce838abb27d..788efffa993 100644 --- a/arch/i386/kernel/mpparse.c +++ b/arch/i386/kernel/mpparse.c @@ -65,6 +65,8 @@ int nr_ioapics; int pic_mode; unsigned long mp_lapic_addr; +unsigned int def_to_bigsmp = 0; + /* Processor that is doing the boot up */ unsigned int boot_cpu_physical_apicid = -1U; /* Internal processor count */ @@ -213,6 +215,13 @@ static void __init MP_processor_info (struct mpc_config_processor *m) ver = 0x10; } apic_version[m->mpc_apicid] = ver; + if ((num_processors > 8) && + APIC_XAPIC(ver) && + (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)) + def_to_bigsmp = 1; + else + def_to_bigsmp = 0; + bios_cpu_apicid[num_processors - 1] = m->mpc_apicid; } diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 9adbf710ec8..294bcca985a 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -1585,8 +1585,14 @@ void __init setup_arch(char **cmdline_p) */ acpi_boot_table_init(); acpi_boot_init(); -#endif +#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC) + if (def_to_bigsmp) + printk(KERN_WARNING "More 
than 8 CPUs detected and " + "CONFIG_X86_PC cannot handle it.\nUse " + "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n"); +#endif +#endif #ifdef CONFIG_X86_LOCAL_APIC if (smp_found_config) get_smp_config(); diff --git a/arch/i386/mach-generic/bigsmp.c b/arch/i386/mach-generic/bigsmp.c index 25883b44f62..037b2af1a1f 100644 --- a/arch/i386/mach-generic/bigsmp.c +++ b/arch/i386/mach-generic/bigsmp.c @@ -47,7 +47,10 @@ static struct dmi_system_id __initdata bigsmp_dmi_table[] = { static __init int probe_bigsmp(void) { - dmi_check_system(bigsmp_dmi_table); + if (def_to_bigsmp) + dmi_bigsmp = 1; + else + dmi_check_system(bigsmp_dmi_table); return dmi_bigsmp; } diff --git a/arch/i386/mach-generic/probe.c b/arch/i386/mach-generic/probe.c index 5497c65a879..cea5b3ce4b5 100644 --- a/arch/i386/mach-generic/probe.c +++ b/arch/i386/mach-generic/probe.c @@ -30,6 +30,25 @@ struct genapic *apic_probe[] __initdata = { NULL, }; +static int cmdline_apic; + +void __init generic_bigsmp_probe(void) +{ + /* + * This routine is used to switch to bigsmp mode when + * - There is no apic= option specified by the user + * - generic_apic_probe() has choosen apic_default as the sub_arch + * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support + */ + + if (!cmdline_apic && genapic == &apic_default) + if (apic_bigsmp.probe()) { + genapic = &apic_bigsmp; + printk(KERN_INFO "Overriding APIC driver with %s\n", + genapic->name); + } +} + void __init generic_apic_probe(char *command_line) { char *s; @@ -52,6 +71,7 @@ void __init generic_apic_probe(char *command_line) if (!changed) printk(KERN_ERR "Unknown genapic `%s' specified.\n", s); *p = old; + cmdline_apic = changed; } for (i = 0; !changed && apic_probe[i]; i++) { if (apic_probe[i]->probe()) { diff --git a/include/asm-i386/apicdef.h b/include/asm-i386/apicdef.h index a96a8f48fbf..03185cef8e0 100644 --- a/include/asm-i386/apicdef.h +++ b/include/asm-i386/apicdef.h @@ -16,6 +16,7 @@ #define GET_APIC_VERSION(x) ((x)&0xFF) #define GET_APIC_MAXLVT(x) (((x)>>16)&0xFF) #define APIC_INTEGRATED(x) ((x)&0xF0) +#define APIC_XAPIC(x) ((x) >= 0x14) #define APIC_TASKPRI 0x80 #define APIC_TPRI_MASK 0xFF #define APIC_ARBPRI 0x90 diff --git a/include/asm-i386/mach-generic/mach_apic.h b/include/asm-i386/mach-generic/mach_apic.h index b13767a4e93..d9dc039da94 100644 --- a/include/asm-i386/mach-generic/mach_apic.h +++ b/include/asm-i386/mach-generic/mach_apic.h @@ -28,4 +28,6 @@ #define enable_apic_mode (genapic->enable_apic_mode) #define phys_pkg_id (genapic->phys_pkg_id) +extern void generic_bigsmp_probe(void); + #endif /* __ASM_MACH_APIC_H */ diff --git a/include/asm-i386/mpspec.h b/include/asm-i386/mpspec.h index d9fafba075b..d84a9c326c2 100644 --- a/include/asm-i386/mpspec.h +++ b/include/asm-i386/mpspec.h @@ -11,6 +11,7 @@ extern int mp_bus_id_to_local [MAX_MP_BUSSES]; extern int quad_local_to_mp_bus_id [NR_CPUS/4][4]; extern int mp_bus_id_to_pci_bus [MAX_MP_BUSSES]; +extern unsigned int def_to_bigsmp; extern unsigned int boot_cpu_physical_apicid; extern int smp_found_config; extern void find_smp_config (void); -- cgit v1.2.3-70-g09d2 From 252943efcfce945d8dd3738ca4c4b9cbeb4f3fa9 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Sat, 3 Sep 2005 15:56:32 -0700 Subject: [PATCH] x86: Add the check for all the cores in a package in cache information Signed-off-by: Venkatesh Pallipadi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/cpu/intel_cacheinfo.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 
'arch/i386') diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c index 6c55b50cf04..9e0d5f83cb9 100644 --- a/arch/i386/kernel/cpu/intel_cacheinfo.c +++ b/arch/i386/kernel/cpu/intel_cacheinfo.c @@ -305,6 +305,9 @@ static void __devinit cache_shared_cpu_map_setup(unsigned int cpu, int index) { struct _cpuid4_info *this_leaf; unsigned long num_threads_sharing; +#ifdef CONFIG_X86_HT + struct cpuinfo_x86 *c = cpu_data + cpu; +#endif this_leaf = CPUID4_INFO_IDX(cpu, index); num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing; @@ -314,10 +317,12 @@ static void __devinit cache_shared_cpu_map_setup(unsigned int cpu, int index) #ifdef CONFIG_X86_HT else if (num_threads_sharing == smp_num_siblings) this_leaf->shared_cpu_map = cpu_sibling_map[cpu]; -#endif + else if (num_threads_sharing == (c->x86_num_cores * smp_num_siblings)) + this_leaf->shared_cpu_map = cpu_core_map[cpu]; else - printk(KERN_INFO "Number of CPUs sharing cache didn't match " + printk(KERN_DEBUG "Number of CPUs sharing cache didn't match " "any known set of CPUs\n"); +#endif } #else static void __init cache_shared_cpu_map_setup(unsigned int cpu, int index) {} -- cgit v1.2.3-70-g09d2 From 56f1d5d52a21b93bc2984c920b17e0d80df5d1b2 Mon Sep 17 00:00:00 2001 From: "Natalie.Protasevich@unisys.com" Date: Sat, 3 Sep 2005 15:56:34 -0700 Subject: [PATCH] ES7000 platform update (i386) This is a subarch update for ES7000. I've modified the platform check code and removed unnecessary OEM table parsing for newer systems that don't use OEM information during boot. Parsing the table is in fact causing problems, and the platform doesn't get recognized. The patch only affects the ES7000 subarch. Signed-off-by: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/mach-es7000/es7000.h | 5 ++-- arch/i386/mach-es7000/es7000plat.c | 45 ++++++++++++++--------------- include/asm-i386/mach-es7000/mach_mpparse.h | 30 +++++++++++++++---- 3 files changed, 49 insertions(+), 31 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/mach-es7000/es7000.h b/arch/i386/mach-es7000/es7000.h index 70691f0c4ce..898ed905e11 100644 --- a/arch/i386/mach-es7000/es7000.h +++ b/arch/i386/mach-es7000/es7000.h @@ -104,7 +104,8 @@ struct mip_reg { #define MIP_SW_APIC 0x1020b #define MIP_FUNC(VALUE) (VALUE & 0xff) -extern int parse_unisys_oem (char *oemptr, int oem_entries); -extern int find_unisys_acpi_oem_table(unsigned long *oem_addr, int *length); +extern int parse_unisys_oem (char *oemptr); +extern int find_unisys_acpi_oem_table(unsigned long *oem_addr); +extern void setup_unisys (); extern int es7000_start_cpu(int cpu, unsigned long eip); extern void es7000_sw_apic(void); diff --git a/arch/i386/mach-es7000/es7000plat.c b/arch/i386/mach-es7000/es7000plat.c index d5936d50047..2000bdca2fc 100644 --- a/arch/i386/mach-es7000/es7000plat.c +++ b/arch/i386/mach-es7000/es7000plat.c @@ -75,12 +75,29 @@ es7000_rename_gsi(int ioapic, int gsi) #endif // (CONFIG_X86_IO_APIC) && (CONFIG_ACPI_INTERPRETER || CONFIG_ACPI_BOOT) +void __init +setup_unisys () +{ + /* + * Determine the generation of the ES7000 currently running. 
+ * + * es7000_plat = 1 if the machine is a 5xx ES7000 box + * es7000_plat = 2 if the machine is a x86_64 ES7000 box + * + */ + if (!(boot_cpu_data.x86 <= 15 && boot_cpu_data.x86_model <= 2)) + es7000_plat = 2; + else + es7000_plat = 1; + ioapic_renumber_irq = es7000_rename_gsi; +} + /* * Parse the OEM Table */ int __init -parse_unisys_oem (char *oemptr, int oem_entries) +parse_unisys_oem (char *oemptr) { int i; int success = 0; @@ -95,7 +112,7 @@ parse_unisys_oem (char *oemptr, int oem_entries) tp += 8; - for (i=0; i <= oem_entries; i++) { + for (i=0; i <= 6; i++) { type = *tp++; size = *tp++; tp -= 2; @@ -130,34 +147,18 @@ parse_unisys_oem (char *oemptr, int oem_entries) default: break; } - if (i == 6) break; tp += size; } if (success < 2) { es7000_plat = 0; - } else { - printk("\nEnabling ES7000 specific features...\n"); - /* - * Determine the generation of the ES7000 currently running. - * - * es7000_plat = 0 if the machine is NOT a Unisys ES7000 box - * es7000_plat = 1 if the machine is a 5xx ES7000 box - * es7000_plat = 2 if the machine is a x86_64 ES7000 box - * - */ - if (!(boot_cpu_data.x86 <= 15 && boot_cpu_data.x86_model <= 2)) - es7000_plat = 2; - else - es7000_plat = 1; - - ioapic_renumber_irq = es7000_rename_gsi; - } + } else + setup_unisys(); return es7000_plat; } int __init -find_unisys_acpi_oem_table(unsigned long *oem_addr, int *length) +find_unisys_acpi_oem_table(unsigned long *oem_addr) { struct acpi_table_rsdp *rsdp = NULL; unsigned long rsdp_phys = 0; @@ -201,13 +202,11 @@ find_unisys_acpi_oem_table(unsigned long *oem_addr, int *length) acpi_table_print(header, sdt.entry[i].pa); t = (struct oem_table *) __acpi_map_table(sdt.entry[i].pa, header->length); addr = (void *) __acpi_map_table(t->OEMTableAddr, t->OEMTableSize); - *length = header->length; *oem_addr = (unsigned long) addr; return 0; } } } - Dprintk("ES7000: did not find Unisys ACPI OEM table!\n"); return -1; } diff --git a/include/asm-i386/mach-es7000/mach_mpparse.h b/include/asm-i386/mach-es7000/mach_mpparse.h index 85809e0898d..28a84f6185a 100644 --- a/include/asm-i386/mach-es7000/mach_mpparse.h +++ b/include/asm-i386/mach-es7000/mach_mpparse.h @@ -1,6 +1,8 @@ #ifndef __ASM_MACH_MPPARSE_H #define __ASM_MACH_MPPARSE_H +#include + static inline void mpc_oem_bus_info(struct mpc_config_bus *m, char *name, struct mpc_config_translation *translation) { @@ -12,8 +14,9 @@ static inline void mpc_oem_pci_bus(struct mpc_config_bus *m, { } -extern int parse_unisys_oem (char *oemptr, int oem_entries); -extern int find_unisys_acpi_oem_table(unsigned long *oem_addr, int *length); +extern int parse_unisys_oem (char *oemptr); +extern int find_unisys_acpi_oem_table(unsigned long *oem_addr); +extern void setup_unisys(); static inline int mps_oem_check(struct mp_config_table *mpc, char *oem, char *productid) @@ -22,18 +25,33 @@ static inline int mps_oem_check(struct mp_config_table *mpc, char *oem, struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)mpc->mpc_oemptr; if (!strncmp(oem, "UNISYS", 6)) - return parse_unisys_oem((char *)oem_table, oem_table->oem_length); + return parse_unisys_oem((char *)oem_table); } return 0; } +static inline int es7000_check_dsdt() +{ + struct acpi_table_header *header = NULL; + if(!acpi_get_table_header_early(ACPI_DSDT, &header)) + acpi_table_print(header, 0); + if (!strncmp(header->oem_id, "UNISYS", 6)) + return 1; + return 0; +} + /* Hook from generic ACPI tables.c */ static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { unsigned long oem_addr; - int 
oem_entries; - if (!find_unisys_acpi_oem_table(&oem_addr, &oem_entries)) - return parse_unisys_oem((char *)oem_addr, oem_entries); + if (!find_unisys_acpi_oem_table(&oem_addr)) { + if (es7000_check_dsdt()) + return parse_unisys_oem((char *)oem_addr); + else { + setup_unisys(); + return 1; + } + } return 0; } -- cgit v1.2.3-70-g09d2 From 2a0694d15d55d0deed928786a6393d5e45e37d76 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Sat, 3 Sep 2005 15:56:35 -0700 Subject: [PATCH] i386: clean up vDSO alignment padding This makes the vDSO use nops for all its padding around instructions, rather than sometimes zeros, and nop-pads the end of the area containing instructions to a 32-byte cache line, to keep text and data in separate lines. Signed-off-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/vsyscall-sigreturn.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/vsyscall-sigreturn.S b/arch/i386/kernel/vsyscall-sigreturn.S index c8fcf75b9be..68afa50dd7c 100644 --- a/arch/i386/kernel/vsyscall-sigreturn.S +++ b/arch/i386/kernel/vsyscall-sigreturn.S @@ -15,7 +15,7 @@ */ .text - .org __kernel_vsyscall+32 + .org __kernel_vsyscall+32,0x90 .globl __kernel_sigreturn .type __kernel_sigreturn,@function __kernel_sigreturn: @@ -35,6 +35,7 @@ __kernel_rt_sigreturn: int $0x80 .LEND_rt_sigreturn: .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn + .balign 32 .previous .section .eh_frame,"a",@progbits -- cgit v1.2.3-70-g09d2 From 4bb0d3ec3e5b1e9e2399cdc641b3b6521ac9cdaa Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 3 Sep 2005 15:56:36 -0700 Subject: [PATCH] i386: inline asm cleanup i386 Inline asm cleanup. Use cr/dr accessor functions. Also, a potential bugfix. Also, some CR accessors really should be volatile. Reads from CR0 (numeric state may change in an exception handler), writes to CR4 (flipping CR4.TSD) and reads from CR2 (page fault) prevent instruction re-ordering. I did not add memory clobber to CR3 / CR4 / CR0 updates, as it was not there to begin with, and in no case should kernel memory be clobbered, except when doing a TLB flush, which already has memory clobber. I noticed that page invalidation does not have a memory clobber. 
I can't find a bug as a result, but there is definitely a potential for a bug here: #define __flush_tlb_single(addr) \ __asm__ __volatile__("invlpg %0": :"m" (*(char *) addr)) Signed-off-by: Zachary Amsden Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/cpu/common.c | 12 ++++++------ arch/i386/kernel/cpu/cpufreq/longhaul.c | 12 +++--------- arch/i386/kernel/cpu/cyrix.c | 6 +----- arch/i386/kernel/efi.c | 4 ++-- arch/i386/kernel/machine_kexec.c | 8 +------- arch/i386/kernel/process.c | 16 ++++++---------- arch/i386/kernel/smp.c | 2 +- arch/i386/mm/fault.c | 6 +++--- arch/i386/mm/pageattr.c | 2 +- arch/i386/power/cpu.c | 16 ++++++++-------- include/asm-i386/agp.h | 2 +- include/asm-i386/bugs.h | 5 ++++- include/asm-i386/processor.h | 22 +++++++++------------- include/asm-i386/system.h | 28 +++++++++++++++++++++++++--- include/asm-i386/xor.h | 26 +++++++++++++------------- 15 files changed, 84 insertions(+), 83 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 4553ffd94b1..361f2e7ccb1 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -642,12 +642,12 @@ void __devinit cpu_init(void) asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs"); /* Clear all 6 debug registers: */ - -#define CD(register) set_debugreg(0, register) - - CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7); - -#undef CD + set_debugreg(0, 0); + set_debugreg(0, 1); + set_debugreg(0, 2); + set_debugreg(0, 3); + set_debugreg(0, 6); + set_debugreg(0, 7); /* * Force FPU initialization: diff --git a/arch/i386/kernel/cpu/cpufreq/longhaul.c b/arch/i386/kernel/cpu/cpufreq/longhaul.c index 04e3563da4f..bf02b5026e6 100644 --- a/arch/i386/kernel/cpu/cpufreq/longhaul.c +++ b/arch/i386/kernel/cpu/cpufreq/longhaul.c @@ -64,8 +64,6 @@ static int dont_scale_voltage; #define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "longhaul", msg) -#define __hlt() __asm__ __volatile__("hlt": : :"memory") - /* Clock ratios multiplied by 10 */ static int clock_ratio[32]; static int eblcr_table[32]; @@ -168,11 +166,9 @@ static void do_powersaver(union msr_longhaul *longhaul, outb(0xFE,0x21); /* TMR0 only */ outb(0xFF,0x80); /* delay */ - local_irq_enable(); - - __hlt(); + safe_halt(); wrmsrl(MSR_VIA_LONGHAUL, longhaul->val); - __hlt(); + halt(); local_irq_disable(); @@ -251,9 +247,7 @@ static void longhaul_setstate(unsigned int clock_ratio_index) bcr2.bits.CLOCKMUL = clock_ratio_index; local_irq_disable(); wrmsrl (MSR_VIA_BCR2, bcr2.val); - local_irq_enable(); - - __hlt(); + safe_halt(); /* Disable software clock multiplier */ rdmsrl (MSR_VIA_BCR2, bcr2.val); diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c index ba4b01138c8..ff87cc22b32 100644 --- a/arch/i386/kernel/cpu/cyrix.c +++ b/arch/i386/kernel/cpu/cyrix.c @@ -132,11 +132,7 @@ static void __init set_cx86_memwb(void) setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04); /* set 'Not Write-through' */ cr0 = 0x20000000; - __asm__("movl %%cr0,%%eax\n\t" - "orl %0,%%eax\n\t" - "movl %%eax,%%cr0\n" - : : "r" (cr0) - :"ax"); + write_cr0(read_cr0() | cr0); /* CCR2 bit 2: lock NW bit and set WT1 */ setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14 ); } diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c index 850648ae830..921fdb15fc9 100644 --- a/arch/i386/kernel/efi.c +++ b/arch/i386/kernel/efi.c @@ -79,7 +79,7 @@ static void efi_call_phys_prelog(void) * directory. 
If I have PSE, I just need to duplicate one entry in * page directory. */ - __asm__ __volatile__("movl %%cr4, %0":"=r"(cr4)); + cr4 = read_cr4(); if (cr4 & X86_CR4_PSE) { efi_bak_pg_dir_pointer[0].pgd = @@ -115,7 +115,7 @@ static void efi_call_phys_epilog(void) cpu_gdt_descr[0].address = (unsigned long) __va(cpu_gdt_descr[0].address); __asm__ __volatile__("lgdt %0":"=m"(cpu_gdt_descr)); - __asm__ __volatile__("movl %%cr4, %0":"=r"(cr4)); + cr4 = read_cr4(); if (cr4 & X86_CR4_PSE) { swapper_pg_dir[pgd_index(0)].pgd = diff --git a/arch/i386/kernel/machine_kexec.c b/arch/i386/kernel/machine_kexec.c index cb699a2aa1f..f19f6d34bcb 100644 --- a/arch/i386/kernel/machine_kexec.c +++ b/arch/i386/kernel/machine_kexec.c @@ -17,13 +17,7 @@ #include #include #include - -static inline unsigned long read_cr3(void) -{ - unsigned long cr3; - asm volatile("movl %%cr3,%0": "=r"(cr3)); - return cr3; -} +#include #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index e3f362e8af5..761d4ed47ef 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -313,16 +313,12 @@ void show_regs(struct pt_regs * regs) printk(" DS: %04x ES: %04x\n", 0xffff & regs->xds,0xffff & regs->xes); - __asm__("movl %%cr0, %0": "=r" (cr0)); - __asm__("movl %%cr2, %0": "=r" (cr2)); - __asm__("movl %%cr3, %0": "=r" (cr3)); - /* This could fault if %cr4 does not exist */ - __asm__("1: movl %%cr4, %0 \n" - "2: \n" - ".section __ex_table,\"a\" \n" - ".long 1b,2b \n" - ".previous \n" - : "=r" (cr4): "0" (0)); + cr0 = read_cr0(); + cr2 = read_cr2(); + cr3 = read_cr3(); + if (current_cpu_data.x86 > 4) { + cr4 = read_cr4(); + } printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); show_trace(NULL, &regs->esp); } diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index cec4bde6716..48b55db3680 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -576,7 +576,7 @@ static void stop_this_cpu (void * dummy) local_irq_disable(); disable_local_APIC(); if (cpu_data[smp_processor_id()].hlt_works_ok) - for(;;) __asm__("hlt"); + for(;;) halt(); for (;;); } diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index 61d9e34af5a..411b8500ad1 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c @@ -233,7 +233,7 @@ fastcall void do_page_fault(struct pt_regs *regs, unsigned long error_code) int write, si_code; /* get the address */ - __asm__("movl %%cr2,%0":"=r" (address)); + address = read_cr2(); if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, SIGSEGV) == NOTIFY_STOP) @@ -453,7 +453,7 @@ no_context: printk(" at virtual address %08lx\n",address); printk(KERN_ALERT " printing eip:\n"); printk("%08lx\n", regs->eip); - asm("movl %%cr3,%0":"=r" (page)); + page = read_cr3(); page = ((unsigned long *) __va(page))[address >> 22]; printk(KERN_ALERT "*pde = %08lx\n", page); /* @@ -526,7 +526,7 @@ vmalloc_fault: pmd_t *pmd, *pmd_k; pte_t *pte_k; - asm("movl %%cr3,%0":"=r" (pgd_paddr)); + pgd_paddr = read_cr3(); pgd = index + (pgd_t *)__va(pgd_paddr); pgd_k = init_mm.pgd + index; diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c index cb3da6baa70..bce06a79eaf 100644 --- a/arch/i386/mm/pageattr.c +++ b/arch/i386/mm/pageattr.c @@ -62,7 +62,7 @@ static void flush_kernel_map(void *dummy) { /* Could use CLFLUSH here if the CPU supports it (Hammer,P4) */ if (boot_cpu_data.x86_model >= 4) - asm volatile("wbinvd":::"memory"); + wbinvd(); /* Flush all to work around Errata in early athlons 
regarding * large page flushing. */ diff --git a/arch/i386/power/cpu.c b/arch/i386/power/cpu.c index c547c1af6fa..4e19c43e095 100644 --- a/arch/i386/power/cpu.c +++ b/arch/i386/power/cpu.c @@ -57,10 +57,10 @@ void __save_processor_state(struct saved_context *ctxt) /* * control registers */ - asm volatile ("movl %%cr0, %0" : "=r" (ctxt->cr0)); - asm volatile ("movl %%cr2, %0" : "=r" (ctxt->cr2)); - asm volatile ("movl %%cr3, %0" : "=r" (ctxt->cr3)); - asm volatile ("movl %%cr4, %0" : "=r" (ctxt->cr4)); + ctxt->cr0 = read_cr0(); + ctxt->cr2 = read_cr2(); + ctxt->cr3 = read_cr3(); + ctxt->cr4 = read_cr4(); } void save_processor_state(void) @@ -109,10 +109,10 @@ void __restore_processor_state(struct saved_context *ctxt) /* * control registers */ - asm volatile ("movl %0, %%cr4" :: "r" (ctxt->cr4)); - asm volatile ("movl %0, %%cr3" :: "r" (ctxt->cr3)); - asm volatile ("movl %0, %%cr2" :: "r" (ctxt->cr2)); - asm volatile ("movl %0, %%cr0" :: "r" (ctxt->cr0)); + write_cr4(ctxt->cr4); + write_cr3(ctxt->cr3); + write_cr2(ctxt->cr2); + write_cr2(ctxt->cr0); /* * now restore the descriptor tables to their proper values diff --git a/include/asm-i386/agp.h b/include/asm-i386/agp.h index b82f5f3ab88..9075083bab7 100644 --- a/include/asm-i386/agp.h +++ b/include/asm-i386/agp.h @@ -19,7 +19,7 @@ int unmap_page_from_agp(struct page *page); /* Could use CLFLUSH here if the cpu supports it. But then it would need to be called for each cacheline of the whole page so it may not be worth it. Would need a page for it. */ -#define flush_agp_cache() asm volatile("wbinvd":::"memory") +#define flush_agp_cache() wbinvd() /* Convert a physical address to an address suitable for the GART. */ #define phys_to_gart(x) (x) diff --git a/include/asm-i386/bugs.h b/include/asm-i386/bugs.h index 6789fc275da..ea54540638d 100644 --- a/include/asm-i386/bugs.h +++ b/include/asm-i386/bugs.h @@ -118,7 +118,10 @@ static void __init check_hlt(void) printk("disabled\n"); return; } - __asm__ __volatile__("hlt ; hlt ; hlt ; hlt"); + halt(); + halt(); + halt(); + halt(); printk("OK.\n"); } diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index d0d8b016009..7e17d3b4f65 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -203,9 +203,7 @@ static inline unsigned int cpuid_edx(unsigned int op) return edx; } -#define load_cr3(pgdir) \ - asm volatile("movl %0,%%cr3": :"r" (__pa(pgdir))) - +#define load_cr3(pgdir) write_cr3(__pa(pgdir)) /* * Intel CPU features in CR4 @@ -232,22 +230,20 @@ extern unsigned long mmu_cr4_features; static inline void set_in_cr4 (unsigned long mask) { + unsigned cr4; mmu_cr4_features |= mask; - __asm__("movl %%cr4,%%eax\n\t" - "orl %0,%%eax\n\t" - "movl %%eax,%%cr4\n" - : : "irg" (mask) - :"ax"); + cr4 = read_cr4(); + cr4 |= mask; + write_cr4(cr4); } static inline void clear_in_cr4 (unsigned long mask) { + unsigned cr4; mmu_cr4_features &= ~mask; - __asm__("movl %%cr4,%%eax\n\t" - "andl %0,%%eax\n\t" - "movl %%eax,%%cr4\n" - : : "irg" (~mask) - :"ax"); + cr4 = read_cr4(); + cr4 &= ~mask; + write_cr4(cr4); } /* diff --git a/include/asm-i386/system.h b/include/asm-i386/system.h index 3db717a244f..8048a5e018c 100644 --- a/include/asm-i386/system.h +++ b/include/asm-i386/system.h @@ -107,13 +107,33 @@ static inline unsigned long _get_base(char * addr) #define clts() __asm__ __volatile__ ("clts") #define read_cr0() ({ \ unsigned int __dummy; \ - __asm__( \ + __asm__ __volatile__( \ "movl %%cr0,%0\n\t" \ :"=r" (__dummy)); \ __dummy; \ }) #define write_cr0(x) \ - 
__asm__("movl %0,%%cr0": :"r" (x)); + __asm__ __volatile__("movl %0,%%cr0": :"r" (x)); + +#define read_cr2() ({ \ + unsigned int __dummy; \ + __asm__ __volatile__( \ + "movl %%cr2,%0\n\t" \ + :"=r" (__dummy)); \ + __dummy; \ +}) +#define write_cr2(x) \ + __asm__ __volatile__("movl %0,%%cr2": :"r" (x)); + +#define read_cr3() ({ \ + unsigned int __dummy; \ + __asm__ ( \ + "movl %%cr3,%0\n\t" \ + :"=r" (__dummy)); \ + __dummy; \ +}) +#define write_cr3(x) \ + __asm__ __volatile__("movl %0,%%cr3": :"r" (x)); #define read_cr4() ({ \ unsigned int __dummy; \ @@ -123,7 +143,7 @@ static inline unsigned long _get_base(char * addr) __dummy; \ }) #define write_cr4(x) \ - __asm__("movl %0,%%cr4": :"r" (x)); + __asm__ __volatile__("movl %0,%%cr4": :"r" (x)); #define stts() write_cr0(8 | read_cr0()) #endif /* __KERNEL__ */ @@ -447,6 +467,8 @@ struct alt_instr { #define local_irq_enable() __asm__ __volatile__("sti": : :"memory") /* used in the idle loop; sti takes one instruction cycle to complete */ #define safe_halt() __asm__ __volatile__("sti; hlt": : :"memory") +/* used when interrupts are already enabled or to shutdown the processor */ +#define halt() __asm__ __volatile__("hlt": : :"memory") #define irqs_disabled() \ ({ \ diff --git a/include/asm-i386/xor.h b/include/asm-i386/xor.h index f80e2dbe1b5..23c86cef3b2 100644 --- a/include/asm-i386/xor.h +++ b/include/asm-i386/xor.h @@ -535,14 +535,14 @@ static struct xor_block_template xor_block_p5_mmx = { #define XMMS_SAVE do { \ preempt_disable(); \ + cr0 = read_cr0(); \ + clts(); \ __asm__ __volatile__ ( \ - "movl %%cr0,%0 ;\n\t" \ - "clts ;\n\t" \ - "movups %%xmm0,(%1) ;\n\t" \ - "movups %%xmm1,0x10(%1) ;\n\t" \ - "movups %%xmm2,0x20(%1) ;\n\t" \ - "movups %%xmm3,0x30(%1) ;\n\t" \ - : "=&r" (cr0) \ + "movups %%xmm0,(%0) ;\n\t" \ + "movups %%xmm1,0x10(%0) ;\n\t" \ + "movups %%xmm2,0x20(%0) ;\n\t" \ + "movups %%xmm3,0x30(%0) ;\n\t" \ + : \ : "r" (xmm_save) \ : "memory"); \ } while(0) @@ -550,14 +550,14 @@ static struct xor_block_template xor_block_p5_mmx = { #define XMMS_RESTORE do { \ __asm__ __volatile__ ( \ "sfence ;\n\t" \ - "movups (%1),%%xmm0 ;\n\t" \ - "movups 0x10(%1),%%xmm1 ;\n\t" \ - "movups 0x20(%1),%%xmm2 ;\n\t" \ - "movups 0x30(%1),%%xmm3 ;\n\t" \ - "movl %0,%%cr0 ;\n\t" \ + "movups (%0),%%xmm0 ;\n\t" \ + "movups 0x10(%0),%%xmm1 ;\n\t" \ + "movups 0x20(%0),%%xmm2 ;\n\t" \ + "movups 0x30(%0),%%xmm3 ;\n\t" \ : \ - : "r" (cr0), "r" (xmm_save) \ + : "r" (xmm_save) \ : "memory"); \ + write_cr0(cr0); \ preempt_enable(); \ } while(0) -- cgit v1.2.3-70-g09d2 From 245067d1674d451855692fcd4647daf9fd47f82d Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 3 Sep 2005 15:56:37 -0700 Subject: [PATCH] i386: cleanup serialize msr i386 arch cleanup. Introduce the serialize macro to serialize processor state. Why the microcode update needs it I am not quite sure, since wrmsr() is already a serializing instruction, but it is a microcode update, so I will keep the semantic the same, since this could be a timing workaround. As far as I can tell, this has always been there since the original microcode update source. 
Signed-off-by: Zachary Amsden Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/microcode.c | 7 +++++-- include/asm-i386/processor.h | 5 +++++ include/asm-x86_64/processor.h | 5 +++++ 3 files changed, 15 insertions(+), 2 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/microcode.c b/arch/i386/kernel/microcode.c index a77c612aad0..165f13158c6 100644 --- a/arch/i386/kernel/microcode.c +++ b/arch/i386/kernel/microcode.c @@ -164,7 +164,8 @@ static void collect_cpu_info (void *unused) } wrmsr(MSR_IA32_UCODE_REV, 0, 0); - __asm__ __volatile__ ("cpuid" : : : "ax", "bx", "cx", "dx"); + /* see notes above for revision 1.07. Apparent chip bug */ + serialize_cpu(); /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], uci->rev); pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", @@ -377,7 +378,9 @@ static void do_update_one (void * unused) (unsigned long) uci->mc->bits >> 16 >> 16); wrmsr(MSR_IA32_UCODE_REV, 0, 0); - __asm__ __volatile__ ("cpuid" : : : "ax", "bx", "cx", "dx"); + /* see notes above for revision 1.07. Apparent chip bug */ + serialize_cpu(); + /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index 7e17d3b4f65..a0489b22270 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -277,6 +277,11 @@ static inline void clear_in_cr4 (unsigned long mask) outb((data), 0x23); \ } while (0) +static inline void serialize_cpu(void) +{ + __asm__ __volatile__ ("cpuid" : : : "ax", "bx", "cx", "dx"); +} + static inline void __monitor(const void *eax, unsigned long ecx, unsigned long edx) { diff --git a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h index 85549e656ee..194160f6a43 100644 --- a/include/asm-x86_64/processor.h +++ b/include/asm-x86_64/processor.h @@ -437,6 +437,11 @@ static inline void prefetchw(void *x) outb((data), 0x23); \ } while (0) +static inline void serialize_cpu(void) +{ + __asm__ __volatile__ ("cpuid" : : : "ax", "bx", "cx", "dx"); +} + static inline void __monitor(const void *eax, unsigned long ecx, unsigned long edx) { -- cgit v1.2.3-70-g09d2 From 4d37e7e3fd851428dede4d05d3e69d03795a744a Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 3 Sep 2005 15:56:38 -0700 Subject: [PATCH] i386: inline assembler: cleanup and encapsulate descriptor and task register management i386 inline assembler cleanup. This change encapsulates descriptor and task register management. Also, it is possible to improve assembler generation in two cases; savesegment may store the value in a register instead of a memory location, which allows GCC to optimize stack variables into registers, and MOV MEM, SEG is always a 16-bit write to memory, making the casting in math-emu unnecessary. 
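As a side note for readers outside the tree, a stripped-down sketch of the savesegment() idea follows. It is illustrative only: the macro is a simplified copy of the one added to include/asm-i386/system.h, the function around it is made up, and the kernel's loadsegment() counterpart additionally carries an exception-fixup table that is omitted here.

#define savesegment(seg, value) \
	asm volatile("mov %%" #seg ",%0" : "=rm" (value))

unsigned int read_fs_selector(void)
{
	/*
	 * Initialized up front: if the compiler picks the "m" alternative,
	 * the store is only 16 bits wide and the upper half stays as-is.
	 */
	unsigned int sel = 0;

	/*
	 * With "=rm" the compiler may choose a register, so this can
	 * compile to a single "mov %fs,%eax" with no stack slot at all;
	 * the old "=m" constraint always forced a memory operand.
	 */
	savesegment(fs, sel);
	return sel;
}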
Signed-off-by: Zachary Amsden Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/cpu/common.c | 4 ++-- arch/i386/kernel/doublefault.c | 2 +- arch/i386/kernel/efi.c | 5 ++--- arch/i386/kernel/reboot.c | 9 +++++---- arch/i386/kernel/signal.c | 4 ++-- arch/i386/kernel/traps.c | 2 +- arch/i386/kernel/vm86.c | 4 ++-- arch/i386/math-emu/get_address.c | 13 +++---------- arch/i386/power/cpu.c | 26 +++++++++++++------------- include/asm-i386/desc.h | 10 ++++++++++ include/asm-i386/system.h | 4 ++-- 11 files changed, 43 insertions(+), 40 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 361f2e7ccb1..46ce9b248f5 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -613,8 +613,8 @@ void __devinit cpu_init(void) memcpy(thread->tls_array, &per_cpu(cpu_gdt_table, cpu), GDT_ENTRY_TLS_ENTRIES * 8); - __asm__ __volatile__("lgdt %0" : : "m" (cpu_gdt_descr[cpu])); - __asm__ __volatile__("lidt %0" : : "m" (idt_descr)); + load_gdt(&cpu_gdt_descr[cpu]); + load_idt(&idt_descr); /* * Delete NT diff --git a/arch/i386/kernel/doublefault.c b/arch/i386/kernel/doublefault.c index 789af3e9fb1..5edb1d379ad 100644 --- a/arch/i386/kernel/doublefault.c +++ b/arch/i386/kernel/doublefault.c @@ -20,7 +20,7 @@ static void doublefault_fn(void) struct Xgt_desc_struct gdt_desc = {0, 0}; unsigned long gdt, tss; - __asm__ __volatile__("sgdt %0": "=m" (gdt_desc): :"memory"); + store_gdt(&gdt_desc); gdt = gdt_desc.address; printk("double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size); diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c index 921fdb15fc9..ecad519fd39 100644 --- a/arch/i386/kernel/efi.c +++ b/arch/i386/kernel/efi.c @@ -104,8 +104,7 @@ static void efi_call_phys_prelog(void) local_flush_tlb(); cpu_gdt_descr[0].address = __pa(cpu_gdt_descr[0].address); - __asm__ __volatile__("lgdt %0":"=m" - (*(struct Xgt_desc_struct *) __pa(&cpu_gdt_descr[0]))); + load_gdt((struct Xgt_desc_struct *) __pa(&cpu_gdt_descr[0])); } static void efi_call_phys_epilog(void) @@ -114,7 +113,7 @@ static void efi_call_phys_epilog(void) cpu_gdt_descr[0].address = (unsigned long) __va(cpu_gdt_descr[0].address); - __asm__ __volatile__("lgdt %0":"=m"(cpu_gdt_descr)); + load_gdt(&cpu_gdt_descr[0]); cr4 = read_cr4(); if (cr4 & X86_CR4_PSE) { diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c index c71fef31dc4..1cbb9c0f470 100644 --- a/arch/i386/kernel/reboot.c +++ b/arch/i386/kernel/reboot.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "mach_reboot.h" #include @@ -242,13 +243,13 @@ void machine_real_restart(unsigned char *code, int length) /* Set up the IDT for real mode. */ - __asm__ __volatile__ ("lidt %0" : : "m" (real_mode_idt)); + load_idt(&real_mode_idt); /* Set up a GDT from which we can load segment descriptors for real mode. The GDT is not used in real mode; it is just needed here to prepare the descriptors. */ - __asm__ __volatile__ ("lgdt %0" : : "m" (real_mode_gdt)); + load_gdt(&real_mode_gdt); /* Load the data segment registers, and thus the descriptors ready for real mode. 
The base address of each segment is 0x100, 16 times the @@ -316,7 +317,7 @@ void machine_emergency_restart(void) if (!reboot_thru_bios) { if (efi_enabled) { efi.reset_system(EFI_RESET_COLD, EFI_SUCCESS, 0, NULL); - __asm__ __volatile__("lidt %0": :"m" (no_idt)); + load_idt(&no_idt); __asm__ __volatile__("int3"); } /* rebooting needs to touch the page at absolute addr 0 */ @@ -325,7 +326,7 @@ void machine_emergency_restart(void) mach_reboot_fixups(); /* for board specific fixups */ mach_reboot(); /* That didn't work - force a triple fault.. */ - __asm__ __volatile__("lidt %0": :"m" (no_idt)); + load_idt(&no_idt); __asm__ __volatile__("int3"); } } diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c index 140e340569c..7bcda6d69d8 100644 --- a/arch/i386/kernel/signal.c +++ b/arch/i386/kernel/signal.c @@ -278,9 +278,9 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, int tmp, err = 0; tmp = 0; - __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp)); + savesegment(gs, tmp); err |= __put_user(tmp, (unsigned int __user *)&sc->gs); - __asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp)); + savesegment(fs, tmp); err |= __put_user(tmp, (unsigned int __user *)&sc->fs); err |= __put_user(regs->xes, (unsigned int __user *)&sc->es); diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index cd2d5d5514f..1144df0c48d 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -1008,7 +1008,7 @@ void __init trap_init_f00f_bug(void) * it uses the read-only mapped virtual address. */ idt_descr.address = fix_to_virt(FIX_F00F_IDT); - __asm__ __volatile__("lidt %0" : : "m" (idt_descr)); + load_idt(&idt_descr); } #endif diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c index 2daa06fb4a8..16b48500962 100644 --- a/arch/i386/kernel/vm86.c +++ b/arch/i386/kernel/vm86.c @@ -294,8 +294,8 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk */ info->regs32->eax = 0; tsk->thread.saved_esp0 = tsk->thread.esp0; - asm volatile("mov %%fs,%0":"=m" (tsk->thread.saved_fs)); - asm volatile("mov %%gs,%0":"=m" (tsk->thread.saved_gs)); + savesegment(fs, tsk->thread.saved_fs); + savesegment(gs, tsk->thread.saved_gs); tss = &per_cpu(init_tss, get_cpu()); tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; diff --git a/arch/i386/math-emu/get_address.c b/arch/i386/math-emu/get_address.c index 91175738e94..9819b705efa 100644 --- a/arch/i386/math-emu/get_address.c +++ b/arch/i386/math-emu/get_address.c @@ -155,7 +155,6 @@ static long pm_address(u_char FPU_modrm, u_char segment, { struct desc_struct descriptor; unsigned long base_address, limit, address, seg_top; - unsigned short selector; segment--; @@ -173,17 +172,11 @@ static long pm_address(u_char FPU_modrm, u_char segment, /* fs and gs aren't used by the kernel, so they still have their user-space values. */ case PREFIX_FS_-1: - /* The cast is needed here to get gcc 2.8.0 to use a 16 bit register - in the assembler statement. */ - - __asm__("mov %%fs,%0":"=r" (selector)); - addr->selector = selector; + /* N.B. - movl %seg, mem is a 2 byte write regardless of prefix */ + savesegment(fs, addr->selector); break; case PREFIX_GS_-1: - /* The cast is needed here to get gcc 2.8.0 to use a 16 bit register - in the assembler statement. 
*/ - __asm__("mov %%gs,%0":"=r" (selector)); - addr->selector = selector; + savesegment(gs, addr->selector); break; default: addr->selector = PM_REG_(segment); diff --git a/arch/i386/power/cpu.c b/arch/i386/power/cpu.c index 4e19c43e095..c7a6436aa93 100644 --- a/arch/i386/power/cpu.c +++ b/arch/i386/power/cpu.c @@ -42,17 +42,17 @@ void __save_processor_state(struct saved_context *ctxt) /* * descriptor tables */ - asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit)); - asm volatile ("sidt %0" : "=m" (ctxt->idt_limit)); - asm volatile ("str %0" : "=m" (ctxt->tr)); + store_gdt(&ctxt->gdt_limit); + store_idt(&ctxt->idt_limit); + store_tr(ctxt->tr); /* * segment registers */ - asm volatile ("movw %%es, %0" : "=m" (ctxt->es)); - asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs)); - asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs)); - asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss)); + savesegment(es, ctxt->es); + savesegment(fs, ctxt->fs); + savesegment(gs, ctxt->gs); + savesegment(ss, ctxt->ss); /* * control registers @@ -118,16 +118,16 @@ void __restore_processor_state(struct saved_context *ctxt) * now restore the descriptor tables to their proper values * ltr is done i fix_processor_context(). */ - asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit)); - asm volatile ("lidt %0" :: "m" (ctxt->idt_limit)); + load_gdt(&ctxt->gdt_limit); + load_idt(&ctxt->idt_limit); /* * segment registers */ - asm volatile ("movw %0, %%es" :: "r" (ctxt->es)); - asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs)); - asm volatile ("movw %0, %%gs" :: "r" (ctxt->gs)); - asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss)); + loadsegment(es, ctxt->es); + loadsegment(fs, ctxt->fs); + loadsegment(gs, ctxt->gs); + loadsegment(ss, ctxt->ss); /* * sysenter MSRs diff --git a/include/asm-i386/desc.h b/include/asm-i386/desc.h index 11e67811a99..9a0b85a52e7 100644 --- a/include/asm-i386/desc.h +++ b/include/asm-i386/desc.h @@ -30,6 +30,16 @@ extern struct Xgt_desc_struct idt_descr, cpu_gdt_descr[NR_CPUS]; #define load_TR_desc() __asm__ __volatile__("ltr %%ax"::"a" (GDT_ENTRY_TSS*8)) #define load_LDT_desc() __asm__ __volatile__("lldt %%ax"::"a" (GDT_ENTRY_LDT*8)) +#define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr)) +#define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr)) +#define load_tr(tr) __asm__ __volatile("ltr %0"::"mr" (tr)) +#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"mr" (ldt)) + +#define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr)) +#define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr)) +#define store_tr(tr) __asm__ ("str %0":"=mr" (tr)) +#define store_ldt(ldt) __asm__ ("sldt %0":"=mr" (ldt)) + /* * This is the ldt that every process will get unless we need * something other than this. 
diff --git a/include/asm-i386/system.h b/include/asm-i386/system.h index 8048a5e018c..37fd2f8c719 100644 --- a/include/asm-i386/system.h +++ b/include/asm-i386/system.h @@ -93,13 +93,13 @@ static inline unsigned long _get_base(char * addr) ".align 4\n\t" \ ".long 1b,3b\n" \ ".previous" \ - : :"m" (value)) + : :"rm" (value)) /* * Save a segment register away */ #define savesegment(seg, value) \ - asm volatile("mov %%" #seg ",%0":"=m" (value)) + asm volatile("mov %%" #seg ",%0":"=rm" (value)) /* * Clear and set 'TS' bit respectively -- cgit v1.2.3-70-g09d2 From e7a2ff593c0e48b130434dee4d2fd3452a850e6f Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 3 Sep 2005 15:56:39 -0700 Subject: [PATCH] i386: load_tls() fix Subtle fix: load_TLS has been moved after saving %fs and %gs segments to avoid creating non-reversible segments. This could conceivably cause a bug if the kernel ever needed to save and restore fs/gs from the NMI handler. It currently does not, but this is the safest approach to avoiding fs/gs corruption. SMIs are safe, since SMI saves the descriptor hidden state. Signed-off-by: Zachary Amsden Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/process.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 761d4ed47ef..9d94995e967 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -678,21 +678,26 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas __unlazy_fpu(prev_p); /* - * Reload esp0, LDT and the page table pointer: + * Reload esp0. */ load_esp0(tss, next); /* - * Load the per-thread Thread-Local Storage descriptor. + * Save away %fs and %gs. No need to save %es and %ds, as + * those are always kernel segments while inside the kernel. + * Doing this before setting the new TLS descriptors avoids + * the situation where we temporarily have non-reloadable + * segments in %fs and %gs. This could be an issue if the + * NMI handler ever used %fs or %gs (it does not today), or + * if the kernel is running inside of a hypervisor layer. */ - load_TLS(next, cpu); + savesegment(fs, prev->fs); + savesegment(gs, prev->gs); /* - * Save away %fs and %gs. No need to save %es and %ds, as - * those are always kernel segments while inside the kernel. + * Load the per-thread Thread-Local Storage descriptor. */ - asm volatile("mov %%fs,%0":"=m" (prev->fs)); - asm volatile("mov %%gs,%0":"=m" (prev->gs)); + load_TLS(next, cpu); /* * Restore %fs and %gs if needed. -- cgit v1.2.3-70-g09d2 From c9b02a24130e3ff14a553d966a79f46cf806b037 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 3 Sep 2005 15:56:40 -0700 Subject: [PATCH] i386: use set_pte macros in a couple places where they were missing Also, setting PDPEs in PAE mode does not require atomic operations, since the PDPEs are cached by the processor, and only reloaded on an explicit or implicit reload of CR3. Since the four PDPEs must always be present in an active root, and the kernel PDPE is never updated, we are safe even from SMIs and interrupts / NMIs using task gates (which reload CR3). Actually, much of this is moot, since the user PDPEs are never updated either, and the only usage of task gates is by the doublefault handler. 
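A bare-bones sketch of the distinction may make this concrete; the typedefs and extern declarations below are hypothetical stand-ins, not the kernel's pgtable headers:

typedef struct { unsigned long long pmd; } pmd_t;
typedef struct { unsigned long long pud; } pud_t;

extern void set_64bit(unsigned long long *ptr, unsigned long long val);
extern unsigned long long pmd_val(pmd_t pmd);

/*
 * A live 64-bit pmd/pte entry must be written in one shot on 32-bit
 * PAE, because the MMU may walk the table between the two halves of a
 * split store; set_64bit() is a lock cmpxchg8b loop underneath.
 */
#define set_pmd(pmdptr, pmdval) \
	set_64bit((unsigned long long *)(pmdptr), pmd_val(pmdval))

/*
 * PDPEs are fetched only when cr3 is loaded, so no walker can observe
 * a half-written value and a plain compiler-generated store suffices.
 */
#define set_pud(pudptr, pudval) \
	(*(pudptr) = (pudval))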
It appears the only place PGDs get updated in PAE mode is in init_low_mappings() / zap_low_mapping() for initial page table creation and recovery from ACPI sleep state, and these sites are safe by inspection. Getting rid of the cmpxchg8b saves code space and 720 cycles in pgd_alloc on P4. Signed-off-by: Zachary Amsden Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/mm/init.c | 2 +- arch/i386/mm/pageattr.c | 5 +++-- include/asm-i386/pgtable-3level.h | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index d8b23ab7653..9edfc058b89 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -349,7 +349,7 @@ static void __init pagetable_init (void) * All user-space mappings are explicitly cleared after * SMP startup. */ - pgd_base[0] = pgd_base[USER_PTRS_PER_PGD]; + set_pgd(&pgd_base[0], pgd_base[USER_PTRS_PER_PGD]); #endif } diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c index bce06a79eaf..f600fc244f0 100644 --- a/arch/i386/mm/pageattr.c +++ b/arch/i386/mm/pageattr.c @@ -12,6 +12,7 @@ #include #include #include +#include static DEFINE_SPINLOCK(cpa_lock); static struct list_head df_list = LIST_HEAD_INIT(df_list); @@ -52,8 +53,8 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot) addr = address & LARGE_PAGE_MASK; pbase = (pte_t *)page_address(base); for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { - pbase[i] = pfn_pte(addr >> PAGE_SHIFT, - addr == address ? prot : PAGE_KERNEL); + set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, + addr == address ? prot : PAGE_KERNEL)); } return base; } diff --git a/include/asm-i386/pgtable-3level.h b/include/asm-i386/pgtable-3level.h index d609f9c2c1f..2e3f4a344a2 100644 --- a/include/asm-i386/pgtable-3level.h +++ b/include/asm-i386/pgtable-3level.h @@ -64,7 +64,7 @@ static inline void set_pte(pte_t *ptep, pte_t pte) #define set_pmd(pmdptr,pmdval) \ set_64bit((unsigned long long *)(pmdptr),pmd_val(pmdval)) #define set_pud(pudptr,pudval) \ - set_64bit((unsigned long long *)(pudptr),pud_val(pudval)) + (*(pudptr) = (pudval)) /* * Pentium-II erratum A13: in PAE mode we explicitly have to flush -- cgit v1.2.3-70-g09d2 From f2ab4461249df85b20930a7a57b54f39c5ae291a Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 3 Sep 2005 15:56:42 -0700 Subject: [PATCH] x86: more asm cleanups Some more assembler cleanups I noticed along the way. Signed-off-by: Zachary Amsden Cc: "H. 
Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/cpu/intel.c | 9 +++------ arch/i386/kernel/crash.c | 2 +- arch/i386/kernel/machine_kexec.c | 10 ++-------- arch/i386/kernel/msr.c | 31 ++++++------------------------- arch/i386/kernel/process.c | 2 +- arch/i386/mach-voyager/voyager_basic.c | 14 ++++++-------- arch/i386/mach-voyager/voyager_smp.c | 2 +- include/asm-i386/msr.h | 15 +++++++++++++++ 8 files changed, 35 insertions(+), 50 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c index a2c33c1a46c..43601de0f63 100644 --- a/arch/i386/kernel/cpu/intel.c +++ b/arch/i386/kernel/cpu/intel.c @@ -82,16 +82,13 @@ static void __devinit Intel_errata_workarounds(struct cpuinfo_x86 *c) */ static int __devinit num_cpu_cores(struct cpuinfo_x86 *c) { - unsigned int eax; + unsigned int eax, ebx, ecx, edx; if (c->cpuid_level < 4) return 1; - __asm__("cpuid" - : "=a" (eax) - : "0" (4), "c" (0) - : "bx", "dx"); - + /* Intel has a non-standard dependency on %ecx for this CPUID level. */ + cpuid_count(4, 0, &eax, &ebx, &ecx, &edx); if (eax & 0x1f) return ((eax >> 26) + 1); else diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c index e5fab12f792..913be77bb84 100644 --- a/arch/i386/kernel/crash.c +++ b/arch/i386/kernel/crash.c @@ -153,7 +153,7 @@ static int crash_nmi_callback(struct pt_regs *regs, int cpu) disable_local_APIC(); atomic_dec(&waiting_for_crash_ipi); /* Assume hlt works */ - __asm__("hlt"); + halt(); for(;;); return 1; diff --git a/arch/i386/kernel/machine_kexec.c b/arch/i386/kernel/machine_kexec.c index f19f6d34bcb..a912fed4848 100644 --- a/arch/i386/kernel/machine_kexec.c +++ b/arch/i386/kernel/machine_kexec.c @@ -93,10 +93,7 @@ static void set_idt(void *newidt, __u16 limit) curidt.size = limit; curidt.address = (unsigned long)newidt; - __asm__ __volatile__ ( - "lidtl %0\n" - : : "m" (curidt) - ); + load_idt(&curidt); }; @@ -108,10 +105,7 @@ static void set_gdt(void *newgdt, __u16 limit) curgdt.size = limit; curgdt.address = (unsigned long)newgdt; - __asm__ __volatile__ ( - "lgdtl %0\n" - : : "m" (curgdt) - ); + load_gdt(&curgdt); }; static void load_segments(void) diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c index b2f03c39a6f..03100d6fc5d 100644 --- a/arch/i386/kernel/msr.c +++ b/arch/i386/kernel/msr.c @@ -46,23 +46,13 @@ static struct class *msr_class; -/* Note: "err" is handled in a funny way below. Otherwise one version - of gcc or another breaks. 
*/ - static inline int wrmsr_eio(u32 reg, u32 eax, u32 edx) { int err; - asm volatile ("1: wrmsr\n" - "2:\n" - ".section .fixup,\"ax\"\n" - "3: movl %4,%0\n" - " jmp 2b\n" - ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" " .long 1b,3b\n" ".previous":"=&bDS" (err) - :"a"(eax), "d"(edx), "c"(reg), "i"(-EIO), "0"(0)); - + err = wrmsr_safe(reg, eax, edx); + if (err) + err = -EIO; return err; } @@ -70,18 +60,9 @@ static inline int rdmsr_eio(u32 reg, u32 *eax, u32 *edx) { int err; - asm volatile ("1: rdmsr\n" - "2:\n" - ".section .fixup,\"ax\"\n" - "3: movl %4,%0\n" - " jmp 2b\n" - ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 1b,3b\n" - ".previous":"=&bDS" (err), "=a"(*eax), "=d"(*edx) - :"c"(reg), "i"(-EIO), "0"(0)); - + err = rdmsr_safe(reg, eax, edx); + if (err) + err = -EIO; return err; } diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 9d94995e967..66099780039 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -164,7 +164,7 @@ static inline void play_dead(void) */ local_irq_disable(); while (1) - __asm__ __volatile__("hlt":::"memory"); + halt(); } #else static inline void play_dead(void) diff --git a/arch/i386/mach-voyager/voyager_basic.c b/arch/i386/mach-voyager/voyager_basic.c index c6384061328..cc69875d979 100644 --- a/arch/i386/mach-voyager/voyager_basic.c +++ b/arch/i386/mach-voyager/voyager_basic.c @@ -234,10 +234,9 @@ voyager_power_off(void) #endif } /* and wait for it to happen */ - for(;;) { - __asm("cli"); - __asm("hlt"); - } + local_irq_disable(); + for(;;) + halt(); } /* copied from process.c */ @@ -278,10 +277,9 @@ machine_restart(char *cmd) outb(basebd | 0x08, VOYAGER_MC_SETUP); outb(0x02, catbase + 0x21); } - for(;;) { - asm("cli"); - asm("hlt"); - } + local_irq_disable(); + for(;;) + halt(); } void diff --git a/arch/i386/mach-voyager/voyager_smp.c b/arch/i386/mach-voyager/voyager_smp.c index 0e1f4208b07..16790b79861 100644 --- a/arch/i386/mach-voyager/voyager_smp.c +++ b/arch/i386/mach-voyager/voyager_smp.c @@ -1015,7 +1015,7 @@ smp_stop_cpu_function(void *dummy) cpu_clear(smp_processor_id(), cpu_online_map); local_irq_disable(); for(;;) - __asm__("hlt"); + halt(); } static DEFINE_SPINLOCK(call_lock); diff --git a/include/asm-i386/msr.h b/include/asm-i386/msr.h index c76fce8badb..62b76cd9695 100644 --- a/include/asm-i386/msr.h +++ b/include/asm-i386/msr.h @@ -47,6 +47,21 @@ static inline void wrmsrl (unsigned long msr, unsigned long long val) : "c" (msr), "0" (a), "d" (b), "i" (-EFAULT));\ ret__; }) +/* rdmsr with exception handling */ +#define rdmsr_safe(msr,a,b) ({ int ret__; \ + asm volatile("2: rdmsr ; xorl %0,%0\n" \ + "1:\n\t" \ + ".section .fixup,\"ax\"\n\t" \ + "3: movl %4,%0 ; jmp 1b\n\t" \ + ".previous\n\t" \ + ".section __ex_table,\"a\"\n" \ + " .align 4\n\t" \ + " .long 2b,3b\n\t" \ + ".previous" \ + : "=r" (ret__), "=a" (*(a)), "=d" (*(b)) \ + : "c" (msr), "i" (-EFAULT));\ + ret__; }) + #define rdtsc(low,high) \ __asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high)) -- cgit v1.2.3-70-g09d2 From 0998e4228aca046fbd747c3fed909791d52e88eb Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 3 Sep 2005 15:56:43 -0700 Subject: [PATCH] x86: privilege cleanup Privilege checking cleanup. Originally, these diffs were much greater, but recent cleanups in Linux have already done much of the cleanup. I added some explanatory comments in places where the reasoning behind certain tests is rather subtle. Also, in traps.c, we can skip the user_mode check in handle_BUG(). 
The reason is, there are only two call chains - one via die_if_kernel() and one via do_page_fault(), both entering from die(). Both of these paths already ensure that a kernel mode failure has happened. Also, the original check here, if (user_mode(regs)) was insufficient anyways, since it would not rule out BUG faults from V8086 mode execution. Saving the %ss segment in show_regs() rather than assuming a fixed value also gives better information about the current kernel state in the register dump. Signed-off-by: Zachary Amsden Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/signal.c | 4 +++- arch/i386/kernel/traps.c | 5 +---- include/asm-i386/ptrace.h | 7 +++++++ 3 files changed, 11 insertions(+), 5 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c index 7bcda6d69d8..61eb0c8a6e4 100644 --- a/arch/i386/kernel/signal.c +++ b/arch/i386/kernel/signal.c @@ -604,7 +604,9 @@ int fastcall do_signal(struct pt_regs *regs, sigset_t *oldset) * We want the common case to go fast, which * is why we may in certain cases get here from * kernel mode. Just return without doing anything - * if so. + * if so. vm86 regs switched out by assembly code + * before reaching here, so testing against kernel + * CS suffices. */ if (!user_mode(regs)) return 1; diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 1144df0c48d..b2b4bb890bb 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -210,7 +210,7 @@ void show_registers(struct pt_regs *regs) unsigned short ss; esp = (unsigned long) (®s->esp); - ss = __KERNEL_DS; + savesegment(ss, ss); if (user_mode(regs)) { in_kernel = 0; esp = regs->esp; @@ -267,9 +267,6 @@ static void handle_BUG(struct pt_regs *regs) char c; unsigned long eip; - if (user_mode(regs)) - goto no_bug; /* Not in kernel */ - eip = regs->eip; if (eip < PAGE_OFFSET) diff --git a/include/asm-i386/ptrace.h b/include/asm-i386/ptrace.h index 05532875e39..7e0f2945d17 100644 --- a/include/asm-i386/ptrace.h +++ b/include/asm-i386/ptrace.h @@ -61,6 +61,13 @@ struct pt_regs { struct task_struct; extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code); +/* + * user_mode_vm(regs) determines whether a register set came from user mode. + * This is true if V8086 mode was enabled OR if the register set was from + * protected mode with RPL-3 CS value. This tricky test checks that with + * one comparison. Many places in the kernel can bypass this full check + * if they have already ruled out V8086 mode, so user_mode(regs) can be used. + */ static inline int user_mode(struct pt_regs *regs) { return (regs->xcs & 3) != 0; -- cgit v1.2.3-70-g09d2 From a5201129307f414890f9a4410e38da205f5d7359 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 3 Sep 2005 15:56:44 -0700 Subject: [PATCH] x86: make IOPL explicit The pushf/popf in switch_to are ONLY used to switch IOPL. Making this explicit in C code is more clear. This pushf/popf pair was added as a bugfix for leaking IOPL to unprivileged processes when using sysenter/sysexit based system calls (sysexit does not restore flags). When requesting an IOPL change in sys_iopl(), it is just as easy to change the current flags and the flags in the stack image (in case an IRET is required), but there is no reason to force an IRET if we came in from the SYSENTER path. This change is the minimal solution for supporting a paravirtualized Linux kernel that allows user processes to run with I/O privilege. 
Other solutions require radical rewrites of part of the low level fault / system call handling code, or do not fully support sysenter based system calls. Unfortunately, this added one field to the thread_struct. But as a bonus, on P4, the fastest time measured for switch_to() went from 312 to 260 cycles, a win of about 17% in the fast case through this performance critical path. Signed-off-by: Zachary Amsden Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/ioport.c | 7 ++++--- arch/i386/kernel/process.c | 6 ++++++ include/asm-i386/processor.h | 16 ++++++++++++++++ include/asm-i386/system.h | 4 +--- 4 files changed, 27 insertions(+), 6 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/ioport.c b/arch/i386/kernel/ioport.c index 8b25160393c..f2b37654777 100644 --- a/arch/i386/kernel/ioport.c +++ b/arch/i386/kernel/ioport.c @@ -132,6 +132,7 @@ asmlinkage long sys_iopl(unsigned long unused) volatile struct pt_regs * regs = (struct pt_regs *) &unused; unsigned int level = regs->ebx; unsigned int old = (regs->eflags >> 12) & 3; + struct thread_struct *t = &current->thread; if (level > 3) return -EINVAL; @@ -140,8 +141,8 @@ asmlinkage long sys_iopl(unsigned long unused) if (!capable(CAP_SYS_RAWIO)) return -EPERM; } - regs->eflags = (regs->eflags &~ 0x3000UL) | (level << 12); - /* Make sure we return the long way (not sysenter) */ - set_thread_flag(TIF_IRET); + t->iopl = level << 12; + regs->eflags = (regs->eflags & ~X86_EFLAGS_IOPL) | t->iopl; + set_iopl_mask(t->iopl); return 0; } diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 66099780039..b45cbf93d43 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -711,6 +711,12 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas if (prev->gs | next->gs) loadsegment(gs, next->gs); + /* + * Restore IOPL if needed.
+ */ + if (unlikely(prev->iopl != next->iopl)) + set_iopl_mask(next->iopl); + /* * Now maybe reload the debug registers */ diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index 992224bff54..37bef8ed7be 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -455,6 +455,7 @@ struct thread_struct { unsigned int saved_fs, saved_gs; /* IO permissions */ unsigned long *io_bitmap_ptr; + unsigned long iopl; /* max allowed port in the bitmap, in bytes: */ unsigned long io_bitmap_max; }; @@ -511,6 +512,21 @@ static inline void load_esp0(struct tss_struct *tss, struct thread_struct *threa : /* no output */ \ :"r" (value)) +/* + * Set IOPL bits in EFLAGS from given mask + */ +static inline void set_iopl_mask(unsigned mask) +{ + unsigned int reg; + __asm__ __volatile__ ("pushfl;" + "popl %0;" + "andl %1, %0;" + "orl %2, %0;" + "pushl %0;" + "popfl" + : "=&r" (reg) + : "i" (~X86_EFLAGS_IOPL), "r" (mask)); +} /* Forward declaration, a strange C thing */ struct task_struct; diff --git a/include/asm-i386/system.h b/include/asm-i386/system.h index 37fd2f8c719..acd5c26b69b 100644 --- a/include/asm-i386/system.h +++ b/include/asm-i386/system.h @@ -14,8 +14,7 @@ extern struct task_struct * FASTCALL(__switch_to(struct task_struct *prev, struc #define switch_to(prev,next,last) do { \ unsigned long esi,edi; \ - asm volatile("pushfl\n\t" \ - "pushl %%ebp\n\t" \ + asm volatile("pushl %%ebp\n\t" \ "movl %%esp,%0\n\t" /* save ESP */ \ "movl %5,%%esp\n\t" /* restore ESP */ \ "movl $1f,%1\n\t" /* save EIP */ \ @@ -23,7 +22,6 @@ extern struct task_struct * FASTCALL(__switch_to(struct task_struct *prev, struc "jmp __switch_to\n" \ "1:\t" \ "popl %%ebp\n\t" \ - "popfl" \ :"=m" (prev->thread.esp),"=m" (prev->thread.eip), \ "=a" (last),"=S" (esi),"=D" (edi) \ :"m" (next->thread.esp),"m" (next->thread.eip), \ -- cgit v1.2.3-70-g09d2 From e9f86e351fda5b3c40192fc3990453613f160779 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 3 Sep 2005 15:56:45 -0700 Subject: [PATCH] x86: remove redundant TSS clearing When reviewing GDT updates, I found the code: set_tss_desc(cpu,t); /* This just modifies memory; ... */ per_cpu(cpu_gdt_table, cpu)[GDT_ENTRY_TSS].b &= 0xfffffdff; This second line is unnecessary, since set_tss_desc() has already cleared the busy bit. Commented disassembly, line 1: c028b8bd: 8b 0c 86 mov (%esi,%eax,4),%ecx c028b8c0: 01 cb add %ecx,%ebx c028b8c2: 8d 0c 39 lea (%ecx,%edi,1),%ecx => %ecx = per_cpu(cpu_gdt_table, cpu) c028b8c5: 8d 91 80 00 00 00 lea 0x80(%ecx),%edx => %edx = &per_cpu(cpu_gdt_table, cpu)[GDT_ENTRY_TSS] c028b8cb: 66 c7 42 00 73 20 movw $0x2073,0x0(%edx) c028b8d1: 66 89 5a 02 mov %bx,0x2(%edx) c028b8d5: c1 cb 10 ror $0x10,%ebx c028b8d8: 88 5a 04 mov %bl,0x4(%edx) c028b8db: c6 42 05 89 movb $0x89,0x5(%edx) => ((char *)%edx)[5] = 0x89 (equivalent) ((char *)per_cpu(cpu_gdt_table, cpu)[GDT_ENTRY_TSS])[5] = 0x89 c028b8df: c6 42 06 00 movb $0x0,0x6(%edx) c028b8e3: 88 7a 07 mov %bh,0x7(%edx) c028b8e6: c1 cb 10 ror $0x10,%ebx => other bits Commented disassembly, line 2: c028b8e9: 8b 14 86 mov (%esi,%eax,4),%edx c028b8ec: 8d 04 3a lea (%edx,%edi,1),%eax => %eax = per_cpu(cpu_gdt_table, cpu) c028b8ef: 81 a0 84 00 00 00 ff andl $0xfffffdff,0x84(%eax) => per_cpu(cpu_gdt_table, cpu)[GDT_ENTRY_TSS].b &= 0xfffffdff; (equivalent) ((char *)per_cpu(cpu_gdt_table, cpu)[GDT_ENTRY_TSS])[5] &= 0xfd Note that (0x89 & ~0xfd) == 0; i.e, set_tss_desc(cpu,t) has already stored the type field in the GDT with the busy bit clear. 
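The claim is easy to check mechanically. Here is a small self-contained program (plain C, not kernel code) spelling out the bit positions involved:

#include <assert.h>

int main(void)
{
	/*
	 * Byte 5 of a GDT descriptor (bits 8-15 of the high dword, the
	 * ".b" word) holds the type and DPL. An available 32-bit TSS is
	 * type 0x89; the busy variant is 0x8b, i.e. bit 9 of the high
	 * dword (0x200) is the busy bit, hence the andl with 0xfffffdff.
	 */
	unsigned int b = 0x00008900;	/* high dword as set_tss_desc leaves it */

	assert((0x89 & ~0xfd) == 0);			/* busy bit already clear */
	assert((b & 0xfffffdff) == b);			/* so the andl is a no-op */
	assert(((b | 0x200) & 0xfffffdff) == b);	/* it would clear a busy 0x8b */
	return 0;
}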
Eliminating redundant and obscure code is always a good thing; in fact, I pointed out this same optimization many moons ago in arch/i386/setup.c, back when it used to be called that. Signed-off-by: Zachary Amsden Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/power/cpu.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/power/cpu.c b/arch/i386/power/cpu.c index c7a6436aa93..7b0b9ad848e 100644 --- a/arch/i386/power/cpu.c +++ b/arch/i386/power/cpu.c @@ -84,7 +84,6 @@ static void fix_processor_context(void) struct tss_struct * t = &per_cpu(init_tss, cpu); set_tss_desc(cpu,t); /* This just modifies memory; should not be necessary. But... This is necessary, because 386 hardware has concept of busy TSS or some similar stupidity. */ - per_cpu(cpu_gdt_table, cpu)[GDT_ENTRY_TSS].b &= 0xfffffdff; load_TR_desc(); /* This does ltr */ load_LDT(&current->active_mm->context); /* This does lldt */ -- cgit v1.2.3-70-g09d2 From f2f30ebca6c0c95e987cb9a1fd1495770a75432e Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 3 Sep 2005 15:56:47 -0700 Subject: [PATCH] x86: introduce a write accessor for updating the current LDT Introduce a write accessor for updating the current LDT. This is required for hypervisors like Xen that do not allow LDT pages to be directly written. Testing - here's a fun little LDT test that can be trivially modified to test limits as well. /* * Copyright (c) 2005, Zachary Amsden (zach@vmware.com) * This is licensed under the GPL. */ #include #include #include #include #include #include #include #define __KERNEL__ #include void main(void) { struct user_desc desc; char *code; unsigned long long tsc; code = (char *)mmap(0, 8192, PROT_EXEC|PROT_READ|PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); desc.entry_number = 0; desc.base_addr = code; desc.limit = 1; desc.seg_32bit = 1; desc.contents = MODIFY_LDT_CONTENTS_CODE; desc.read_exec_only = 0; desc.limit_in_pages = 1; desc.seg_not_present = 0; desc.useable = 1; if (modify_ldt(1, &desc, sizeof(desc)) != 0) { perror("modify_ldt"); } printf("code base is 0x%08x\n", (unsigned)code); code[0x0ffe] = 0x0f; /* rdtsc */ code[0x0fff] = 0x31; code[0x1000] = 0xcb; /* lret */ __asm__ __volatile("lcall $7,$0xffe" : "=A" (tsc)); printf("TSC is 0x%016llx\n", tsc); } Signed-off-by: Zachary Amsden Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/ldt.c | 7 ++----- include/asm-i386/desc.h | 7 +++++++ 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/ldt.c b/arch/i386/kernel/ldt.c index bb50afbee92..fe1ffa55587 100644 --- a/arch/i386/kernel/ldt.c +++ b/arch/i386/kernel/ldt.c @@ -177,7 +177,7 @@ static int read_default_ldt(void __user * ptr, unsigned long bytecount) static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) { struct mm_struct * mm = current->mm; - __u32 entry_1, entry_2, *lp; + __u32 entry_1, entry_2; int error; struct user_desc ldt_info; @@ -205,8 +205,6 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) goto out_unlock; } - lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt); - /* Allow LDTs to be cleared by the user. */ if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { if (oldmode || LDT_empty(&ldt_info)) { @@ -223,8 +221,7 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) /* Install the new entry ...
*/ install: - *lp = entry_1; - *(lp+1) = entry_2; + write_ldt_entry(mm->context.ldt, ldt_info.entry_number, entry_1, entry_2); error = 0; out_unlock: diff --git a/include/asm-i386/desc.h b/include/asm-i386/desc.h index ccb11c03283..6df1a53c190 100644 --- a/include/asm-i386/desc.h +++ b/include/asm-i386/desc.h @@ -96,6 +96,13 @@ static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int size) (info)->seg_not_present == 1 && \ (info)->useable == 0 ) +static inline void write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b) +{ + __u32 *lp = (__u32 *)((char *)ldt + entry*8); + *lp = entry_a; + *(lp+1) = entry_b; +} + #if TLS_SIZE != 24 # error update this code. #endif -- cgit v1.2.3-70-g09d2 From 748f2edb52712aa3d926470a888608dc500d17e8 Mon Sep 17 00:00:00 2001 From: George Anzinger Date: Sat, 3 Sep 2005 15:56:48 -0700 Subject: [PATCH] x86 NMI: better support for debuggers This patch adds a notify in die_nmi that the system is about to be taken down. If the notify is handled with a NOTIFY_STOP return, the system is given a new lease on life. We also change the nmi watchdog to carry on if die_nmi returns. This gives debug code a chance to a) catch watchdog timeouts and b) possibly allow the system to continue, realizing that the timeout may be due to debugger activities such as single stepping which is usually done with "other" cpus held. Signed-off-by: George Anzinger Cc: Keith Owens Signed-off-by: George Anzinger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/nmi.c | 5 ++++- arch/i386/kernel/traps.c | 4 ++++ include/asm-i386/kdebug.h | 11 +++++++++-- 3 files changed, 17 insertions(+), 3 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 8c242bb1ef4..8bbdbda07a2 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -501,8 +501,11 @@ void nmi_watchdog_tick (struct pt_regs * regs) */ alert_counter[cpu]++; if (alert_counter[cpu] == 5*nmi_hz) + /* + * die_nmi will return ONLY if NOTIFY_STOP happens..
+ */ die_nmi(regs, "NMI Watchdog detected LOCKUP"); - } else { + last_irq_sums[cpu] = sum; alert_counter[cpu] = 0; } diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 1144df0c48d..54629bb5893 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -565,6 +565,10 @@ static DEFINE_SPINLOCK(nmi_print_lock); void die_nmi (struct pt_regs *regs, const char *msg) { + if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 0, SIGINT) == + NOTIFY_STOP) + return; + spin_lock(&nmi_print_lock); /* * We are in trouble anyway, lets at least try diff --git a/include/asm-i386/kdebug.h b/include/asm-i386/kdebug.h index b3f8d5f59d5..316138e8991 100644 --- a/include/asm-i386/kdebug.h +++ b/include/asm-i386/kdebug.h @@ -41,9 +41,16 @@ enum die_val { DIE_PAGE_FAULT, }; -static inline int notify_die(enum die_val val,char *str,struct pt_regs *regs,long err,int trap, int sig) +static inline int notify_die(enum die_val val, const char *str, + struct pt_regs *regs, long err, int trap, int sig) { - struct die_args args = { .regs=regs, .str=str, .err=err, .trapnr=trap,.signr=sig }; + struct die_args args = { + .regs = regs, + .str = str, + .err = err, + .trapnr = trap, + .signr = sig + }; return notifier_call_chain(&i386die_chain, val, &args); } -- cgit v1.2.3-70-g09d2 From d7271b14b2e9e5905aba0fbf5c4dc4f8980c0cb2 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 3 Sep 2005 15:56:50 -0700 Subject: [PATCH] i386: encapsulate copying of pgd entries Add a clone operation for pgd updates. This helps complete the encapsulation of updates to page tables (or pages about to become page tables) into accessor functions rather than using memcpy() to duplicate them. This is both generally good for consistency and also necessary for running in a hypervisor which requires explicit updates to page table entries. The new function is: clone_pgd_range(pgd_t *dst, pgd_t *src, int count); dst - pointer to pgd range anywhere on a pgd page src - "" count - the number of pgds to copy. dst and src can be on the same page, but the range must not overlap and must not cross a page boundary. Note that I omitted using this call to copy pgd entries into the software suspend page root, since this is not technically a live paging structure, rather it is used on resume from suspend. CC'ing Pavel in case he has any feedback on this. Thanks to Chris Wright for noticing that this could be more optimal in PAE compiles by eliminating the memset.
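A usage sketch follows; the 768/256 split is the assumed default 3G/1G non-PAE layout, and mirror_kernel_pgds() is a made-up caller standing in for the smpboot.c hunk below, so treat the whole block as illustration rather than kernel code:

#include <string.h>

typedef struct { unsigned long pgd; } pgd_t;

#define PTRS_PER_PGD	1024
#define USER_PGD_PTRS	768	/* assumed 3G/1G split */
#define KERNEL_PGD_PTRS	(PTRS_PER_PGD - USER_PGD_PTRS)

/* the body added by this patch: a typed memcpy, but a single hook point */
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
	memcpy(dst, src, count * sizeof(pgd_t));	/* ranges must not overlap */
}

/* hypothetical caller, doing what smpboot.c's low-mappings init now does */
void mirror_kernel_pgds(pgd_t *pgd_base)
{
	clone_pgd_range(pgd_base, pgd_base + USER_PGD_PTRS, KERNEL_PGD_PTRS);
}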
Signed-off-by: Zachary Amsden Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/smpboot.c | 4 ++-- arch/i386/mm/pgtable.c | 10 +++++----- include/asm-i386/pgtable.h | 15 +++++++++++++++ 3 files changed, 22 insertions(+), 7 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 8ac8e9fd561..8e950cdff1a 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -1017,8 +1017,8 @@ int __devinit smp_prepare_cpu(int cpu) tsc_sync_disabled = 1; /* init low mem mapping */ - memcpy(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, - sizeof(swapper_pg_dir[0]) * KERNEL_PGD_PTRS); + clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, + KERNEL_PGD_PTRS); flush_tlb_all(); schedule_work(&task); wait_for_completion(&done); diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index bd2f7afc7a2..dcdce2c6c53 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -207,19 +207,19 @@ void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) { unsigned long flags; - if (PTRS_PER_PMD == 1) + if (PTRS_PER_PMD == 1) { + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); spin_lock_irqsave(&pgd_lock, flags); + } - memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD, + clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, - (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); - + KERNEL_PGD_PTRS); if (PTRS_PER_PMD > 1) return; pgd_list_add(pgd); spin_unlock_irqrestore(&pgd_lock, flags); - memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); } /* never called when PTRS_PER_PMD > 1 */ diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index d74185aee15..47bc1ffa3d4 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -277,6 +277,21 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, clear_bit(_PAGE_BIT_RW, &ptep->pte_low); } +/* + * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); + * + * dst - pointer to pgd range anywhere on a pgd page + * src - "" + * count - the number of pgds to copy. + * + * dst and src can be on the same page, but the range must not overlap, + * and must not cross a page boundary. + */ +static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) +{ + memcpy(dst, src, count * sizeof(pgd_t)); +} + /* * Macro to mark a page protection value as "uncacheable". On processors which do not support * it, this is a no-op. -- cgit v1.2.3-70-g09d2 From 4ad8d38342430f8b52f7a8458dce90caf8c8ca64 Mon Sep 17 00:00:00 2001 From: Zwane Mwaikambo Date: Sat, 3 Sep 2005 15:56:51 -0700 Subject: [PATCH] i386 boottime for_each_cpu broken for_each_cpu walks through all processors in cpu_possible_map, which is defined as cpu_callout_map on i386 and isn't initialised until all processors have been booted. This breaks things which do for_each_cpu iterations early during boot. So, define cpu_possible_map as a bitmap with NR_CPUS bits populated. This was triggered by a patch I'm working on which does alloc_percpu before bringing up secondary processors. From: Alexander Nyberg i386-boottime-for_each_cpu-broken.patch i386-boottime-for_each_cpu-broken-fix.patch The SMP version of __alloc_percpu checks the cpu_possible_map before allocating memory for a certain cpu.
With the above patches the BSP cpuid is never set in cpu_possible_map which breaks CONFIG_SMP on uniprocessor machines (as soon as someone tries to dereference something allocated via __alloc_percpu, which in fact is never allocated since the cpu is not set in cpu_possible_map). Signed-off-by: Zwane Mwaikambo Signed-off-by: Alexander Nyberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/mpparse.c | 8 +++++++- arch/i386/kernel/smpboot.c | 3 +++ arch/i386/mach-voyager/voyager_smp.c | 3 +++ include/asm-i386/smp.h | 2 +- 4 files changed, 14 insertions(+), 2 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c index 788efffa993..5d0b9a8fc43 100644 --- a/arch/i386/kernel/mpparse.c +++ b/arch/i386/kernel/mpparse.c @@ -122,7 +122,7 @@ static int MP_valid_apicid(int apicid, int version) static void __init MP_processor_info (struct mpc_config_processor *m) { - int ver, apicid; + int ver, apicid, cpu, found_bsp = 0; physid_mask_t tmp; if (!(m->mpc_cpuflag & CPU_ENABLED)) @@ -181,6 +181,7 @@ static void __init MP_processor_info (struct mpc_config_processor *m) if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { Dprintk(" Bootup CPU\n"); boot_cpu_physical_apicid = m->mpc_apicid; + found_bsp = 1; } if (num_processors >= NR_CPUS) { @@ -204,6 +205,11 @@ static void __init MP_processor_info (struct mpc_config_processor *m) return; } + if (found_bsp) + cpu = 0; + else + cpu = num_processors - 1; + cpu_set(cpu, cpu_possible_map); tmp = apicid_to_cpu_present(apicid); physids_or(phys_cpu_present_map, phys_cpu_present_map, tmp); diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 8e950cdff1a..5e4893d2b9f 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -88,6 +88,8 @@ EXPORT_SYMBOL(cpu_online_map); cpumask_t cpu_callin_map; cpumask_t cpu_callout_map; EXPORT_SYMBOL(cpu_callout_map); +cpumask_t cpu_possible_map; +EXPORT_SYMBOL(cpu_possible_map); static cpumask_t smp_commenced_mask; /* TSC's upper 32 bits can't be written in earlier CPU (before prescott), there @@ -1265,6 +1267,7 @@ void __devinit smp_prepare_boot_cpu(void) cpu_set(smp_processor_id(), cpu_online_map); cpu_set(smp_processor_id(), cpu_callout_map); cpu_set(smp_processor_id(), cpu_present_map); + cpu_set(smp_processor_id(), cpu_possible_map); per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; } diff --git a/arch/i386/mach-voyager/voyager_smp.c b/arch/i386/mach-voyager/voyager_smp.c index 16790b79861..46b0cf4a31e 100644 --- a/arch/i386/mach-voyager/voyager_smp.c +++ b/arch/i386/mach-voyager/voyager_smp.c @@ -242,6 +242,8 @@ static cpumask_t smp_commenced_mask = CPU_MASK_NONE; cpumask_t cpu_callin_map = CPU_MASK_NONE; cpumask_t cpu_callout_map = CPU_MASK_NONE; EXPORT_SYMBOL(cpu_callout_map); +cpumask_t cpu_possible_map = CPU_MASK_ALL; +EXPORT_SYMBOL(cpu_possible_map); /* The per processor IRQ masks (these are usually kept in sync) */ static __u16 vic_irq_mask[NR_CPUS] __cacheline_aligned; @@ -1910,6 +1912,7 @@ void __devinit smp_prepare_boot_cpu(void) { cpu_set(smp_processor_id(), cpu_online_map); cpu_set(smp_processor_id(), cpu_callout_map); + cpu_set(smp_processor_id(), cpu_possible_map); } int __devinit diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h index a283738b80b..13250199976 100644 --- a/include/asm-i386/smp.h +++ b/include/asm-i386/smp.h @@ -59,7 +59,7 @@ extern void cpu_uninit(void); extern cpumask_t cpu_callout_map; extern cpumask_t cpu_callin_map; -#define cpu_possible_map cpu_callout_map +extern
cpumask_t cpu_possible_map; /* We don't mark CPUs online until __cpu_up(), so we need another measure */ static inline int num_booting_cpus(void) -- cgit v1.2.3-70-g09d2 From 52fdd08903a1d1162e184114837e232640191627 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Sat, 3 Sep 2005 15:56:52 -0700 Subject: [PATCH] unify x86/x86-64 semaphore code This patch moves the common code in x86 and x86-64's semaphore.c into a single file in lib/semaphore-sleepers.c. The arch specific asm stubs are left in the arch tree (in semaphore.c for i386 and in the asm for x86-64). There should be no changes in code/functionality with this patch. Signed-off-by: Benjamin LaHaise Cc: Andi Kleen Signed-off-by: Jeff Dike Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/Kconfig | 4 + arch/i386/kernel/semaphore.c | 162 ------------------------------------- arch/um/Kconfig_i386 | 4 + arch/um/Kconfig_x86_64 | 4 + arch/um/sys-x86_64/Makefile | 5 +- arch/x86_64/Kconfig | 4 + arch/x86_64/kernel/Makefile | 2 +- arch/x86_64/kernel/semaphore.c | 180 ----------------------------------------- lib/Makefile | 1 + lib/semaphore-sleepers.c | 177 ++++++++++++++++++++++++++++++++++++++++ 10 files changed, 197 insertions(+), 346 deletions(-) delete mode 100644 arch/x86_64/kernel/semaphore.c create mode 100644 lib/semaphore-sleepers.c (limited to 'arch/i386') diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index dcb0ad098c6..3b3b017e1c1 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -14,6 +14,10 @@ config X86 486, 586, Pentiums, and various instruction-set-compatible chips by AMD, Cyrix, and others. +config SEMAPHORE_SLEEPERS + bool + default y + config MMU bool default y diff --git a/arch/i386/kernel/semaphore.c b/arch/i386/kernel/semaphore.c index 469f496e55c..7455ab64394 100644 --- a/arch/i386/kernel/semaphore.c +++ b/arch/i386/kernel/semaphore.c @@ -13,170 +13,8 @@ * rw semaphores implemented November 1999 by Benjamin LaHaise */ #include -#include -#include -#include #include -/* - * Semaphores are implemented using a two-way counter: - * The "count" variable is decremented for each process - * that tries to acquire the semaphore, while the "sleeping" - * variable is a count of such acquires. - * - * Notably, the inline "up()" and "down()" functions can - * efficiently test if they need to do any extra work (up - * needs to do something only if count was negative before - * the increment operation. - * - * "sleeping" and the contention routine ordering is protected - * by the spinlock in the semaphore's waitqueue head. - * - * Note that these functions are only called when there is - * contention on the lock, and as such all this is the - * "non-critical" part of the whole semaphore business. The - * critical part is the inline stuff in - * where we want to avoid any extra jumps and calls. - */ - -/* - * Logic: - * - only on a boundary condition do we need to care. When we go - * from a negative count to a non-negative, we wake people up. - * - when we go from a non-negative count to a negative do we - * (a) synchronize with the "sleeper" count and (b) make sure - * that we're on the wakeup list before we synchronize so that - * we cannot lose wakeup events. 
- */ - -static fastcall void __attribute_used__ __up(struct semaphore *sem) -{ - wake_up(&sem->wait); -} - -static fastcall void __attribute_used__ __sched __down(struct semaphore * sem) -{ - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - unsigned long flags; - - tsk->state = TASK_UNINTERRUPTIBLE; - spin_lock_irqsave(&sem->wait.lock, flags); - add_wait_queue_exclusive_locked(&sem->wait, &wait); - - sem->sleepers++; - for (;;) { - int sleepers = sem->sleepers; - - /* - * Add "everybody else" into it. They aren't - * playing, because we own the spinlock in - * the wait_queue_head. - */ - if (!atomic_add_negative(sleepers - 1, &sem->count)) { - sem->sleepers = 0; - break; - } - sem->sleepers = 1; /* us - see -1 above */ - spin_unlock_irqrestore(&sem->wait.lock, flags); - - schedule(); - - spin_lock_irqsave(&sem->wait.lock, flags); - tsk->state = TASK_UNINTERRUPTIBLE; - } - remove_wait_queue_locked(&sem->wait, &wait); - wake_up_locked(&sem->wait); - spin_unlock_irqrestore(&sem->wait.lock, flags); - tsk->state = TASK_RUNNING; -} - -static fastcall int __attribute_used__ __sched __down_interruptible(struct semaphore * sem) -{ - int retval = 0; - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - unsigned long flags; - - tsk->state = TASK_INTERRUPTIBLE; - spin_lock_irqsave(&sem->wait.lock, flags); - add_wait_queue_exclusive_locked(&sem->wait, &wait); - - sem->sleepers++; - for (;;) { - int sleepers = sem->sleepers; - - /* - * With signals pending, this turns into - * the trylock failure case - we won't be - * sleeping, and we* can't get the lock as - * it has contention. Just correct the count - * and exit. - */ - if (signal_pending(current)) { - retval = -EINTR; - sem->sleepers = 0; - atomic_add(sleepers, &sem->count); - break; - } - - /* - * Add "everybody else" into it. They aren't - * playing, because we own the spinlock in - * wait_queue_head. The "-1" is because we're - * still hoping to get the semaphore. - */ - if (!atomic_add_negative(sleepers - 1, &sem->count)) { - sem->sleepers = 0; - break; - } - sem->sleepers = 1; /* us - see -1 above */ - spin_unlock_irqrestore(&sem->wait.lock, flags); - - schedule(); - - spin_lock_irqsave(&sem->wait.lock, flags); - tsk->state = TASK_INTERRUPTIBLE; - } - remove_wait_queue_locked(&sem->wait, &wait); - wake_up_locked(&sem->wait); - spin_unlock_irqrestore(&sem->wait.lock, flags); - - tsk->state = TASK_RUNNING; - return retval; -} - -/* - * Trylock failed - make sure we correct for - * having decremented the count. - * - * We could have done the trylock with a - * single "cmpxchg" without failure cases, - * but then it wouldn't work on a 386. - */ -static fastcall int __attribute_used__ __down_trylock(struct semaphore * sem) -{ - int sleepers; - unsigned long flags; - - spin_lock_irqsave(&sem->wait.lock, flags); - sleepers = sem->sleepers + 1; - sem->sleepers = 0; - - /* - * Add "everybody else" and us into it. They aren't - * playing, because we own the spinlock in the - * wait_queue_head. - */ - if (!atomic_add_negative(sleepers, &sem->count)) { - wake_up_locked(&sem->wait); - } - - spin_unlock_irqrestore(&sem->wait.lock, flags); - return 1; -} - - /* * The semaphore operations have a special calling sequence that * allow us to do a simpler in-line version of them. 
These routines diff --git a/arch/um/Kconfig_i386 b/arch/um/Kconfig_i386 index 27c18a8d9d1..8ad156a0049 100644 --- a/arch/um/Kconfig_i386 +++ b/arch/um/Kconfig_i386 @@ -6,6 +6,10 @@ config 64BIT bool default n +config SEMAPHORE_SLEEPERS + bool + default y + config TOP_ADDR hex default 0xc0000000 if !HOST_2G_2G diff --git a/arch/um/Kconfig_x86_64 b/arch/um/Kconfig_x86_64 index 735a047c890..6e5357c5bcf 100644 --- a/arch/um/Kconfig_x86_64 +++ b/arch/um/Kconfig_x86_64 @@ -6,6 +6,10 @@ config 64BIT bool default y +config SEMAPHORE_SLEEPERS + bool + default y + config TOP_ADDR hex default 0x80000000 diff --git a/arch/um/sys-x86_64/Makefile b/arch/um/sys-x86_64/Makefile index 7488206ce6f..736d3edec58 100644 --- a/arch/um/sys-x86_64/Makefile +++ b/arch/um/sys-x86_64/Makefile @@ -6,7 +6,7 @@ #XXX: why into lib-y? lib-y = bitops.o bugs.o csum-partial.o delay.o fault.o mem.o memcpy.o \ - ptrace.o ptrace_user.o semaphore.o sigcontext.o signal.o stub.o \ + ptrace.o ptrace_user.o sigcontext.o signal.o stub.o \ stub_segv.o syscalls.o syscall_table.o sysrq.o thunk.o obj-y := ksyms.o @@ -15,7 +15,7 @@ obj-$(CONFIG_MODULES) += module.o um_module.o USER_OBJS := ptrace_user.o sigcontext.o SYMLINKS = bitops.c csum-copy.S csum-partial.c csum-wrappers.c memcpy.S \ - semaphore.c thunk.S module.c + thunk.S module.c include arch/um/scripts/Makefile.rules @@ -24,7 +24,6 @@ csum-copy.S-dir = lib csum-partial.c-dir = lib csum-wrappers.c-dir = lib memcpy.S-dir = lib -semaphore.c-dir = kernel thunk.S-dir = lib module.c-dir = kernel diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 660a03a89e6..75e52c57f19 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -24,6 +24,10 @@ config X86 bool default y +config SEMAPHORE_SLEEPERS + bool + default y + config MMU bool default y diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index 48f9e2c19cd..427501a336c 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -4,7 +4,7 @@ extra-y := head.o head64.o init_task.o vmlinux.lds EXTRA_AFLAGS := -traditional -obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \ +obj-y := process.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ x8664_ksyms.o i387.o syscall.o vsyscall.o \ setup64.o bootflag.o e820.o reboot.o quirks.o diff --git a/arch/x86_64/kernel/semaphore.c b/arch/x86_64/kernel/semaphore.c deleted file mode 100644 index 48f7c18172b..00000000000 --- a/arch/x86_64/kernel/semaphore.c +++ /dev/null @@ -1,180 +0,0 @@ -/* - * x86_64 semaphore implementation. - * - * (C) Copyright 1999 Linus Torvalds - * - * Portions Copyright 1999 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * rw semaphores implemented November 1999 by Benjamin LaHaise - */ -#include -#include -#include -#include - -#include - -/* - * Semaphores are implemented using a two-way counter: - * The "count" variable is decremented for each process - * that tries to acquire the semaphore, while the "sleeping" - * variable is a count of such acquires. - * - * Notably, the inline "up()" and "down()" functions can - * efficiently test if they need to do any extra work (up - * needs to do something only if count was negative before - * the increment operation. 
- * - * "sleeping" and the contention routine ordering is protected - * by the spinlock in the semaphore's waitqueue head. - * - * Note that these functions are only called when there is - * contention on the lock, and as such all this is the - * "non-critical" part of the whole semaphore business. The - * critical part is the inline stuff in - * where we want to avoid any extra jumps and calls. - */ - -/* - * Logic: - * - only on a boundary condition do we need to care. When we go - * from a negative count to a non-negative, we wake people up. - * - when we go from a non-negative count to a negative do we - * (a) synchronize with the "sleeper" count and (b) make sure - * that we're on the wakeup list before we synchronize so that - * we cannot lose wakeup events. - */ - -void __up(struct semaphore *sem) -{ - wake_up(&sem->wait); -} - -void __sched __down(struct semaphore * sem) -{ - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - unsigned long flags; - - tsk->state = TASK_UNINTERRUPTIBLE; - spin_lock_irqsave(&sem->wait.lock, flags); - add_wait_queue_exclusive_locked(&sem->wait, &wait); - - sem->sleepers++; - for (;;) { - int sleepers = sem->sleepers; - - /* - * Add "everybody else" into it. They aren't - * playing, because we own the spinlock in - * the wait_queue_head. - */ - if (!atomic_add_negative(sleepers - 1, &sem->count)) { - sem->sleepers = 0; - break; - } - sem->sleepers = 1; /* us - see -1 above */ - spin_unlock_irqrestore(&sem->wait.lock, flags); - - schedule(); - - spin_lock_irqsave(&sem->wait.lock, flags); - tsk->state = TASK_UNINTERRUPTIBLE; - } - remove_wait_queue_locked(&sem->wait, &wait); - wake_up_locked(&sem->wait); - spin_unlock_irqrestore(&sem->wait.lock, flags); - tsk->state = TASK_RUNNING; -} - -int __sched __down_interruptible(struct semaphore * sem) -{ - int retval = 0; - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - unsigned long flags; - - tsk->state = TASK_INTERRUPTIBLE; - spin_lock_irqsave(&sem->wait.lock, flags); - add_wait_queue_exclusive_locked(&sem->wait, &wait); - - sem->sleepers++; - for (;;) { - int sleepers = sem->sleepers; - - /* - * With signals pending, this turns into - * the trylock failure case - we won't be - * sleeping, and we* can't get the lock as - * it has contention. Just correct the count - * and exit. - */ - if (signal_pending(current)) { - retval = -EINTR; - sem->sleepers = 0; - atomic_add(sleepers, &sem->count); - break; - } - - /* - * Add "everybody else" into it. They aren't - * playing, because we own the spinlock in - * wait_queue_head. The "-1" is because we're - * still hoping to get the semaphore. - */ - if (!atomic_add_negative(sleepers - 1, &sem->count)) { - sem->sleepers = 0; - break; - } - sem->sleepers = 1; /* us - see -1 above */ - spin_unlock_irqrestore(&sem->wait.lock, flags); - - schedule(); - - spin_lock_irqsave(&sem->wait.lock, flags); - tsk->state = TASK_INTERRUPTIBLE; - } - remove_wait_queue_locked(&sem->wait, &wait); - wake_up_locked(&sem->wait); - spin_unlock_irqrestore(&sem->wait.lock, flags); - - tsk->state = TASK_RUNNING; - return retval; -} - -/* - * Trylock failed - make sure we correct for - * having decremented the count. - * - * We could have done the trylock with a - * single "cmpxchg" without failure cases, - * but then it wouldn't work on a 386. 
- */ -int __down_trylock(struct semaphore * sem) -{ - int sleepers; - unsigned long flags; - - spin_lock_irqsave(&sem->wait.lock, flags); - sleepers = sem->sleepers + 1; - sem->sleepers = 0; - - /* - * Add "everybody else" and us into it. They aren't - * playing, because we own the spinlock in the - * wait_queue_head. - */ - if (!atomic_add_negative(sleepers, &sem->count)) { - wake_up_locked(&sem->wait); - } - - spin_unlock_irqrestore(&sem->wait.lock, flags); - return 1; -} - - diff --git a/lib/Makefile b/lib/Makefile index 52f83380f70..3e2bd0df23b 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -18,6 +18,7 @@ endif lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o +lib-$(CONFIG_SEMAPHORE_SLEEPERS) += semaphore-sleepers.o lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o diff --git a/lib/semaphore-sleepers.c b/lib/semaphore-sleepers.c new file mode 100644 index 00000000000..4d5f18889fa --- /dev/null +++ b/lib/semaphore-sleepers.c @@ -0,0 +1,177 @@ +/* + * i386 and x86-64 semaphore implementation. + * + * (C) Copyright 1999 Linus Torvalds + * + * Portions Copyright 1999 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * rw semaphores implemented November 1999 by Benjamin LaHaise + */ +#include +#include +#include +#include +#include + +/* + * Semaphores are implemented using a two-way counter: + * The "count" variable is decremented for each process + * that tries to acquire the semaphore, while the "sleeping" + * variable is a count of such acquires. + * + * Notably, the inline "up()" and "down()" functions can + * efficiently test if they need to do any extra work (up + * needs to do something only if count was negative before + * the increment operation. + * + * "sleeping" and the contention routine ordering is protected + * by the spinlock in the semaphore's waitqueue head. + * + * Note that these functions are only called when there is + * contention on the lock, and as such all this is the + * "non-critical" part of the whole semaphore business. The + * critical part is the inline stuff in + * where we want to avoid any extra jumps and calls. + */ + +/* + * Logic: + * - only on a boundary condition do we need to care. When we go + * from a negative count to a non-negative, we wake people up. + * - when we go from a non-negative count to a negative do we + * (a) synchronize with the "sleeper" count and (b) make sure + * that we're on the wakeup list before we synchronize so that + * we cannot lose wakeup events. + */ + +fastcall void __up(struct semaphore *sem) +{ + wake_up(&sem->wait); +} + +fastcall void __sched __down(struct semaphore * sem) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + unsigned long flags; + + tsk->state = TASK_UNINTERRUPTIBLE; + spin_lock_irqsave(&sem->wait.lock, flags); + add_wait_queue_exclusive_locked(&sem->wait, &wait); + + sem->sleepers++; + for (;;) { + int sleepers = sem->sleepers; + + /* + * Add "everybody else" into it. They aren't + * playing, because we own the spinlock in + * the wait_queue_head. 
+ */ + if (!atomic_add_negative(sleepers - 1, &sem->count)) { + sem->sleepers = 0; + break; + } + sem->sleepers = 1; /* us - see -1 above */ + spin_unlock_irqrestore(&sem->wait.lock, flags); + + schedule(); + + spin_lock_irqsave(&sem->wait.lock, flags); + tsk->state = TASK_UNINTERRUPTIBLE; + } + remove_wait_queue_locked(&sem->wait, &wait); + wake_up_locked(&sem->wait); + spin_unlock_irqrestore(&sem->wait.lock, flags); + tsk->state = TASK_RUNNING; +} + +fastcall int __sched __down_interruptible(struct semaphore * sem) +{ + int retval = 0; + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + unsigned long flags; + + tsk->state = TASK_INTERRUPTIBLE; + spin_lock_irqsave(&sem->wait.lock, flags); + add_wait_queue_exclusive_locked(&sem->wait, &wait); + + sem->sleepers++; + for (;;) { + int sleepers = sem->sleepers; + + /* + * With signals pending, this turns into + * the trylock failure case - we won't be + * sleeping, and we* can't get the lock as + * it has contention. Just correct the count + * and exit. + */ + if (signal_pending(current)) { + retval = -EINTR; + sem->sleepers = 0; + atomic_add(sleepers, &sem->count); + break; + } + + /* + * Add "everybody else" into it. They aren't + * playing, because we own the spinlock in + * wait_queue_head. The "-1" is because we're + * still hoping to get the semaphore. + */ + if (!atomic_add_negative(sleepers - 1, &sem->count)) { + sem->sleepers = 0; + break; + } + sem->sleepers = 1; /* us - see -1 above */ + spin_unlock_irqrestore(&sem->wait.lock, flags); + + schedule(); + + spin_lock_irqsave(&sem->wait.lock, flags); + tsk->state = TASK_INTERRUPTIBLE; + } + remove_wait_queue_locked(&sem->wait, &wait); + wake_up_locked(&sem->wait); + spin_unlock_irqrestore(&sem->wait.lock, flags); + + tsk->state = TASK_RUNNING; + return retval; +} + +/* + * Trylock failed - make sure we correct for + * having decremented the count. + * + * We could have done the trylock with a + * single "cmpxchg" without failure cases, + * but then it wouldn't work on a 386. + */ +fastcall int __down_trylock(struct semaphore * sem) +{ + int sleepers; + unsigned long flags; + + spin_lock_irqsave(&sem->wait.lock, flags); + sleepers = sem->sleepers + 1; + sem->sleepers = 0; + + /* + * Add "everybody else" and us into it. They aren't + * playing, because we own the spinlock in the + * wait_queue_head. + */ + if (!atomic_add_negative(sleepers, &sem->count)) { + wake_up_locked(&sem->wait); + } + + spin_unlock_irqrestore(&sem->wait.lock, flags); + return 1; +} -- cgit v1.2.3-70-g09d2 From 795312e763569ce4df67e7a0ca726a9901358fa2 Mon Sep 17 00:00:00 2001 From: Pierre Ossman Date: Sat, 3 Sep 2005 15:56:54 -0700 Subject: [PATCH] ISA DMA suspend for i386 Reset the ISA DMA controller into a known state after a suspend. Primary concern was reenabling the cascading DMA channel (4). 
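For context, a sketch of the driver side this protects (illustrative only: the function name and the choice of channel 1 are hypothetical; the helpers are the standard <asm/dma.h> ones). If the cascade channel (4) stays disabled after resume, a transfer programmed like this on channels 0-3 simply never starts:

#include <asm/dma.h>
#include <asm/io.h>

/* Hypothetical ISA driver owning DMA channel 1. */
static void my_isa_start_dma(char *buf, int len)
{
	unsigned long flags = claim_dma_lock();

	disable_dma(1);
	clear_dma_ff(1);
	set_dma_mode(1, DMA_MODE_READ);
	set_dma_addr(1, virt_to_bus(buf));
	set_dma_count(1, len);
	enable_dma(1);	/* no effect on the wire while cascade (4) is off */

	release_dma_lock(flags);
}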
Signed-off-by: Pierre Ossman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/Makefile | 2 +- arch/i386/kernel/i8237.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 1 deletion(-) create mode 100644 arch/i386/kernel/i8237.c (limited to 'arch/i386') diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 4cc83b322b3..64682a0edac 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -7,7 +7,7 @@ extra-y := head.o init_task.o vmlinux.lds obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \ pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \ - doublefault.o quirks.o + doublefault.o quirks.o i8237.o obj-y += cpu/ obj-y += timers/ diff --git a/arch/i386/kernel/i8237.c b/arch/i386/kernel/i8237.c new file mode 100644 index 00000000000..c36d1c006c2 --- /dev/null +++ b/arch/i386/kernel/i8237.c @@ -0,0 +1,67 @@ +/* + * i8237.c: 8237A DMA controller suspend functions. + * + * Written by Pierre Ossman, 2005. + */ + +#include +#include + +#include + +/* + * This module just handles suspend/resume issues with the + * 8237A DMA controller (used for ISA and LPC). + * Allocation is handled in kernel/dma.c and normal usage is + * in asm/dma.h. + */ + +static int i8237A_resume(struct sys_device *dev) +{ + unsigned long flags; + int i; + + flags = claim_dma_lock(); + + dma_outb(DMA1_RESET_REG, 0); + dma_outb(DMA2_RESET_REG, 0); + + for (i = 0;i < 8;i++) { + set_dma_addr(i, 0x000000); + /* DMA count is a bit weird so this is not 0 */ + set_dma_count(i, 1); + } + + /* Enable cascade DMA or channel 0-3 won't work */ + enable_dma(4); + + release_dma_lock(flags); + + return 0; +} + +static int i8237A_suspend(struct sys_device *dev, pm_message_t state) +{ + return 0; +} + +static struct sysdev_class i8237_sysdev_class = { + set_kset_name("i8237"), + .suspend = i8237A_suspend, + .resume = i8237A_resume, +}; + +static struct sys_device device_i8237A = { + .id = 0, + .cls = &i8237_sysdev_class, +}; + +static int __init i8237A_init_sysfs(void) +{ + int error = sysdev_class_register(&i8237_sysdev_class); + if (!error) + error = sysdev_register(&device_i8237A); + return error; +} + +device_initcall(i8237A_init_sysfs); -- cgit v1.2.3-70-g09d2 From 829ca9a30a2ddb727981d80fabdbff2ea86bc9ea Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Sat, 3 Sep 2005 15:56:56 -0700 Subject: [PATCH] swsusp: fix remaining u32 vs. pm_message_t confusion Fix remaining bits of u32 vs. pm_message confusion. Should not break anything. Signed-off-by: Pavel Machek Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/cpu/mtrr/main.c | 2 +- arch/ppc/platforms/pmac_pic.c | 2 +- arch/ppc/syslib/open_pic.c | 2 +- arch/x86_64/kernel/nmi.c | 2 +- drivers/ide/ppc/pmac.c | 8 ++++---- drivers/macintosh/mediabay.c | 2 +- drivers/media/video/bttv-driver.c | 1 - drivers/media/video/msp3400.c | 4 ++-- drivers/media/video/tda9887.c | 2 +- drivers/media/video/tuner-core.c | 2 +- drivers/net/bnx2.c | 18 +++++++++--------- drivers/net/e1000/e1000_main.c | 14 ++++++-------- drivers/net/irda/vlsi_ir.c | 11 +++-------- drivers/net/phy/mdio_bus.c | 2 +- drivers/net/wireless/orinoco_pci.c | 2 -- drivers/net/wireless/prism54/islpci_hotplug.c | 2 -- drivers/video/s1d13xxxfb.c | 2 +- drivers/video/savage/savagefb_driver.c | 1 - sound/pci/atiixp.c | 4 ++-- 19 files changed, 35 insertions(+), 48 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c index 764cac64e21..dd4ebd6af7e 100644 --- a/arch/i386/kernel/cpu/mtrr/main.c +++ b/arch/i386/kernel/cpu/mtrr/main.c @@ -561,7 +561,7 @@ struct mtrr_value { static struct mtrr_value * mtrr_state; -static int mtrr_save(struct sys_device * sysdev, u32 state) +static int mtrr_save(struct sys_device * sysdev, pm_message_t state) { int i; int size = num_var_ranges * sizeof(struct mtrr_value); diff --git a/arch/ppc/platforms/pmac_pic.c b/arch/ppc/platforms/pmac_pic.c index 9f92e1bb7f3..2ce058895e0 100644 --- a/arch/ppc/platforms/pmac_pic.c +++ b/arch/ppc/platforms/pmac_pic.c @@ -619,7 +619,7 @@ not_found: return viaint; } -static int pmacpic_suspend(struct sys_device *sysdev, u32 state) +static int pmacpic_suspend(struct sys_device *sysdev, pm_message_t state) { int viaint = pmacpic_find_viaint(); diff --git a/arch/ppc/syslib/open_pic.c b/arch/ppc/syslib/open_pic.c index ad39b86ca92..53da58523e3 100644 --- a/arch/ppc/syslib/open_pic.c +++ b/arch/ppc/syslib/open_pic.c @@ -948,7 +948,7 @@ static void openpic_cached_disable_irq(u_int irq) * we need something better to deal with that... 
Maybe switch to S1 for * cpufreq changes */ -int openpic_suspend(struct sys_device *sysdev, u32 state) +int openpic_suspend(struct sys_device *sysdev, pm_message_t state) { int i; unsigned long flags; diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 4e44d6e6b7e..64a8e05d581 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -290,7 +290,7 @@ void enable_timer_nmi_watchdog(void) static int nmi_pm_active; /* nmi_active before suspend */ -static int lapic_nmi_suspend(struct sys_device *dev, u32 state) +static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) { nmi_pm_active = nmi_active; disable_lapic_nmi_watchdog(); diff --git a/drivers/ide/ppc/pmac.c b/drivers/ide/ppc/pmac.c index ea65b070a36..d2760b8ca15 100644 --- a/drivers/ide/ppc/pmac.c +++ b/drivers/ide/ppc/pmac.c @@ -1504,7 +1504,7 @@ pmac_ide_macio_attach(struct macio_dev *mdev, const struct of_device_id *match) } static int -pmac_ide_macio_suspend(struct macio_dev *mdev, u32 state) +pmac_ide_macio_suspend(struct macio_dev *mdev, pm_message_t state) { ide_hwif_t *hwif = (ide_hwif_t *)dev_get_drvdata(&mdev->ofdev.dev); int rc = 0; @@ -1527,7 +1527,7 @@ pmac_ide_macio_resume(struct macio_dev *mdev) if (mdev->ofdev.dev.power.power_state != 0) { rc = pmac_ide_do_resume(hwif); if (rc == 0) - mdev->ofdev.dev.power.power_state = 0; + mdev->ofdev.dev.power.power_state = PMSG_ON; } return rc; @@ -1608,7 +1608,7 @@ pmac_ide_pci_attach(struct pci_dev *pdev, const struct pci_device_id *id) } static int -pmac_ide_pci_suspend(struct pci_dev *pdev, u32 state) +pmac_ide_pci_suspend(struct pci_dev *pdev, pm_message_t state) { ide_hwif_t *hwif = (ide_hwif_t *)pci_get_drvdata(pdev); int rc = 0; @@ -1631,7 +1631,7 @@ pmac_ide_pci_resume(struct pci_dev *pdev) if (pdev->dev.power.power_state != 0) { rc = pmac_ide_do_resume(hwif); if (rc == 0) - pdev->dev.power.power_state = 0; + pdev->dev.power.power_state = PMSG_ON; } return rc; diff --git a/drivers/macintosh/mediabay.c b/drivers/macintosh/mediabay.c index 7c16c25fc5d..7e1afca75e4 100644 --- a/drivers/macintosh/mediabay.c +++ b/drivers/macintosh/mediabay.c @@ -724,7 +724,7 @@ static int __pmac media_bay_resume(struct macio_dev *mdev) struct media_bay_info *bay = macio_get_drvdata(mdev); if (mdev->ofdev.dev.power.power_state != 0) { - mdev->ofdev.dev.power.power_state = 0; + mdev->ofdev.dev.power.power_state = PMSG_ON; /* We re-enable the bay using it's previous content only if it did not change. 
Note those bozo timings, diff --git a/drivers/media/video/bttv-driver.c b/drivers/media/video/bttv-driver.c index eee9322ce21..087efb4dea0 100644 --- a/drivers/media/video/bttv-driver.c +++ b/drivers/media/video/bttv-driver.c @@ -4047,7 +4047,6 @@ static int bttv_suspend(struct pci_dev *pci_dev, pm_message_t state) struct bttv_buffer_set idle; unsigned long flags; - dprintk("bttv%d: suspend %d\n", btv->c.nr, state); /* stop dma + irqs */ spin_lock_irqsave(&btv->s_lock,flags); diff --git a/drivers/media/video/msp3400.c b/drivers/media/video/msp3400.c index 62f1b8ddb98..e956234abf2 100644 --- a/drivers/media/video/msp3400.c +++ b/drivers/media/video/msp3400.c @@ -1416,7 +1416,7 @@ static int msp_detach(struct i2c_client *client); static int msp_probe(struct i2c_adapter *adap); static int msp_command(struct i2c_client *client, unsigned int cmd, void *arg); -static int msp_suspend(struct device * dev, u32 state, u32 level); +static int msp_suspend(struct device * dev, pm_message_t state, u32 level); static int msp_resume(struct device * dev, u32 level); static void msp_wake_thread(struct i2c_client *client); @@ -1817,7 +1817,7 @@ static int msp_command(struct i2c_client *client, unsigned int cmd, void *arg) return 0; } -static int msp_suspend(struct device * dev, u32 state, u32 level) +static int msp_suspend(struct device * dev, pm_message_t state, u32 level) { struct i2c_client *c = container_of(dev, struct i2c_client, dev); diff --git a/drivers/media/video/tda9887.c b/drivers/media/video/tda9887.c index 108c3ad7d62..a28a395d6df 100644 --- a/drivers/media/video/tda9887.c +++ b/drivers/media/video/tda9887.c @@ -760,7 +760,7 @@ tda9887_command(struct i2c_client *client, unsigned int cmd, void *arg) return 0; } -static int tda9887_suspend(struct device * dev, u32 state, u32 level) +static int tda9887_suspend(struct device * dev, pm_message_t state, u32 level) { dprintk("tda9887: suspend\n"); return 0; diff --git a/drivers/media/video/tuner-core.c b/drivers/media/video/tuner-core.c index f0a579827a2..a155e99a263 100644 --- a/drivers/media/video/tuner-core.c +++ b/drivers/media/video/tuner-core.c @@ -672,7 +672,7 @@ static int tuner_command(struct i2c_client *client, unsigned int cmd, void *arg) return 0; } -static int tuner_suspend(struct device *dev, u32 state, u32 level) +static int tuner_suspend(struct device *dev, pm_message_t state, u32 level) { struct i2c_client *c = container_of (dev, struct i2c_client, dev); struct tuner *t = i2c_get_clientdata (c); diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c index 7babf6af4e2..55a72c7ad00 100644 --- a/drivers/net/bnx2.c +++ b/drivers/net/bnx2.c @@ -2004,14 +2004,14 @@ bnx2_init_cpus(struct bnx2 *bp) } static int -bnx2_set_power_state(struct bnx2 *bp, int state) +bnx2_set_power_state(struct bnx2 *bp, pci_power_t state) { u16 pmcsr; pci_read_config_word(bp->pdev, bp->pm_cap + PCI_PM_CTRL, &pmcsr); switch (state) { - case 0: { + case PCI_D0: { u32 val; pci_write_config_word(bp->pdev, bp->pm_cap + PCI_PM_CTRL, @@ -2032,7 +2032,7 @@ bnx2_set_power_state(struct bnx2 *bp, int state) REG_WR(bp, BNX2_RPM_CONFIG, val); break; } - case 3: { + case PCI_D3hot: { int i; u32 val, wol_msg; @@ -3886,7 +3886,7 @@ bnx2_open(struct net_device *dev) struct bnx2 *bp = dev->priv; int rc; - bnx2_set_power_state(bp, 0); + bnx2_set_power_state(bp, PCI_D0); bnx2_disable_int(bp); rc = bnx2_alloc_mem(bp); @@ -4197,7 +4197,7 @@ bnx2_close(struct net_device *dev) bnx2_free_mem(bp); bp->link_up = 0; netif_carrier_off(bp->dev); - bnx2_set_power_state(bp, 3); + 
bnx2_set_power_state(bp, PCI_D3hot); return 0; } @@ -5203,7 +5203,7 @@ bnx2_init_board(struct pci_dev *pdev, struct net_device *dev) BNX2_PCICFG_MISC_CONFIG_REG_WINDOW_ENA | BNX2_PCICFG_MISC_CONFIG_TARGET_MB_WORD_SWAP); - bnx2_set_power_state(bp, 0); + bnx2_set_power_state(bp, PCI_D0); bp->chip_id = REG_RD(bp, BNX2_MISC_ID); @@ -5495,7 +5495,7 @@ bnx2_remove_one(struct pci_dev *pdev) } static int -bnx2_suspend(struct pci_dev *pdev, u32 state) +bnx2_suspend(struct pci_dev *pdev, pm_message_t state) { struct net_device *dev = pci_get_drvdata(pdev); struct bnx2 *bp = dev->priv; @@ -5513,7 +5513,7 @@ bnx2_suspend(struct pci_dev *pdev, u32 state) reset_code = BNX2_DRV_MSG_CODE_SUSPEND_NO_WOL; bnx2_reset_chip(bp, reset_code); bnx2_free_skbs(bp); - bnx2_set_power_state(bp, state); + bnx2_set_power_state(bp, pci_choose_state(pdev, state)); return 0; } @@ -5526,7 +5526,7 @@ bnx2_resume(struct pci_dev *pdev) if (!netif_running(dev)) return 0; - bnx2_set_power_state(bp, 0); + bnx2_set_power_state(bp, PCI_D0); netif_device_attach(dev); bnx2_init_nic(bp); bnx2_netif_start(bp); diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c index 9b596e0bbf9..7c8a0a22dcd 100644 --- a/drivers/net/e1000/e1000_main.c +++ b/drivers/net/e1000/e1000_main.c @@ -162,7 +162,7 @@ static void e1000_vlan_rx_add_vid(struct net_device *netdev, uint16_t vid); static void e1000_vlan_rx_kill_vid(struct net_device *netdev, uint16_t vid); static void e1000_restore_vlan(struct e1000_adapter *adapter); -static int e1000_suspend(struct pci_dev *pdev, uint32_t state); +static int e1000_suspend(struct pci_dev *pdev, pm_message_t state); #ifdef CONFIG_PM static int e1000_resume(struct pci_dev *pdev); #endif @@ -3642,7 +3642,7 @@ e1000_set_spd_dplx(struct e1000_adapter *adapter, uint16_t spddplx) } static int -e1000_suspend(struct pci_dev *pdev, uint32_t state) +e1000_suspend(struct pci_dev *pdev, pm_message_t state) { struct net_device *netdev = pci_get_drvdata(pdev); struct e1000_adapter *adapter = netdev_priv(netdev); @@ -3726,9 +3726,7 @@ e1000_suspend(struct pci_dev *pdev, uint32_t state) } pci_disable_device(pdev); - - state = (state > 0) ? 
3 : 0; - pci_set_power_state(pdev, state); + pci_set_power_state(pdev, pci_choose_state(pdev, state)); return 0; } @@ -3741,13 +3739,13 @@ e1000_resume(struct pci_dev *pdev) struct e1000_adapter *adapter = netdev_priv(netdev); uint32_t manc, ret_val, swsm; - pci_set_power_state(pdev, 0); + pci_set_power_state(pdev, PCI_D0); pci_restore_state(pdev); ret_val = pci_enable_device(pdev); pci_set_master(pdev); - pci_enable_wake(pdev, 3, 0); - pci_enable_wake(pdev, 4, 0); /* 4 == D3 cold */ + pci_enable_wake(pdev, PCI_D3hot, 0); + pci_enable_wake(pdev, PCI_D3cold, 0); e1000_reset(adapter); E1000_WRITE_REG(&adapter->hw, WUS, ~0); diff --git a/drivers/net/irda/vlsi_ir.c b/drivers/net/irda/vlsi_ir.c index 006e4f57560..4be95398bac 100644 --- a/drivers/net/irda/vlsi_ir.c +++ b/drivers/net/irda/vlsi_ir.c @@ -1749,11 +1749,6 @@ static int vlsi_irda_suspend(struct pci_dev *pdev, pm_message_t state) struct net_device *ndev = pci_get_drvdata(pdev); vlsi_irda_dev_t *idev; - if (state < 1 || state > 3 ) { - IRDA_ERROR("%s - %s: invalid pm state request: %u\n", - __FUNCTION__, PCIDEV_NAME(pdev), state); - return 0; - } if (!ndev) { IRDA_ERROR("%s - %s: no netdevice \n", __FUNCTION__, PCIDEV_NAME(pdev)); @@ -1781,7 +1776,7 @@ static int vlsi_irda_suspend(struct pci_dev *pdev, pm_message_t state) idev->new_baud = idev->baud; } - pci_set_power_state(pdev,state); + pci_set_power_state(pdev, pci_choose_state(pdev, state)); pdev->current_state = state; idev->resume_ok = 1; up(&idev->sem); @@ -1807,8 +1802,8 @@ static int vlsi_irda_resume(struct pci_dev *pdev) return 0; } - pci_set_power_state(pdev, 0); - pdev->current_state = 0; + pci_set_power_state(pdev, PCI_D0); + pdev->current_state = PM_EVENT_ON; if (!idev->resume_ok) { /* should be obsolete now - but used to happen due to: diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 41f62c0c5fc..5e81494e9a9 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -128,7 +128,7 @@ static int mdio_bus_match(struct device *dev, struct device_driver *drv) /* Suspend and resume. Copied from platform_suspend and * platform_resume */ -static int mdio_bus_suspend(struct device * dev, u32 state) +static int mdio_bus_suspend(struct device * dev, pm_message_t state) { int ret = 0; struct device_driver *drv = dev->driver; diff --git a/drivers/net/wireless/orinoco_pci.c b/drivers/net/wireless/orinoco_pci.c index 7a6f52ea7fa..42e03438291 100644 --- a/drivers/net/wireless/orinoco_pci.c +++ b/drivers/net/wireless/orinoco_pci.c @@ -301,8 +301,6 @@ static int orinoco_pci_suspend(struct pci_dev *pdev, pm_message_t state) unsigned long flags; int err; - printk(KERN_DEBUG "%s: Orinoco-PCI entering sleep mode (state=%d)\n", - dev->name, state); err = orinoco_lock(priv, &flags); if (err) { diff --git a/drivers/net/wireless/prism54/islpci_hotplug.c b/drivers/net/wireless/prism54/islpci_hotplug.c index c17391d947f..dc040caab7d 100644 --- a/drivers/net/wireless/prism54/islpci_hotplug.c +++ b/drivers/net/wireless/prism54/islpci_hotplug.c @@ -267,8 +267,6 @@ prism54_suspend(struct pci_dev *pdev, pm_message_t state) islpci_private *priv = ndev ? 
netdev_priv(ndev) : NULL; BUG_ON(!priv); - printk(KERN_NOTICE "%s: got suspend request (state %d)\n", - ndev->name, state); pci_save_state(pdev); diff --git a/drivers/video/s1d13xxxfb.c b/drivers/video/s1d13xxxfb.c index 3848be2b9d2..fa98d91c42e 100644 --- a/drivers/video/s1d13xxxfb.c +++ b/drivers/video/s1d13xxxfb.c @@ -655,7 +655,7 @@ bail: } #ifdef CONFIG_PM -static int s1d13xxxfb_suspend(struct device *dev, u32 state, u32 level) +static int s1d13xxxfb_suspend(struct device *dev, pm_message_t state, u32 level) { struct fb_info *info = dev_get_drvdata(dev); struct s1d13xxxfb_par *s1dfb = info->par; diff --git a/drivers/video/savage/savagefb_driver.c b/drivers/video/savage/savagefb_driver.c index f4633d1891f..117ad42f120 100644 --- a/drivers/video/savage/savagefb_driver.c +++ b/drivers/video/savage/savagefb_driver.c @@ -2110,7 +2110,6 @@ static int savagefb_suspend (struct pci_dev* dev, pm_message_t state) struct savagefb_par *par = (struct savagefb_par *)info->par; DBG("savagefb_suspend"); - printk(KERN_DEBUG "state: %u\n", state); acquire_console_sem(); fb_set_suspend(info, pci_choose_state(dev, state)); diff --git a/sound/pci/atiixp.c b/sound/pci/atiixp.c index 904d17394e1..188df085b7e 100644 --- a/sound/pci/atiixp.c +++ b/sound/pci/atiixp.c @@ -1427,7 +1427,7 @@ static int snd_atiixp_suspend(snd_card_t *card, pm_message_t state) snd_atiixp_aclink_down(chip); snd_atiixp_chip_stop(chip); - pci_set_power_state(chip->pci, 3); + pci_set_power_state(chip->pci, PCI_D3hot); pci_disable_device(chip->pci); return 0; } @@ -1438,7 +1438,7 @@ static int snd_atiixp_resume(snd_card_t *card) int i; pci_enable_device(chip->pci); - pci_set_power_state(chip->pci, 0); + pci_set_power_state(chip->pci, PCI_D0); pci_set_master(chip->pci); snd_atiixp_aclink_reset(chip); -- cgit v1.2.3-70-g09d2 From c3c433e4f33afe255389ba3b1a003dc8deb3de9a Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Sat, 3 Sep 2005 15:57:07 -0700 Subject: [PATCH] add suspend/resume for timer The timers lack .suspend/.resume methods. Because of this, jiffies got a big compensation after an S3 resume. The softlockup watchdog then reports an oops. This occurred with HPET enabled, but it's also possible for other timers.
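For reference, a timer source opts into this by filling the two new timer_opts hooks; a minimal sketch (the "foo" names and bodies are hypothetical, only the hook signatures come from this patch):

static int foo_timer_suspend(pm_message_t state)
{
	/* quiesce the hardware; monotonic state is resynced on resume */
	return 0;
}

static int foo_timer_resume(void)
{
	/* re-read the hardware counter so monotonic time doesn't jump */
	return 0;
}

static struct timer_opts timer_foo = {
	.name = "foo",
	.suspend = foo_timer_suspend,
	.resume = foo_timer_resume,
	/* .mark_offset, .get_offset etc. omitted in this sketch */
};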
Signed-off-by: Shaohua Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/time.c | 10 ++++++++++ arch/i386/kernel/timers/timer_hpet.c | 14 ++++++++++++++ arch/i386/kernel/timers/timer_pit.c | 27 --------------------------- arch/i386/kernel/timers/timer_pm.c | 9 +++++++++ arch/i386/kernel/timers/timer_tsc.c | 14 ++++++++++++++ include/asm-i386/timer.h | 3 +++ 6 files changed, 50 insertions(+), 27 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index 0ee9dee8af0..6f794a78ee1 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -383,6 +383,7 @@ void notify_arch_cmos_timer(void) static long clock_cmos_diff, sleep_start; +static struct timer_opts *last_timer; static int timer_suspend(struct sys_device *dev, pm_message_t state) { /* @@ -391,6 +392,10 @@ static int timer_suspend(struct sys_device *dev, pm_message_t state) clock_cmos_diff = -get_cmos_time(); clock_cmos_diff += get_seconds(); sleep_start = get_cmos_time(); + last_timer = cur_timer; + cur_timer = &timer_none; + if (last_timer->suspend) + last_timer->suspend(state); return 0; } @@ -404,6 +409,7 @@ static int timer_resume(struct sys_device *dev) if (is_hpet_enabled()) hpet_reenable(); #endif + setup_pit_timer(); sec = get_cmos_time() + clock_cmos_diff; sleep_length = (get_cmos_time() - sleep_start) * HZ; write_seqlock_irqsave(&xtime_lock, flags); @@ -412,6 +418,10 @@ static int timer_resume(struct sys_device *dev) write_sequnlock_irqrestore(&xtime_lock, flags); jiffies += sleep_length; wall_jiffies += sleep_length; + if (last_timer->resume) + last_timer->resume(); + cur_timer = last_timer; + last_timer = NULL; return 0; } diff --git a/arch/i386/kernel/timers/timer_hpet.c b/arch/i386/kernel/timers/timer_hpet.c index cbb3f221ced..001de97c9e4 100644 --- a/arch/i386/kernel/timers/timer_hpet.c +++ b/arch/i386/kernel/timers/timer_hpet.c @@ -181,6 +181,19 @@ static int __init init_hpet(char* override) return 0; } +static int hpet_resume(void) +{ + write_seqlock(&monotonic_lock); + /* Assume this is the last mark offset time */ + rdtsc(last_tsc_low, last_tsc_high); + + if (hpet_use_timer) + hpet_last = hpet_readl(HPET_T0_CMP) - hpet_tick; + else + hpet_last = hpet_readl(HPET_COUNTER); + write_sequnlock(&monotonic_lock); + return 0; +} /************************************************************/ /* tsc timer_opts struct */ @@ -190,6 +203,7 @@ static struct timer_opts timer_hpet __read_mostly = { .get_offset = get_offset_hpet, .monotonic_clock = monotonic_clock_hpet, .delay = delay_hpet, + .resume = hpet_resume, }; struct init_timer_opts __initdata timer_hpet_init = { diff --git a/arch/i386/kernel/timers/timer_pit.c b/arch/i386/kernel/timers/timer_pit.c index 06de036a820..eddb6403823 100644 --- a/arch/i386/kernel/timers/timer_pit.c +++ b/arch/i386/kernel/timers/timer_pit.c @@ -175,30 +175,3 @@ void setup_pit_timer(void) outb(LATCH >> 8 , PIT_CH0); /* MSB */ spin_unlock_irqrestore(&i8253_lock, flags); } - -static int timer_resume(struct sys_device *dev) -{ - setup_pit_timer(); - return 0; -} - -static struct sysdev_class timer_sysclass = { - set_kset_name("timer_pit"), - .resume = timer_resume, -}; - -static struct sys_device device_timer = { - .id = 0, - .cls = &timer_sysclass, -}; - -static int __init init_timer_sysfs(void) -{ - int error = sysdev_class_register(&timer_sysclass); - if (!error) - error = sysdev_register(&device_timer); - return error; -} - -device_initcall(init_timer_sysfs); - diff --git a/arch/i386/kernel/timers/timer_pm.c 
b/arch/i386/kernel/timers/timer_pm.c index 4ef20e66349..264edaaac31 100644 --- a/arch/i386/kernel/timers/timer_pm.c +++ b/arch/i386/kernel/timers/timer_pm.c @@ -186,6 +186,14 @@ static void mark_offset_pmtmr(void) } } +static int pmtmr_resume(void) +{ + write_seqlock(&monotonic_lock); + /* Assume this is the last mark offset time */ + offset_tick = read_pmtmr(); + write_sequnlock(&monotonic_lock); + return 0; +} static unsigned long long monotonic_clock_pmtmr(void) { @@ -247,6 +255,7 @@ static struct timer_opts timer_pmtmr = { .monotonic_clock = monotonic_clock_pmtmr, .delay = delay_pmtmr, .read_timer = read_timer_tsc, + .resume = pmtmr_resume, }; struct init_timer_opts __initdata timer_pmtmr_init = { diff --git a/arch/i386/kernel/timers/timer_tsc.c b/arch/i386/kernel/timers/timer_tsc.c index 8f4e4d5bc56..6dd470cc9f7 100644 --- a/arch/i386/kernel/timers/timer_tsc.c +++ b/arch/i386/kernel/timers/timer_tsc.c @@ -543,6 +543,19 @@ static int __init init_tsc(char* override) return -ENODEV; } +static int tsc_resume(void) +{ + write_seqlock(&monotonic_lock); + /* Assume this is the last mark offset time */ + rdtsc(last_tsc_low, last_tsc_high); +#ifdef CONFIG_HPET_TIMER + if (is_hpet_enabled() && hpet_use_timer) + hpet_last = hpet_readl(HPET_COUNTER); +#endif + write_sequnlock(&monotonic_lock); + return 0; +} + #ifndef CONFIG_X86_TSC /* disable flag for tsc. Takes effect by clearing the TSC cpu flag * in cpu/common.c */ @@ -573,6 +586,7 @@ static struct timer_opts timer_tsc = { .monotonic_clock = monotonic_clock_tsc, .delay = delay_tsc, .read_timer = read_timer_tsc, + .resume = tsc_resume, }; struct init_timer_opts __initdata timer_tsc_init = { diff --git a/include/asm-i386/timer.h b/include/asm-i386/timer.h index dcf1e07db08..aed16437479 100644 --- a/include/asm-i386/timer.h +++ b/include/asm-i386/timer.h @@ -1,6 +1,7 @@ #ifndef _ASMi386_TIMER_H #define _ASMi386_TIMER_H #include +#include /** * struct timer_ops - used to define a timer source @@ -23,6 +24,8 @@ struct timer_opts { unsigned long long (*monotonic_clock)(void); void (*delay)(unsigned long); unsigned long (*read_timer)(void); + int (*suspend)(pm_message_t state); + int (*resume)(void); }; struct init_timer_opts { -- cgit v1.2.3-70-g09d2 From 94c80b2598dbd2b8a6fe5f5c2c3af1beb37f66c7 Mon Sep 17 00:00:00 2001 From: Bodo Stroesser Date: Sat, 3 Sep 2005 15:57:13 -0700 Subject: [PATCH] Ptrace/i386: fix "syscall audit" interaction with singlestep Paolo 'Blaisorblade' Giarrusso Avoid giving two traps for singlestep instead of one, when syscall auditing is enabled. In fact no singlestep trap is sent on syscall entry, only on syscall exit, as can be seen in entry.S: # Note that in this mask _TIF_SINGLESTEP is not tested !!! <<<<<<<<<<<<<< testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_flags(%ebp) jnz syscall_trace_entry ... syscall_trace_entry: ... call do_syscall_trace But auditing a SINGLESTEP'ed process causes do_syscall_trace to be called, so the tracer will get one more trap on the syscall entry path, which it shouldn't. 
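For illustration, the symptom as seen from user space (a minimal sketch; the child and its single syscall are made up): a tracer single-stepping across a syscall should see exactly one SIGTRAP stop per instruction, but with auditing enabled it used to get an extra stop on syscall entry.

#include <signal.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	int status;
	pid_t pid = fork();

	if (pid == 0) {
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		raise(SIGSTOP);		/* hand control to the tracer */
		getpid();		/* the syscall being stepped over */
		_exit(0);
	}
	waitpid(pid, &status, 0);
	while (WIFSTOPPED(status)) {
		ptrace(PTRACE_SINGLESTEP, pid, NULL, 0);
		if (waitpid(pid, &status, 0) < 0)
			break;
		if (WIFSTOPPED(status))
			printf("stop, sig=%d\n", WSTOPSIG(status));
	}
	return 0;
}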
Signed-off-by: Paolo 'Blaisorblade' Giarrusso CC: Roland McGrath Cc: Jeff Dike Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/ptrace.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c index 0da59b42843..5ee9e1d6065 100644 --- a/arch/i386/kernel/ptrace.c +++ b/arch/i386/kernel/ptrace.c @@ -683,8 +683,19 @@ void do_syscall_trace(struct pt_regs *regs, int entryexit) /* do the secure computing check first */ secure_computing(regs->orig_eax); - if (unlikely(current->audit_context) && entryexit) - audit_syscall_exit(current, AUDITSC_RESULT(regs->eax), regs->eax); + if (unlikely(current->audit_context)) { + if (entryexit) + audit_syscall_exit(current, AUDITSC_RESULT(regs->eax), regs->eax); + + /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only + * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is + * not used, entry.S will call us only on syscall exit, not + * entry ; so when TIF_SYSCALL_AUDIT is used we must avoid + * calling send_sigtrap() on syscall entry. + */ + else if (is_singlestep) + goto out; + } if (!(current->ptrace & PT_PTRACED)) goto out; -- cgit v1.2.3-70-g09d2 From ed75e8d58010fdc06e2c3a81bfbebae92314c7e3 Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Sat, 3 Sep 2005 15:57:18 -0700 Subject: [PATCH] UML Support - Ptrace: adds the host SYSEMU support, for UML and general usage Jeff Dike , Paolo 'Blaisorblade' Giarrusso , Bodo Stroesser Adds a new ptrace(2) mode, called PTRACE_SYSEMU, resembling PTRACE_SYSCALL except that the kernel does not execute the requested syscall; this is useful to improve performance for virtual environments, like UML, which want to run the syscall on their own. In fact, using PTRACE_SYSCALL means stopping child execution twice, on entry and on exit, and each time you also have two context switches; with SYSEMU you avoid the 2nd stop and so save two context switches per syscall. Also, some architectures don't have support in the host for changing the syscall number via ptrace(), which is currently needed to skip syscall execution (UML turns any syscall into getpid() to avoid it being executed on the host). Fixing that is hard, while SYSEMU is easier to implement. * This version of the patch includes some suggestions from Jeff Dike to avoid adding any instructions to the syscall fast path, plus some other little changes, by myself, to make it work even when the syscall is executed with SYSENTER (but I'm unsure about them). It has been widely tested for quite a lot of time. * Various fixes were included to handle the various switches between various states, i.e. when for instance a syscall entry is traced with one of PT_SYSCALL / _SYSEMU / _SINGLESTEP and another one is used on exit. Basically, this is done by remembering which one of them was used even after the call to ptrace_notify(). * We're combining TIF_SYSCALL_EMU with TIF_SYSCALL_TRACE or TIF_SINGLESTEP to make do_syscall_trace() notice that the current syscall was started with SYSEMU on entry, so that no notification ought to be done in the exit path; this is a bit of a hack, so this problem is solved in another way in the next patches. * Also, the effects of the patch: "Ptrace - i386: fix Syscall Audit interaction with singlestep" are cancelled; they are restored in the last patch of this series. Detailed descriptions of the patches doing this kind of processing follow (but I've already summed everything up).
* Fix behaviour when changing interception kind #1. In do_syscall_trace(), we check the status of the TIF_SYSCALL_EMU flag only after doing the debugger notification; but the debugger might have changed the status of this flag because it continued execution with PTRACE_SYSCALL, so this is wrong. This patch fixes it by saving the flag status before calling ptrace_notify(). * Fix behaviour when changing interception kind #2: avoid intercepting syscall on return when using SYSCALL again. A guest process switching from using PTRACE_SYSEMU to PTRACE_SYSCALL crashes. The problem is in arch/i386/kernel/entry.S. The current SYSEMU patch prevents the syscall handler from being called, but does not prevent do_syscall_trace() from being called after this for syscall completion interception. The appended patch fixes this. It reuses the flag TIF_SYSCALL_EMU to remember "we come from PTRACE_SYSEMU and now are in PTRACE_SYSCALL", since the flag is unused in the depicted situation. * Fix behaviour when changing interception kind #3: avoid intercepting syscall on return when using SINGLESTEP. When testing 2.6.9 with the skas3.v6 patch and my latest patch, I had problems with singlestepping on UML in SKAS with SYSEMU. It looped receiving SIGTRAPs without moving forward. EIP of the traced process was the same for all SIGTRAPs. What's missing is to handle switching from PTRACE_SYSEMU to PTRACE_SINGLESTEP in a way very similar to what is done for the change from PTRACE_SYSEMU to PTRACE_SYSCALL. I.e., after calling ptrace(PTRACE_SYSEMU), on the return path, the debugger is notified and then wakes up the process; the syscall is executed (or skipped, when do_syscall_trace() returns 0, i.e. when using PTRACE_SYSEMU), and do_syscall_trace() is called again. Since we are on the return path of a SYSEMU'd syscall, if the wake up is performed through ptrace(PTRACE_SYSCALL), we must still avoid notifying the parent of the syscall exit. Now, this behaviour is extended even to resuming with PTRACE_SINGLESTEP.
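To make the intended usage concrete, a minimal user-space sketch (the emulation is faked here: a real tracer such as UML must itself implement every syscall it skips, including exit; only PTRACE_SYSEMU comes from this patch, and the register access is i386-specific):

#include <signal.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef PTRACE_SYSEMU
#define PTRACE_SYSEMU 31	/* the request added by this patch */
#endif

int main(void)
{
	int status;
	pid_t pid = fork();

	if (pid == 0) {
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		raise(SIGSTOP);
		getpid();		/* entered but never executed by the host */
		_exit(0);
	}
	waitpid(pid, &status, 0);

	/* One stop per syscall: resume with SYSEMU, stop at the next entry. */
	ptrace(PTRACE_SYSEMU, pid, NULL, 0);
	waitpid(pid, &status, 0);

	struct user_regs_struct regs;
	ptrace(PTRACE_GETREGS, pid, NULL, &regs);
	printf("child entered syscall %ld; host skipped it\n", regs.orig_eax);
	regs.eax = 42;			/* "emulate" it: inject a result */
	ptrace(PTRACE_SETREGS, pid, NULL, &regs);

	ptrace(PTRACE_KILL, pid, NULL, NULL);
	return 0;
}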
Signed-off-by: Paolo 'Blaisorblade' Giarrusso Cc: Jeff Dike Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/entry.S | 9 ++++--- arch/i386/kernel/ptrace.c | 57 +++++++++++++++++++++++++++++------------- include/asm-i386/thread_info.h | 5 +++- include/linux/ptrace.h | 1 + kernel/fork.c | 3 +++ 5 files changed, 53 insertions(+), 22 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index a991d4e5edd..b389e5f3bde 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -203,7 +203,7 @@ sysenter_past_esp: GET_THREAD_INFO(%ebp) /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ - testw $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_flags(%ebp) + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) jnz syscall_trace_entry cmpl $(nr_syscalls), %eax jae syscall_badsys @@ -226,9 +226,9 @@ ENTRY(system_call) pushl %eax # save orig_eax SAVE_ALL GET_THREAD_INFO(%ebp) - # system call tracing in operation + # system call tracing in operation / emulation /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ - testw $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_flags(%ebp) + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) jnz syscall_trace_entry cmpl $(nr_syscalls), %eax jae syscall_badsys @@ -338,6 +338,9 @@ syscall_trace_entry: movl %esp, %eax xorl %edx,%edx call do_syscall_trace + cmpl $0, %eax + jne syscall_exit # ret != 0 -> running under PTRACE_SYSEMU, + # so must skip actual syscall movl ORIG_EAX(%esp), %eax cmpl $(nr_syscalls), %eax jnae syscall_call diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c index 5ee9e1d6065..5b569dc1c22 100644 --- a/arch/i386/kernel/ptrace.c +++ b/arch/i386/kernel/ptrace.c @@ -509,15 +509,27 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data) } break; + case PTRACE_SYSEMU: /* continue and stop at next syscall, which will not be executed */ case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ case PTRACE_CONT: /* restart after signal. */ ret = -EIO; if (!valid_signal(data)) break; + /* If we came here with PTRACE_SYSEMU and now continue with + * PTRACE_SYSCALL, entry.S used to intercept the syscall return. + * But it shouldn't! + * So we don't clear TIF_SYSCALL_EMU, which is always unused in + * this special case, to remember, we came from SYSEMU. That + * flag will be cleared by do_syscall_trace(). 
+ */ + if (request == PTRACE_SYSEMU) { + set_tsk_thread_flag(child, TIF_SYSCALL_EMU); + } else if (request == PTRACE_CONT) { + clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); + } if (request == PTRACE_SYSCALL) { set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - } - else { + } else { clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); } child->exit_code = data; @@ -546,6 +558,8 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data) ret = -EIO; if (!valid_signal(data)) break; + /*See do_syscall_trace to know why we don't clear + * TIF_SYSCALL_EMU.*/ clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); set_singlestep(child); child->exit_code = data; @@ -678,37 +692,43 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) * - triggered by current->work.syscall_trace */ __attribute__((regparm(3))) -void do_syscall_trace(struct pt_regs *regs, int entryexit) +int do_syscall_trace(struct pt_regs *regs, int entryexit) { + int is_sysemu, is_systrace, is_singlestep, ret = 0; /* do the secure computing check first */ secure_computing(regs->orig_eax); - if (unlikely(current->audit_context)) { - if (entryexit) - audit_syscall_exit(current, AUDITSC_RESULT(regs->eax), regs->eax); - - /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only - * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is - * not used, entry.S will call us only on syscall exit, not - * entry ; so when TIF_SYSCALL_AUDIT is used we must avoid - * calling send_sigtrap() on syscall entry. - */ - else if (is_singlestep) - goto out; - } + if (unlikely(current->audit_context) && entryexit) + audit_syscall_exit(current, AUDITSC_RESULT(regs->eax), regs->eax); if (!(current->ptrace & PT_PTRACED)) goto out; + is_sysemu = test_thread_flag(TIF_SYSCALL_EMU); + is_systrace = test_thread_flag(TIF_SYSCALL_TRACE); + is_singlestep = test_thread_flag(TIF_SINGLESTEP); + + /* We can detect the case of coming from PTRACE_SYSEMU and now running + * with PTRACE_SYSCALL or PTRACE_SINGLESTEP, by TIF_SYSCALL_EMU being + * set additionally. + * If so let's reset the flag and return without action (no singlestep + * nor syscall tracing, since no actual step has been executed). + */ + if (is_sysemu && (is_systrace || is_singlestep)) { + clear_thread_flag(TIF_SYSCALL_EMU); + goto out; + } + /* Fake a debug trap */ if (test_thread_flag(TIF_SINGLESTEP)) send_sigtrap(current, regs, 0); - if (!test_thread_flag(TIF_SYSCALL_TRACE)) + if (!is_systrace && !is_sysemu) goto out; /* the 0x80 provides a way for the tracing parent to distinguish between a syscall stop and SIGTRAP delivery */ + /* Note that the debugger could change the result of test_thread_flag!*/ ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 
0x80 : 0)); /* @@ -720,9 +740,10 @@ void do_syscall_trace(struct pt_regs *regs, int entryexit) send_sig(current->exit_code, current, 1); current->exit_code = 0; } + ret = is_sysemu; out: if (unlikely(current->audit_context) && !entryexit) audit_syscall_entry(current, AUDIT_ARCH_I386, regs->orig_eax, regs->ebx, regs->ecx, regs->edx, regs->esi); - + return ret; } diff --git a/include/asm-i386/thread_info.h b/include/asm-i386/thread_info.h index 95add81237e..e2cb9fa6f56 100644 --- a/include/asm-i386/thread_info.h +++ b/include/asm-i386/thread_info.h @@ -139,6 +139,7 @@ register unsigned long current_stack_pointer asm("esp") __attribute_used__; #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ #define TIF_SINGLESTEP 4 /* restore singlestep on return to user mode */ #define TIF_IRET 5 /* return with iret */ +#define TIF_SYSCALL_EMU 6 /* syscall emulation active */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ @@ -150,13 +151,15 @@ register unsigned long current_stack_pointer asm("esp") __attribute_used__; #define _TIF_NEED_RESCHED (1< Date: Sat, 3 Sep 2005 15:57:19 -0700 Subject: [PATCH] Uml support: reorganize PTRACE_SYSEMU support With this patch, we change the way we handle switching from PTRACE_SYSEMU to PTRACE_{SINGLESTEP,SYSCALL}, to free TIF_SYSCALL_EMU from double use as a preparation for PTRACE_SYSEMU_SINGLESTEP extension, without changing the behavior of the host kernel. Signed-off-by: Bodo Stroesser Signed-off-by: Paolo 'Blaisorblade' Giarrusso Cc: Jeff Dike Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/entry.S | 8 +++++++- arch/i386/kernel/ptrace.c | 43 ++++++++++++++----------------------------- 2 files changed, 21 insertions(+), 30 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index b389e5f3bde..9a47723469c 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -339,12 +339,18 @@ syscall_trace_entry: xorl %edx,%edx call do_syscall_trace cmpl $0, %eax - jne syscall_exit # ret != 0 -> running under PTRACE_SYSEMU, + jne syscall_skip # ret != 0 -> running under PTRACE_SYSEMU, # so must skip actual syscall movl ORIG_EAX(%esp), %eax cmpl $(nr_syscalls), %eax jnae syscall_call jmp syscall_exit +syscall_skip: + cli # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + movl TI_flags(%ebp), %ecx + jmp work_pending # perform syscall exit tracing ALIGN diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c index 5b569dc1c22..18642f05dde 100644 --- a/arch/i386/kernel/ptrace.c +++ b/arch/i386/kernel/ptrace.c @@ -515,21 +515,14 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data) ret = -EIO; if (!valid_signal(data)) break; - /* If we came here with PTRACE_SYSEMU and now continue with - * PTRACE_SYSCALL, entry.S used to intercept the syscall return. - * But it shouldn't! - * So we don't clear TIF_SYSCALL_EMU, which is always unused in - * this special case, to remember, we came from SYSEMU. That - * flag will be cleared by do_syscall_trace(). 
- */ if (request == PTRACE_SYSEMU) { set_tsk_thread_flag(child, TIF_SYSCALL_EMU); - } else if (request == PTRACE_CONT) { - clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); - } - if (request == PTRACE_SYSCALL) { + clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + } else if (request == PTRACE_SYSCALL) { set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); } else { + clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); } child->exit_code = data; @@ -558,8 +551,7 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data) ret = -EIO; if (!valid_signal(data)) break; - /*See do_syscall_trace to know why we don't clear - * TIF_SYSCALL_EMU.*/ + clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); set_singlestep(child); child->exit_code = data; @@ -694,7 +686,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) __attribute__((regparm(3))) int do_syscall_trace(struct pt_regs *regs, int entryexit) { - int is_sysemu, is_systrace, is_singlestep, ret = 0; + int is_sysemu, is_singlestep, ret = 0; /* do the secure computing check first */ secure_computing(regs->orig_eax); @@ -705,25 +697,13 @@ int do_syscall_trace(struct pt_regs *regs, int entryexit) goto out; is_sysemu = test_thread_flag(TIF_SYSCALL_EMU); - is_systrace = test_thread_flag(TIF_SYSCALL_TRACE); is_singlestep = test_thread_flag(TIF_SINGLESTEP); - /* We can detect the case of coming from PTRACE_SYSEMU and now running - * with PTRACE_SYSCALL or PTRACE_SINGLESTEP, by TIF_SYSCALL_EMU being - * set additionally. - * If so let's reset the flag and return without action (no singlestep - * nor syscall tracing, since no actual step has been executed). - */ - if (is_sysemu && (is_systrace || is_singlestep)) { - clear_thread_flag(TIF_SYSCALL_EMU); - goto out; - } - /* Fake a debug trap */ - if (test_thread_flag(TIF_SINGLESTEP)) + if (is_singlestep) send_sigtrap(current, regs, 0); - if (!is_systrace && !is_sysemu) + if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu) goto out; /* the 0x80 provides a way for the tracing parent to distinguish @@ -745,5 +725,10 @@ int do_syscall_trace(struct pt_regs *regs, int entryexit) if (unlikely(current->audit_context) && !entryexit) audit_syscall_entry(current, AUDIT_ARCH_I386, regs->orig_eax, regs->ebx, regs->ecx, regs->edx, regs->esi); - return ret; + if (ret == 0) + return 0; + + if (unlikely(current->audit_context)) + audit_syscall_exit(current, AUDITSC_RESULT(regs->eax), regs->eax); + return 1; } -- cgit v1.2.3-70-g09d2 From 1b38f0064e4e0b9ec626e39f0740b1cf2e295743 Mon Sep 17 00:00:00 2001 From: Bodo Stroesser Date: Sat, 3 Sep 2005 15:57:20 -0700 Subject: [PATCH] Uml support: add PTRACE_SYSEMU_SINGLESTEP option to i386 This patch implements the new ptrace option PTRACE_SYSEMU_SINGLESTEP, which can be used by UML to singlestep a process: it will receive SINGLESTEP interceptions for normal instructions and syscalls, but syscall execution will be skipped just like with PTRACE_SYSEMU. 
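The resulting tracer-side difference is tiny; a sketch of the stepping primitive (only the request number comes from this patch, the helper itself is hypothetical):

#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>

#ifndef PTRACE_SYSEMU_SINGLESTEP
#define PTRACE_SYSEMU_SINGLESTEP 32
#endif

static void step_one(pid_t pid)
{
	int status;

	ptrace(PTRACE_SYSEMU_SINGLESTEP, pid, NULL, 0);
	waitpid(pid, &status, 0);
	/* the stop is either an ordinary singlestep trap or a syscall
	 * entry whose execution the host has skipped, SYSEMU-style */
}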
Signed-off-by: Bodo Stroesser Signed-off-by: Paolo 'Blaisorblade' Giarrusso Cc: Jeff Dike Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/ptrace.c | 21 +++++++++++++++++---- include/linux/ptrace.h | 1 + 2 files changed, 18 insertions(+), 4 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c index 18642f05dde..3196ba50fcd 100644 --- a/arch/i386/kernel/ptrace.c +++ b/arch/i386/kernel/ptrace.c @@ -547,11 +547,17 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data) wake_up_process(child); break; + case PTRACE_SYSEMU_SINGLESTEP: /* Same as SYSEMU, but singlestep if not syscall */ case PTRACE_SINGLESTEP: /* set the trap flag. */ ret = -EIO; if (!valid_signal(data)) break; - clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); + + if (request == PTRACE_SYSEMU_SINGLESTEP) + set_tsk_thread_flag(child, TIF_SYSCALL_EMU); + else + clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); + clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); set_singlestep(child); child->exit_code = data; @@ -686,7 +692,10 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) __attribute__((regparm(3))) int do_syscall_trace(struct pt_regs *regs, int entryexit) { - int is_sysemu, is_singlestep, ret = 0; + int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU), ret = 0; + /* With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP */ + int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP); + /* do the secure computing check first */ secure_computing(regs->orig_eax); @@ -696,8 +705,11 @@ int do_syscall_trace(struct pt_regs *regs, int entryexit) if (!(current->ptrace & PT_PTRACED)) goto out; - is_sysemu = test_thread_flag(TIF_SYSCALL_EMU); - is_singlestep = test_thread_flag(TIF_SINGLESTEP); + /* If a process stops on the 1st tracepoint with SYSCALL_TRACE + * and then is resumed with SYSEMU_SINGLESTEP, it will come in + * here. We have to check this and return */ + if (is_sysemu && entryexit) + return 0; /* Fake a debug trap */ if (is_singlestep) @@ -728,6 +740,7 @@ int do_syscall_trace(struct pt_regs *regs, int entryexit) if (ret == 0) return 0; + regs->orig_eax = -1; /* force skip of syscall restarting */ if (unlikely(current->audit_context)) audit_syscall_exit(current, AUDITSC_RESULT(regs->eax), regs->eax); return 1; diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 7528afb6b2a..2afdafb6212 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -21,6 +21,7 @@ #define PTRACE_SYSCALL 24 #define PTRACE_SYSEMU 31 +#define PTRACE_SYSEMU_SINGLESTEP 32 /* 0x4200-0x4300 are reserved for architecture-independent additions. */ #define PTRACE_SETOPTIONS 0x4200 -- cgit v1.2.3-70-g09d2 From ab1c23c24471c760c573f4fb0dd78e166ddfd844 Mon Sep 17 00:00:00 2001 From: Bodo Stroesser Date: Sat, 3 Sep 2005 15:57:21 -0700 Subject: [PATCH] SYSEMU: fix sysaudit / singlestep interaction Paolo 'Blaisorblade' Giarrusso This is simply an adjustment for "Ptrace - i386: fix Syscall Audit interaction with singlestep" to work on top of SYSEMU patches, too. On this patch, I have some doubts: I wonder why we need to alter that way ptrace_disable(). I left the patch this way because it has been extensively tested, but I don't understand the reason. 
The current PTRACE_DETACH handling simply clears child->ptrace; actually this is not enough because entry.S just looks at the thread_flags; actually, do_syscall_trace checks current->ptrace but I don't think depending on that is good, at least for performance, so I think the clearing is done elsewhere. For instance, on PTRACE_CONT it's done, but doing PTRACE_DETACH without PTRACE_CONT is possible (and happens when gdb crashes and one kills it manually). Signed-off-by: Paolo 'Blaisorblade' Giarrusso CC: Roland McGrath Cc: Jeff Dike Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/ptrace.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c index 3196ba50fcd..340980203b0 100644 --- a/arch/i386/kernel/ptrace.c +++ b/arch/i386/kernel/ptrace.c @@ -271,6 +271,8 @@ static void clear_singlestep(struct task_struct *child) void ptrace_disable(struct task_struct *child) { clear_singlestep(child); + clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); } /* @@ -693,14 +695,29 @@ __attribute__((regparm(3))) int do_syscall_trace(struct pt_regs *regs, int entryexit) { int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU), ret = 0; - /* With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP */ + /* With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall + * interception. */ int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP); /* do the secure computing check first */ secure_computing(regs->orig_eax); - if (unlikely(current->audit_context) && entryexit) - audit_syscall_exit(current, AUDITSC_RESULT(regs->eax), regs->eax); + if (unlikely(current->audit_context)) { + if (entryexit) + audit_syscall_exit(current, AUDITSC_RESULT(regs->eax), regs->eax); + /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only + * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is + * not used, entry.S will call us only on syscall exit, not + * entry; so when TIF_SYSCALL_AUDIT is used we must avoid + * calling send_sigtrap() on syscall entry. + * + * Note that when PTRACE_SYSEMU_SINGLESTEP is used, + * is_singlestep is false, despite his name, so we will still do + * the correct thing. + */ + else if (is_singlestep) + goto out; + } if (!(current->ptrace & PT_PTRACED)) goto out; -- cgit v1.2.3-70-g09d2 From 640aa46e25922a00b805e6b0d0b5181ad9cf736a Mon Sep 17 00:00:00 2001 From: Paolo 'Blaisorblade' Giarrusso Date: Sat, 3 Sep 2005 15:57:22 -0700 Subject: [PATCH] uml: SYSEMU: slight cleanup and speedup As a follow-up to "UML Support - Ptrace: adds the host SYSEMU support, for UML and general usage" (i.e. uml-support-* in current mm). Avoid unconditionally jumping to work_pending and code copying, just reuse the already existing resume_userspace path. One interesting note, from Charles P. Wright, suggested that the API is improvable with no downsides for UML (except that it will have to support yet another host API, since dropping support for the current API, for UML, is not reasonable from users' point of view). Signed-off-by: Paolo 'Blaisorblade' Giarrusso CC: Charles P. 
Wright Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/entry.S | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 9a47723469c..abb909793ef 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -339,18 +339,12 @@ syscall_trace_entry: xorl %edx,%edx call do_syscall_trace cmpl $0, %eax - jne syscall_skip # ret != 0 -> running under PTRACE_SYSEMU, + jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU, # so must skip actual syscall movl ORIG_EAX(%esp), %eax cmpl $(nr_syscalls), %eax jnae syscall_call jmp syscall_exit -syscall_skip: - cli # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret - movl TI_flags(%ebp), %ecx - jmp work_pending # perform syscall exit tracing ALIGN -- cgit v1.2.3-70-g09d2 From 54d5d42404e7705cf3804593189e963350d470e5 Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Tue, 6 Sep 2005 15:16:15 -0700 Subject: [PATCH] x86/x86_64: deferred handling of writes to /proc/irqxx/smp_affinity When handling writes to /proc/irq, current code is re-programming rte entries directly. This is not recommended and could potentially cause chipsets to lock up, or cause missing interrupts. CONFIG_IRQBALANCE does this correctly, where it re-programs only when the interrupt is pending. The same needs to be done for /proc/irq handling as well. Otherwise user space irq balancers are really not doing the right thing. - Changed pending_irq_balance_cpumask to pending_irq_migrate_cpumask for lack of a generic name. - moved move_irq out of CONFIG_IRQBALANCE, and added the same to X86_64 - Added new proc handler for write, so we can do deferred write at irq handling time. - Display of /proc/irq/XX/smp_affinity used to display CPU_MASK_ALL, instead it now shows only active cpu masks, or exactly what was set. - Provided a common move_irq implementation, instead of duplicating it when using the generic irq framework. Tested on i386/x86_64 and ia64 with CONFIG_PCI_MSI turned on and off. Tested UP builds as well. MSI testing: tbd: I have cards, need to look for an x-over cable, although I did test an earlier version of this patch. Will test in a couple days. 
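In outline, the deferred-write scheme works like this (a minimal sketch of the pattern only, with hypothetical names such as reprogram_rte(); the real helpers are set_pending_irq() and move_native_irq() in the include/linux/irq.h hunk below):

	/*
	 * Illustrative sketch, not the kernel code: the /proc write only
	 * records the requested mask, and the RTE is reprogrammed later,
	 * from the ack/end path, while the interrupt is known pending.
	 */
	static cpumask_t pending_mask;		/* requested via /proc write */
	static int move_pending;

	static void proc_write_smp_affinity(cpumask_t mask)
	{
		pending_mask = mask;		/* defer: no io_apic write here */
		move_pending = 1;
	}

	static void on_irq_ack(unsigned int irq)
	{
		if (move_pending) {		/* safe point: irq is being serviced */
			move_pending = 0;
			reprogram_rte(irq, pending_mask);	/* hypothetical helper */
		}
	}
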
Signed-off-by: Ashok Raj Acked-by: Zwane Mwaikambo Grudgingly-acked-by: Andi Kleen Signed-off-by: Coywolf Qi Hunt Signed-off-by: Ashok Raj Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/Kconfig | 5 ++ arch/i386/kernel/io_apic.c | 55 ++++++++++--------- arch/ia64/Kconfig | 5 ++ arch/ia64/kernel/irq.c | 39 +------------- arch/x86_64/Kconfig | 5 ++ arch/x86_64/kernel/io_apic.c | 102 ++++++++++++++++++++++------------- drivers/pci/msi.c | 17 ++---- drivers/pci/msi.h | 5 -- include/asm-ia64/hw_irq.h | 7 --- include/asm-ia64/irq.h | 6 --- include/linux/irq.h | 123 +++++++++++++++++++++++++++++++++++++++++++ kernel/irq/manage.c | 4 ++ kernel/irq/proc.c | 14 ++++- 13 files changed, 253 insertions(+), 134 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 3b3b017e1c1..4b7de3e1e57 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -1318,6 +1318,11 @@ config GENERIC_IRQ_PROBE bool default y +config GENERIC_PENDING_IRQ + bool + depends on GENERIC_HARDIRQS && SMP + default y + config X86_SMP bool depends on SMP && !X86_VOYAGER diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 6578f40bd50..4a594043157 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -33,6 +33,7 @@ #include #include #include + #include #include #include @@ -222,13 +223,21 @@ static void clear_IO_APIC (void) clear_IO_APIC_pin(apic, pin); } +#ifdef CONFIG_SMP static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) { unsigned long flags; int pin; struct irq_pin_list *entry = irq_2_pin + irq; unsigned int apicid_value; + cpumask_t tmp; + cpus_and(tmp, cpumask, cpu_online_map); + if (cpus_empty(tmp)) + tmp = TARGET_CPUS; + + cpus_and(cpumask, tmp, CPU_MASK_ALL); + apicid_value = cpu_mask_to_apicid(cpumask); /* Prepare to do the io_apic_write */ apicid_value = apicid_value << 24; @@ -242,6 +251,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) break; entry = irq_2_pin + entry->next; } + set_irq_info(irq, cpumask); spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -259,7 +269,6 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) # define Dprintk(x...) # endif -cpumask_t __cacheline_aligned pending_irq_balance_cpumask[NR_IRQS]; #define IRQBALANCE_CHECK_ARCH -999 static int irqbalance_disabled = IRQBALANCE_CHECK_ARCH; @@ -328,12 +337,7 @@ static inline void balance_irq(int cpu, int irq) cpus_and(allowed_mask, cpu_online_map, irq_affinity[irq]); new_cpu = move(cpu, allowed_mask, now, 1); if (cpu != new_cpu) { - irq_desc_t *desc = irq_desc + irq; - unsigned long flags; - - spin_lock_irqsave(&desc->lock, flags); - pending_irq_balance_cpumask[irq] = cpumask_of_cpu(new_cpu); - spin_unlock_irqrestore(&desc->lock, flags); + set_pending_irq(irq, cpumask_of_cpu(new_cpu)); } } @@ -528,16 +532,12 @@ tryanotherirq: cpus_and(tmp, target_cpu_mask, allowed_mask); if (!cpus_empty(tmp)) { - irq_desc_t *desc = irq_desc + selected_irq; - unsigned long flags; Dprintk("irq = %d moved to cpu = %d\n", selected_irq, min_loaded); /* mark for change destination */ - spin_lock_irqsave(&desc->lock, flags); - pending_irq_balance_cpumask[selected_irq] = - cpumask_of_cpu(min_loaded); - spin_unlock_irqrestore(&desc->lock, flags); + set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded)); + /* Since we made a change, come back sooner to * check for more variation. */ @@ -568,7 +568,8 @@ static int balanced_irq(void *unused) /* push everything to CPU 0 to give us a starting point. 
*/ for (i = 0 ; i < NR_IRQS ; i++) { - pending_irq_balance_cpumask[i] = cpumask_of_cpu(0); + pending_irq_cpumask[i] = cpumask_of_cpu(0); + set_pending_irq(i, cpumask_of_cpu(0)); } for ( ; ; ) { @@ -647,20 +648,9 @@ int __init irqbalance_disable(char *str) __setup("noirqbalance", irqbalance_disable); -static inline void move_irq(int irq) -{ - /* note - we hold the desc->lock */ - if (unlikely(!cpus_empty(pending_irq_balance_cpumask[irq]))) { - set_ioapic_affinity_irq(irq, pending_irq_balance_cpumask[irq]); - cpus_clear(pending_irq_balance_cpumask[irq]); - } -} - late_initcall(balanced_irq_init); - -#else /* !CONFIG_IRQBALANCE */ -static inline void move_irq(int irq) { } #endif /* CONFIG_IRQBALANCE */ +#endif /* CONFIG_SMP */ #ifndef CONFIG_SMP void fastcall send_IPI_self(int vector) @@ -820,6 +810,7 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); * we need to reprogram the ioredtbls to cater for the cpus which have come online * so mask in all cases should simply be TARGET_CPUS */ +#ifdef CONFIG_SMP void __init setup_ioapic_dest(void) { int pin, ioapic, irq, irq_entry; @@ -838,6 +829,7 @@ void __init setup_ioapic_dest(void) } } +#endif /* * EISA Edge/Level control register, ELCR @@ -1249,6 +1241,7 @@ static void __init setup_IO_APIC_irqs(void) spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); + set_native_irq_info(irq, TARGET_CPUS); spin_unlock_irqrestore(&ioapic_lock, flags); } } @@ -1944,6 +1937,7 @@ static void ack_edge_ioapic_vector(unsigned int vector) { int irq = vector_to_irq(vector); + move_irq(vector); ack_edge_ioapic_irq(irq); } @@ -1958,6 +1952,7 @@ static void end_level_ioapic_vector (unsigned int vector) { int irq = vector_to_irq(vector); + move_irq(vector); end_level_ioapic_irq(irq); } @@ -1975,14 +1970,17 @@ static void unmask_IO_APIC_vector (unsigned int vector) unmask_IO_APIC_irq(irq); } +#ifdef CONFIG_SMP static void set_ioapic_affinity_vector (unsigned int vector, cpumask_t cpu_mask) { int irq = vector_to_irq(vector); + set_native_irq_info(vector, cpu_mask); set_ioapic_affinity_irq(irq, cpu_mask); } #endif +#endif /* * Level and edge triggered IO-APIC interrupts need different handling, @@ -2000,7 +1998,9 @@ static struct hw_interrupt_type ioapic_edge_type = { .disable = disable_edge_ioapic, .ack = ack_edge_ioapic, .end = end_edge_ioapic, +#ifdef CONFIG_SMP .set_affinity = set_ioapic_affinity, +#endif }; static struct hw_interrupt_type ioapic_level_type = { @@ -2011,7 +2011,9 @@ static struct hw_interrupt_type ioapic_level_type = { .disable = disable_level_ioapic, .ack = mask_and_ack_level_ioapic, .end = end_level_ioapic, +#ifdef CONFIG_SMP .set_affinity = set_ioapic_affinity, +#endif }; static inline void init_IO_APIC_traps(void) @@ -2569,6 +2571,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1)); io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0)); + set_native_irq_info(use_pci_vector() ? 
entry.vector : irq, TARGET_CPUS); spin_unlock_irqrestore(&ioapic_lock, flags); return 0; diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 3deced637f0..17b5dbf8c31 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -434,6 +434,11 @@ config GENERIC_IRQ_PROBE bool default y +config GENERIC_PENDING_IRQ + bool + depends on GENERIC_HARDIRQS && SMP + default y + source "arch/ia64/hp/sim/Kconfig" source "arch/ia64/oprofile/Kconfig" diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c index 28f2aadc38d..205d9802826 100644 --- a/arch/ia64/kernel/irq.c +++ b/arch/ia64/kernel/irq.c @@ -91,23 +91,8 @@ skip: } #ifdef CONFIG_SMP -/* - * This is updated when the user sets irq affinity via /proc - */ -static cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS]; -static unsigned long pending_irq_redir[BITS_TO_LONGS(NR_IRQS)]; - static char irq_redir [NR_IRQS]; // = { [0 ... NR_IRQS-1] = 1 }; -/* - * Arch specific routine for deferred write to iosapic rte to reprogram - * intr destination. - */ -void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) -{ - pending_irq_cpumask[irq] = mask_val; -} - void set_irq_affinity_info (unsigned int irq, int hwid, int redir) { cpumask_t mask = CPU_MASK_NONE; @@ -116,32 +101,10 @@ void set_irq_affinity_info (unsigned int irq, int hwid, int redir) if (irq < NR_IRQS) { irq_affinity[irq] = mask; + set_irq_info(irq, mask); irq_redir[irq] = (char) (redir & 0xff); } } - - -void move_irq(int irq) -{ - /* note - we hold desc->lock */ - cpumask_t tmp; - irq_desc_t *desc = irq_descp(irq); - int redir = test_bit(irq, pending_irq_redir); - - if (unlikely(!desc->handler->set_affinity)) - return; - - if (!cpus_empty(pending_irq_cpumask[irq])) { - cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map); - if (unlikely(!cpus_empty(tmp))) { - desc->handler->set_affinity(irq | (redir ? IA64_IRQ_REDIRECTED : 0), - pending_irq_cpumask[irq]); - } - cpus_clear(pending_irq_cpumask[irq]); - } -} - - #endif /* CONFIG_SMP */ #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 75e52c57f19..251ce7cf1a3 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -441,6 +441,11 @@ config ISA_DMA_API bool default y +config GENERIC_PENDING_IRQ + bool + depends on GENERIC_HARDIRQS && SMP + default y + menu "Power management options" source kernel/power/Kconfig diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index d206d7e49cf..76bcc4e6979 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -78,6 +78,54 @@ int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1}; #define vector_to_irq(vector) (vector) #endif +#define __DO_ACTION(R, ACTION, FINAL) \ + \ +{ \ + int pin; \ + struct irq_pin_list *entry = irq_2_pin + irq; \ + \ + for (;;) { \ + unsigned int reg; \ + pin = entry->pin; \ + if (pin == -1) \ + break; \ + reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ + reg ACTION; \ + io_apic_modify(entry->apic, reg); \ + if (!entry->next) \ + break; \ + entry = irq_2_pin + entry->next; \ + } \ + FINAL; \ +} + +#ifdef CONFIG_SMP +static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) +{ + unsigned long flags; + unsigned int dest; + cpumask_t tmp; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + tmp = TARGET_CPUS; + + cpus_and(mask, tmp, CPU_MASK_ALL); + + dest = cpu_mask_to_apicid(mask); + + /* + * Only the high 8 bits are valid. 
+ */ + dest = SET_APIC_LOGICAL_ID(dest); + + spin_lock_irqsave(&ioapic_lock, flags); + __DO_ACTION(1, = dest, ) + set_irq_info(irq, mask); + spin_unlock_irqrestore(&ioapic_lock, flags); +} +#endif + /* * The common case is 1:1 IRQ<->pin mappings. Sometimes there are * shared ISA-space IRQs, so we have to support them. We are super @@ -101,26 +149,6 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin) entry->pin = pin; } -#define __DO_ACTION(R, ACTION, FINAL) \ - \ -{ \ - int pin; \ - struct irq_pin_list *entry = irq_2_pin + irq; \ - \ - for (;;) { \ - unsigned int reg; \ - pin = entry->pin; \ - if (pin == -1) \ - break; \ - reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ - reg ACTION; \ - io_apic_modify(entry->apic, reg); \ - if (!entry->next) \ - break; \ - entry = irq_2_pin + entry->next; \ - } \ - FINAL; \ -} #define DO_ACTION(name,R,ACTION, FINAL) \ \ @@ -767,6 +795,7 @@ static void __init setup_IO_APIC_irqs(void) spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); + set_native_irq_info(irq, TARGET_CPUS); spin_unlock_irqrestore(&ioapic_lock, flags); } } @@ -1314,6 +1343,7 @@ static unsigned int startup_edge_ioapic_irq(unsigned int irq) */ static void ack_edge_ioapic_irq(unsigned int irq) { + move_irq(irq); if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED)) == (IRQ_PENDING | IRQ_DISABLED)) mask_IO_APIC_irq(irq); @@ -1343,26 +1373,10 @@ static unsigned int startup_level_ioapic_irq (unsigned int irq) static void end_level_ioapic_irq (unsigned int irq) { + move_irq(irq); ack_APIC_irq(); } -static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) -{ - unsigned long flags; - unsigned int dest; - - dest = cpu_mask_to_apicid(mask); - - /* - * Only the high 8 bits are valid. 
- */ - dest = SET_APIC_LOGICAL_ID(dest); - - spin_lock_irqsave(&ioapic_lock, flags); - __DO_ACTION(1, = dest, ) - spin_unlock_irqrestore(&ioapic_lock, flags); -} - #ifdef CONFIG_PCI_MSI static unsigned int startup_edge_ioapic_vector(unsigned int vector) { @@ -1375,6 +1389,7 @@ static void ack_edge_ioapic_vector(unsigned int vector) { int irq = vector_to_irq(vector); + move_native_irq(vector); ack_edge_ioapic_irq(irq); } @@ -1389,6 +1404,7 @@ static void end_level_ioapic_vector (unsigned int vector) { int irq = vector_to_irq(vector); + move_native_irq(vector); end_level_ioapic_irq(irq); } @@ -1406,14 +1422,17 @@ static void unmask_IO_APIC_vector (unsigned int vector) unmask_IO_APIC_irq(irq); } +#ifdef CONFIG_SMP static void set_ioapic_affinity_vector (unsigned int vector, cpumask_t cpu_mask) { int irq = vector_to_irq(vector); + set_native_irq_info(vector, cpu_mask); set_ioapic_affinity_irq(irq, cpu_mask); } -#endif +#endif // CONFIG_SMP +#endif // CONFIG_PCI_MSI /* * Level and edge triggered IO-APIC interrupts need different handling, @@ -1432,7 +1451,9 @@ static struct hw_interrupt_type ioapic_edge_type = { .disable = disable_edge_ioapic, .ack = ack_edge_ioapic, .end = end_edge_ioapic, +#ifdef CONFIG_SMP .set_affinity = set_ioapic_affinity, +#endif }; static struct hw_interrupt_type ioapic_level_type = { @@ -1443,7 +1464,9 @@ static struct hw_interrupt_type ioapic_level_type = { .disable = disable_level_ioapic, .ack = mask_and_ack_level_ioapic, .end = end_level_ioapic, +#ifdef CONFIG_SMP .set_affinity = set_ioapic_affinity, +#endif }; static inline void init_IO_APIC_traps(void) @@ -1918,6 +1941,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1)); io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0)); + set_native_irq_info(use_pci_vector() ? 
entry.vector : irq, TARGET_CPUS); spin_unlock_irqrestore(&ioapic_lock, flags); return 0; @@ -1931,6 +1955,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a * we need to reprogram the ioredtbls to cater for the cpus which have come online * so mask in all cases should simply be TARGET_CPUS */ +#ifdef CONFIG_SMP void __init setup_ioapic_dest(void) { int pin, ioapic, irq, irq_entry; @@ -1949,3 +1974,4 @@ void __init setup_ioapic_dest(void) } } +#endif diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 2b85aa39f95..532f73bb222 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -91,6 +91,7 @@ static void set_msi_affinity(unsigned int vector, cpumask_t cpu_mask) { struct msi_desc *entry; struct msg_address address; + unsigned int irq = vector; entry = (struct msi_desc *)msi_desc[vector]; if (!entry || !entry->dev) @@ -112,6 +113,7 @@ static void set_msi_affinity(unsigned int vector, cpumask_t cpu_mask) entry->msi_attrib.current_cpu = cpu_mask_to_apicid(cpu_mask); pci_write_config_dword(entry->dev, msi_lower_address_reg(pos), address.lo_address.value); + set_native_irq_info(irq, cpu_mask); break; } case PCI_CAP_ID_MSIX: @@ -125,22 +127,13 @@ static void set_msi_affinity(unsigned int vector, cpumask_t cpu_mask) MSI_TARGET_CPU_SHIFT); entry->msi_attrib.current_cpu = cpu_mask_to_apicid(cpu_mask); writel(address.lo_address.value, entry->mask_base + offset); + set_native_irq_info(irq, cpu_mask); break; } default: break; } } - -#ifdef CONFIG_IRQBALANCE -static inline void move_msi(int vector) -{ - if (!cpus_empty(pending_irq_balance_cpumask[vector])) { - set_msi_affinity(vector, pending_irq_balance_cpumask[vector]); - cpus_clear(pending_irq_balance_cpumask[vector]); - } -} -#endif /* CONFIG_IRQBALANCE */ #endif /* CONFIG_SMP */ static void mask_MSI_irq(unsigned int vector) @@ -191,13 +184,13 @@ static void shutdown_msi_irq(unsigned int vector) static void end_msi_irq_wo_maskbit(unsigned int vector) { - move_msi(vector); + move_native_irq(vector); ack_APIC_irq(); } static void end_msi_irq_w_maskbit(unsigned int vector) { - move_msi(vector); + move_native_irq(vector); unmask_MSI_irq(vector); ack_APIC_irq(); } diff --git a/drivers/pci/msi.h b/drivers/pci/msi.h index 390f1851c0f..402136a5c9e 100644 --- a/drivers/pci/msi.h +++ b/drivers/pci/msi.h @@ -19,7 +19,6 @@ #define NR_HP_RESERVED_VECTORS 20 extern int vector_irq[NR_VECTORS]; -extern cpumask_t pending_irq_balance_cpumask[NR_IRQS]; extern void (*interrupt[NR_IRQS])(void); extern int pci_vector_resources(int last, int nr_released); @@ -29,10 +28,6 @@ extern int pci_vector_resources(int last, int nr_released); #define set_msi_irq_affinity NULL #endif -#ifndef CONFIG_IRQBALANCE -static inline void move_msi(int vector) {} -#endif - /* * MSI-X Address Register */ diff --git a/include/asm-ia64/hw_irq.h b/include/asm-ia64/hw_irq.h index 041ab8c51a6..0cf119b42f7 100644 --- a/include/asm-ia64/hw_irq.h +++ b/include/asm-ia64/hw_irq.h @@ -116,13 +116,6 @@ __ia64_local_vector_to_irq (ia64_vector vec) * and to obtain the irq descriptor for a given irq number. */ -/* Return a pointer to the irq descriptor for IRQ. */ -static inline irq_desc_t * -irq_descp (int irq) -{ - return irq_desc + irq; -} - /* Extract the IA-64 vector that corresponds to IRQ. 
*/ static inline ia64_vector irq_to_vector (int irq) diff --git a/include/asm-ia64/irq.h b/include/asm-ia64/irq.h index bd07d11d9f3..5d930fdc0be 100644 --- a/include/asm-ia64/irq.h +++ b/include/asm-ia64/irq.h @@ -30,12 +30,6 @@ extern void disable_irq_nosync (unsigned int); extern void enable_irq (unsigned int); extern void set_irq_affinity_info (unsigned int irq, int dest, int redir); -#ifdef CONFIG_SMP -extern void move_irq(int irq); -#else -#define move_irq(irq) -#endif - struct irqaction; struct pt_regs; int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *); diff --git a/include/linux/irq.h b/include/linux/irq.h index 069d3b84d31..4a362b9ec96 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -71,16 +71,139 @@ typedef struct irq_desc { unsigned int irq_count; /* For detecting broken interrupts */ unsigned int irqs_unhandled; spinlock_t lock; +#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE) + unsigned int move_irq; /* Flag need to re-target intr dest*/ +#endif } ____cacheline_aligned irq_desc_t; extern irq_desc_t irq_desc [NR_IRQS]; +/* Return a pointer to the irq descriptor for IRQ. */ +static inline irq_desc_t * +irq_descp (int irq) +{ + return irq_desc + irq; +} + #include /* the arch dependent stuff */ extern int setup_irq(unsigned int irq, struct irqaction * new); #ifdef CONFIG_GENERIC_HARDIRQS extern cpumask_t irq_affinity[NR_IRQS]; + +#ifdef CONFIG_SMP +static inline void set_native_irq_info(int irq, cpumask_t mask) +{ + irq_affinity[irq] = mask; +} +#else +static inline void set_native_irq_info(int irq, cpumask_t mask) +{ +} +#endif + +#ifdef CONFIG_SMP + +#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE) +extern cpumask_t pending_irq_cpumask[NR_IRQS]; + +static inline void set_pending_irq(unsigned int irq, cpumask_t mask) +{ + irq_desc_t *desc = irq_desc + irq; + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + desc->move_irq = 1; + pending_irq_cpumask[irq] = mask; + spin_unlock_irqrestore(&desc->lock, flags); +} + +static inline void +move_native_irq(int irq) +{ + cpumask_t tmp; + irq_desc_t *desc = irq_descp(irq); + + if (likely (!desc->move_irq)) + return; + + desc->move_irq = 0; + + if (likely(cpus_empty(pending_irq_cpumask[irq]))) + return; + + if (!desc->handler->set_affinity) + return; + + /* note - we hold the desc->lock */ + cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map); + + /* + * If there was a valid mask to work with, please + * do the disable, re-program, enable sequence. + * This is *not* particularly important for level triggered + * but in an edge trigger case, we might be setting rte + * when an active trigger is coming in. This could + * cause some ioapics to malfunction. + * Being paranoid, I guess! + */ + if (unlikely(!cpus_empty(tmp))) { + desc->handler->disable(irq); + desc->handler->set_affinity(irq, tmp); + desc->handler->enable(irq); + } + cpus_clear(pending_irq_cpumask[irq]); +} + +#ifdef CONFIG_PCI_MSI +/* + * Wonder why these are dummies? + * For example, set_ioapic_affinity_vector() calls the set_ioapic_affinity_irq() + * counterpart after translating the vector to irq info. We need to perform + * this operation on the real irq, when we don't use vectors, i.e. when + * pci_use_vector() is false. 
+ */ +static inline void move_irq(int irq) +{ +} + +static inline void set_irq_info(int irq, cpumask_t mask) +{ +} + +#else // CONFIG_PCI_MSI + +static inline void move_irq(int irq) +{ + move_native_irq(irq); +} + +static inline void set_irq_info(int irq, cpumask_t mask) +{ + set_native_irq_info(irq, mask); +} +#endif // CONFIG_PCI_MSI + +#else // CONFIG_GENERIC_PENDING_IRQ || CONFIG_IRQBALANCE + +#define move_irq(x) +#define move_native_irq(x) +#define set_pending_irq(x,y) +static inline void set_irq_info(int irq, cpumask_t mask) +{ + set_native_irq_info(irq, mask); +} + +#endif // CONFIG_GENERIC_PENDING_IRQ + +#else // CONFIG_SMP + +#define move_irq(x) +#define move_native_irq(x) + +#endif // CONFIG_SMP + extern int no_irq_affinity; extern int noirqdebug_setup(char *str); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ac670098570..1cfdb08ddf2 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -18,6 +18,10 @@ cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL }; +#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE) +cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS]; +#endif + /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) * diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 85d08daa660..f26e534c658 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -19,12 +19,22 @@ static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; */ static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; -void __attribute__((weak)) -proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) +#ifdef CONFIG_GENERIC_PENDING_IRQ +void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) +{ + /* + * Save these away for later use. Re-progam when the + * interrupt is pending + */ + set_pending_irq(irq, mask_val); +} +#else +void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) { irq_affinity[irq] = mask_val; irq_desc[irq].handler->set_affinity(irq, mask_val); } +#endif static int irq_affinity_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) -- cgit v1.2.3-70-g09d2 From a888cebe17e39476e5ca18c3a4bd96c6775070db Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Tue, 6 Sep 2005 15:16:19 -0700 Subject: [PATCH] x86_64: create sysfs entries for cpu only for present cpus Need to create sysfs only for cpus that are present. Without which we see NR_CPUS entries created when we have CONFIG_HOTPLUG and CONFIG_HOTPLUG_CPU enabled. Signed-off-by: Ashok Raj Acked-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/mach-default/topology.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/mach-default/topology.c b/arch/i386/mach-default/topology.c index 23395fff35d..b64314069e7 100644 --- a/arch/i386/mach-default/topology.c +++ b/arch/i386/mach-default/topology.c @@ -76,7 +76,7 @@ static int __init topology_init(void) for_each_online_node(i) arch_register_node(i); - for_each_cpu(i) + for_each_present_cpu(i) arch_register_cpu(i); return 0; } @@ -87,7 +87,7 @@ static int __init topology_init(void) { int i; - for_each_cpu(i) + for_each_present_cpu(i) arch_register_cpu(i); return 0; } -- cgit v1.2.3-70-g09d2 From 8446f1d391f3d27e6bf9c43d4cbcdac0ca720417 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 6 Sep 2005 15:16:27 -0700 Subject: [PATCH] detect soft lockups This patch adds a new kernel debug feature: CONFIG_DETECT_SOFTLOCKUP. 
When enabled, per-CPU watchdog threads are started, which try to run once per second. If they get delayed for more than 10 seconds, a callback from the timer interrupt detects this condition and prints out a warning message and a stack dump (once per lockup incident). The feature is otherwise non-intrusive: it doesn't try to unlock the box in any way; it only gets the debug info out, automatically, on all CPUs affected by the lockup. Signed-off-by: Ingo Molnar Signed-off-by: Nishanth Aravamudan Signed-Off-By: Matthias Urlichs Signed-off-by: Richard Purdie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/nmi.c | 5 ++ arch/i386/kernel/time.c | 1 + arch/x86_64/kernel/nmi.c | 2 + arch/x86_64/kernel/time.c | 1 + drivers/mtd/nand/nand_base.c | 1 + include/linux/sched.h | 17 +++++ init/main.c | 1 + kernel/Makefile | 1 + kernel/power/swsusp.c | 1 + kernel/softlockup.c | 151 +++++++++++++++++++++++++++++++++++++++++++ kernel/timer.c | 1 + lib/Kconfig.debug | 19 ++++++ 12 files changed, 201 insertions(+) create mode 100644 kernel/softlockup.c (limited to 'arch/i386') diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 8bbdbda07a2..0178457db72 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -478,6 +478,11 @@ void touch_nmi_watchdog (void) */ for (i = 0; i < NR_CPUS; i++) alert_counter[i] = 0; + + /* + * Tickle the softlockup detector too: + */ + touch_softlockup_watchdog(); } extern void die_nmi(struct pt_regs *, const char *msg); diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index 6f794a78ee1..b0c5ee2b344 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -422,6 +422,7 @@ static int timer_resume(struct sys_device *dev) last_timer->resume(); cur_timer = last_timer; last_timer = NULL; + touch_softlockup_watchdog(); return 0; } diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 64a8e05d581..84cae81fff8 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -463,6 +463,8 @@ void touch_nmi_watchdog (void) */ for (i = 0; i < NR_CPUS; i++) per_cpu(nmi_touch, i) = 1; + + touch_softlockup_watchdog(); } void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 66bf6ddeb0c..2b5d9da912a 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -1041,6 +1041,7 @@ static int timer_resume(struct sys_device *dev) write_sequnlock_irqrestore(&xtime_lock,flags); jiffies += sleep_length; wall_jiffies += sleep_length; + touch_softlockup_watchdog(); return 0; } diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index eee5115658c..04e54318bc6 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -526,6 +526,7 @@ static void nand_wait_ready(struct mtd_info *mtd) do { if (this->dev_ready(mtd)) return; + touch_softlockup_watchdog(); } while (time_before(jiffies, timeo)); } diff --git a/include/linux/sched.h b/include/linux/sched.h index dec5827c774..5fb31bede10 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -176,6 +176,23 @@ extern void trap_init(void); extern void update_process_times(int user); extern void scheduler_tick(void); +#ifdef CONFIG_DETECT_SOFTLOCKUP +extern void softlockup_tick(struct pt_regs *regs); +extern void spawn_softlockup_task(void); +extern void touch_softlockup_watchdog(void); +#else +static inline void softlockup_tick(struct pt_regs *regs) +{ +} +static inline void spawn_softlockup_task(void) +{ 
+} +static inline void touch_softlockup_watchdog(void) +{ +} +#endif + + /* Attach to any functions which should be ignored in wchan output. */ #define __sched __attribute__((__section__(".sched.text"))) /* Is this address in the __sched functions? */ diff --git a/init/main.c b/init/main.c index ff410063e4e..a29fb2ac724 100644 --- a/init/main.c +++ b/init/main.c @@ -614,6 +614,7 @@ static void do_pre_smp_initcalls(void) migration_init(); #endif spawn_ksoftirqd(); + spawn_softlockup_task(); } static void run_init_process(char *init_filename) diff --git a/kernel/Makefile b/kernel/Makefile index cb05cd05d23..8d57a2f1226 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -27,6 +27,7 @@ obj-$(CONFIG_AUDIT) += audit.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_SYSFS) += ksysfs.o +obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_SECCOMP) += seccomp.o diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index eaacd5cb588..d967e875ee8 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -1059,6 +1059,7 @@ int swsusp_resume(void) BUG_ON(!error); restore_processor_state(); restore_highmem(); + touch_softlockup_watchdog(); device_power_up(); local_irq_enable(); return error; diff --git a/kernel/softlockup.c b/kernel/softlockup.c new file mode 100644 index 00000000000..75976209cea --- /dev/null +++ b/kernel/softlockup.c @@ -0,0 +1,151 @@ +/* + * Detect Soft Lockups + * + * started by Ingo Molnar, (C) 2005, Red Hat + * + * this code detects soft lockups: incidents in where on a CPU + * the kernel does not reschedule for 10 seconds or more. + */ + +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_SPINLOCK(print_lock); + +static DEFINE_PER_CPU(unsigned long, timestamp) = 0; +static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0; +static DEFINE_PER_CPU(struct task_struct *, watchdog_task); + +static int did_panic = 0; +static int softlock_panic(struct notifier_block *this, unsigned long event, + void *ptr) +{ + did_panic = 1; + + return NOTIFY_DONE; +} + +static struct notifier_block panic_block = { + .notifier_call = softlock_panic, +}; + +void touch_softlockup_watchdog(void) +{ + per_cpu(timestamp, raw_smp_processor_id()) = jiffies; +} +EXPORT_SYMBOL(touch_softlockup_watchdog); + +/* + * This callback runs from the timer interrupt, and checks + * whether the watchdog thread has hung or not: + */ +void softlockup_tick(struct pt_regs *regs) +{ + int this_cpu = smp_processor_id(); + unsigned long timestamp = per_cpu(timestamp, this_cpu); + + if (per_cpu(print_timestamp, this_cpu) == timestamp) + return; + + /* Do not cause a second panic when there already was one */ + if (did_panic) + return; + + if (time_after(jiffies, timestamp + 10*HZ)) { + per_cpu(print_timestamp, this_cpu) = timestamp; + + spin_lock(&print_lock); + printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n", + this_cpu); + show_regs(regs); + spin_unlock(&print_lock); + } +} + +/* + * The watchdog thread - runs every second and touches the timestamp. 
+ */ +static int watchdog(void * __bind_cpu) +{ + struct sched_param param = { .sched_priority = 99 }; + int this_cpu = (long) __bind_cpu; + + printk("softlockup thread %d started up.\n", this_cpu); + + sched_setscheduler(current, SCHED_FIFO, ¶m); + current->flags |= PF_NOFREEZE; + + set_current_state(TASK_INTERRUPTIBLE); + + /* + * Run briefly once per second - if this gets delayed for + * more than 10 seconds then the debug-printout triggers + * in softlockup_tick(): + */ + while (!kthread_should_stop()) { + msleep_interruptible(1000); + touch_softlockup_watchdog(); + } + __set_current_state(TASK_RUNNING); + + return 0; +} + +/* + * Create/destroy watchdog threads as CPUs come and go: + */ +static int __devinit +cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + int hotcpu = (unsigned long)hcpu; + struct task_struct *p; + + switch (action) { + case CPU_UP_PREPARE: + BUG_ON(per_cpu(watchdog_task, hotcpu)); + p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); + if (IS_ERR(p)) { + printk("watchdog for %i failed\n", hotcpu); + return NOTIFY_BAD; + } + per_cpu(watchdog_task, hotcpu) = p; + kthread_bind(p, hotcpu); + break; + case CPU_ONLINE: + + wake_up_process(per_cpu(watchdog_task, hotcpu)); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + /* Unbind so it can run. Fall thru. */ + kthread_bind(per_cpu(watchdog_task, hotcpu), smp_processor_id()); + case CPU_DEAD: + p = per_cpu(watchdog_task, hotcpu); + per_cpu(watchdog_task, hotcpu) = NULL; + kthread_stop(p); + break; +#endif /* CONFIG_HOTPLUG_CPU */ + } + return NOTIFY_OK; +} + +static struct notifier_block __devinitdata cpu_nfb = { + .notifier_call = cpu_callback +}; + +__init void spawn_softlockup_task(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + + cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); + register_cpu_notifier(&cpu_nfb); + + notifier_chain_register(&panic_notifier_list, &panic_block); +} + diff --git a/kernel/timer.c b/kernel/timer.c index 5377f40723f..1433d87f46b 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -950,6 +950,7 @@ void do_timer(struct pt_regs *regs) { jiffies_64++; update_times(); + softlockup_tick(regs); } #ifdef __ARCH_WANT_SYS_ALARM diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 299f7f3b5b0..3754c9a8f5c 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -46,6 +46,25 @@ config LOG_BUF_SHIFT 13 => 8 KB 12 => 4 KB +config DETECT_SOFTLOCKUP + bool "Detect Soft Lockups" + depends on DEBUG_KERNEL + default y + help + Say Y here to enable the kernel to detect "soft lockups", + which are bugs that cause the kernel to loop in kernel + mode for more than 10 seconds, without giving other tasks a + chance to run. + + When a soft-lockup is detected, the kernel will print the + current stack trace (which you should report), but the + system will stay locked up. This feature has negligible + overhead. + + (Note that "hard lockups" are separate type of bugs that + can be detected via the NMI-watchdog, on platforms that + support it.) + config SCHEDSTATS bool "Collect scheduler statistics" depends on DEBUG_KERNEL && PROC_FS -- cgit v1.2.3-70-g09d2 From c3d8c1414573be8cf7c8fdc1e076935697c7f6af Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 6 Sep 2005 15:16:33 -0700 Subject: [PATCH] More __read_mostly variables Move some more frequently read variables that showed up during some of our performance tests as sometimes ending up in hot cachelines to the read_mostly section. 
Fix: Move the __read_mostly from before hpet_usec_quotient to follow the variable like the other uses of __read_mostly. Signed-off-by: Alok N Kataria Signed-off-by: Christoph Lameter Signed-off-by: Shai Fultheim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/setup.c | 2 +- arch/i386/kernel/timers/timer_hpet.c | 2 +- mm/mmap.c | 2 +- mm/page_alloc.c | 8 ++++---- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 294bcca985a..e29fd5aeaf8 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -82,7 +82,7 @@ EXPORT_SYMBOL(efi_enabled); /* cpu data as detected by the assembly code in head.S */ struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; /* common cpu data for all cpus */ -struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; +struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; EXPORT_SYMBOL(boot_cpu_data); unsigned long mmu_cr4_features; diff --git a/arch/i386/kernel/timers/timer_hpet.c b/arch/i386/kernel/timers/timer_hpet.c index 001de97c9e4..6dbb29f834e 100644 --- a/arch/i386/kernel/timers/timer_hpet.c +++ b/arch/i386/kernel/timers/timer_hpet.c @@ -18,7 +18,7 @@ #include "mach_timer.h" #include -static unsigned long __read_mostly hpet_usec_quotient; /* convert hpet clks to usec */ +static unsigned long hpet_usec_quotient __read_mostly; /* convert hpet clks to usec */ static unsigned long tsc_hpet_quotient; /* convert tsc to hpet clks */ static unsigned long hpet_last; /* hpet counter value at last tick*/ static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */ diff --git a/mm/mmap.c b/mm/mmap.c index 404319477e7..bb43340d397 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -61,7 +61,7 @@ pgprot_t protection_map[16] = { int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ -int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; +int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; atomic_t vm_committed_space = ATOMIC_INIT(0); /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b06a9636d97..34bba8f1144 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -42,11 +42,11 @@ * MCD - HACK: Find somewhere to initialize this EARLY, or make this * initializer cleaner */ -nodemask_t node_online_map = { { [0] = 1UL } }; +nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; EXPORT_SYMBOL(node_online_map); -nodemask_t node_possible_map = NODE_MASK_ALL; +nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; EXPORT_SYMBOL(node_possible_map); -struct pglist_data *pgdat_list; +struct pglist_data *pgdat_list __read_mostly; unsigned long totalram_pages; unsigned long totalhigh_pages; long nr_swap_pages; @@ -68,7 +68,7 @@ EXPORT_SYMBOL(nr_swap_pages); * Used by page_zone() to look up the address of the struct zone whose * id is encoded in the upper bits of page->flags */ -struct zone *zone_table[1 << ZONETABLE_SHIFT]; +struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; EXPORT_SYMBOL(zone_table); static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; -- cgit v1.2.3-70-g09d2 From 19306059cd7fedaf96b4b0260a9a8a45e513c857 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 6 Sep 2005 15:16:35 -0700 Subject: [PATCH] NMI: Update NMI users of RCU to use new API Uses of RCU for dynamically changeable NMI handlers need to use the new rcu_dereference() and rcu_assign_pointer() facilities. This change makes it clear that these uses are safe from a memory-barrier viewpoint, but the main purpose is to document exactly what operations are being protected by RCU. This has been tested on x86 and x86-64, which are the only architectures affected by this change. Signed-off-by: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/RCU/NMI-RCU.txt | 112 ++++++++++++++++++++++++++++++++++++++++++ arch/i386/kernel/traps.c | 4 +- arch/x86_64/kernel/nmi.c | 4 +- 3 files changed, 116 insertions(+), 4 deletions(-) create mode 100644 Documentation/RCU/NMI-RCU.txt (limited to 'arch/i386') diff --git a/Documentation/RCU/NMI-RCU.txt b/Documentation/RCU/NMI-RCU.txt new file mode 100644 index 00000000000..d0634a5c344 --- /dev/null +++ b/Documentation/RCU/NMI-RCU.txt @@ -0,0 +1,112 @@ +Using RCU to Protect Dynamic NMI Handlers + + +Although RCU is usually used to protect read-mostly data structures, +it is possible to use RCU to provide dynamic non-maskable interrupt +handlers, as well as dynamic irq handlers. This document describes +how to do this, drawing loosely from Zwane Mwaikambo's NMI-timer +work in "arch/i386/oprofile/nmi_timer_int.c" and in +"arch/i386/kernel/traps.c". + +The relevant pieces of code are listed below, each followed by a +brief explanation. + + static int dummy_nmi_callback(struct pt_regs *regs, int cpu) + { + return 0; + } + +The dummy_nmi_callback() function is a "dummy" NMI handler that does +nothing, but returns zero, thus saying that it did nothing, allowing +the NMI handler to take the default machine-specific action. + + static nmi_callback_t nmi_callback = dummy_nmi_callback; + +This nmi_callback variable is a global function pointer to the current +NMI handler. + + fastcall void do_nmi(struct pt_regs * regs, long error_code) + { + int cpu; + + nmi_enter(); + + cpu = smp_processor_id(); + ++nmi_count(cpu); + + if (!rcu_dereference(nmi_callback)(regs, cpu)) + default_do_nmi(regs); + + nmi_exit(); + } + +The do_nmi() function processes each NMI. It first disables preemption +in the same way that a hardware irq would, then increments the per-CPU +count of NMIs. It then invokes the NMI handler stored in the nmi_callback +function pointer. If this handler returns zero, do_nmi() invokes the +default_do_nmi() function to handle a machine-specific NMI. Finally, +preemption is restored. + +Strictly speaking, rcu_dereference() is not needed, since this code runs +only on i386, which does not need rcu_dereference() anyway. However, +it is a good documentation aid, particularly for anyone attempting to +do something similar on Alpha. + +Quick Quiz: Why might the rcu_dereference() be necessary on Alpha, + given that the code referenced by the pointer is read-only? + + +Back to the discussion of NMI and RCU... + + void set_nmi_callback(nmi_callback_t callback) + { + rcu_assign_pointer(nmi_callback, callback); + } + +The set_nmi_callback() function registers an NMI handler. Note that any +data that is to be used by the callback must be initialized up -before- +the call to set_nmi_callback(). On architectures that do not order +writes, the rcu_assign_pointer() ensures that the NMI handler sees the +initialized values. 
+ + void unset_nmi_callback(void) + { + rcu_assign_pointer(nmi_callback, dummy_nmi_callback); + } + +This function unregisters an NMI handler, restoring the original +dummy_nmi_handler(). However, there may well be an NMI handler +currently executing on some other CPU. We therefore cannot free +up any data structures used by the old NMI handler until execution +of it completes on all other CPUs. + +One way to accomplish this is via synchronize_sched(), perhaps as +follows: + + unset_nmi_callback(); + synchronize_sched(); + kfree(my_nmi_data); + +This works because synchronize_sched() blocks until all CPUs complete +any preemption-disabled segments of code that they were executing. +Since NMI handlers disable preemption, synchronize_sched() is guaranteed +not to return until all ongoing NMI handlers exit. It is therefore safe +to free up the handler's data as soon as synchronize_sched() returns. + + +Answer to Quick Quiz + + Why might the rcu_dereference() be necessary on Alpha, given + that the code referenced by the pointer is read-only? + + Answer: The caller to set_nmi_callback() might well have + initialized some data that is to be used by the + new NMI handler. In this case, the rcu_dereference() + would be needed, because otherwise a CPU that received + an NMI just after the new handler was set might see + the pointer to the new NMI handler, but the old + pre-initialized version of the handler's data. + + More important, the rcu_dereference() makes it clear + to someone reading the code that the pointer is being + protected by RCU. diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 54629bb5893..029bf94cda7 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -657,7 +657,7 @@ fastcall void do_nmi(struct pt_regs * regs, long error_code) ++nmi_count(cpu); - if (!nmi_callback(regs, cpu)) + if (!rcu_dereference(nmi_callback)(regs, cpu)) default_do_nmi(regs); nmi_exit(); @@ -665,7 +665,7 @@ fastcall void do_nmi(struct pt_regs * regs, long error_code) void set_nmi_callback(nmi_callback_t callback) { - nmi_callback = callback; + rcu_assign_pointer(nmi_callback, callback); } EXPORT_SYMBOL_GPL(set_nmi_callback); diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 84cae81fff8..caf164959e1 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -524,14 +524,14 @@ asmlinkage void do_nmi(struct pt_regs * regs, long error_code) nmi_enter(); add_pda(__nmi_count,1); - if (!nmi_callback(regs, cpu)) + if (!rcu_dereference(nmi_callback)(regs, cpu)) default_do_nmi(regs); nmi_exit(); } void set_nmi_callback(nmi_callback_t callback) { - nmi_callback = callback; + rcu_assign_pointer(nmi_callback, callback); } void unset_nmi_callback(void) -- cgit v1.2.3-70-g09d2 From f8eeaaf4180334a8e5c3582fe62a5f8176a8c124 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 6 Sep 2005 15:17:24 -0700 Subject: [PATCH] Make the bzImage format self-terminating Signed-off-by: H. 
Peter Anvin Cc: Frank Sorenson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/i386/boot.txt | 35 ++++++++++++++++++++++------------- arch/i386/boot/setup.S | 2 +- arch/i386/boot/tools/build.c | 4 +++- arch/x86_64/boot/setup.S | 2 +- arch/x86_64/boot/tools/build.c | 4 +++- 5 files changed, 30 insertions(+), 17 deletions(-) (limited to 'arch/i386') diff --git a/Documentation/i386/boot.txt b/Documentation/i386/boot.txt index 1c48f0eba6f..10312bebe55 100644 --- a/Documentation/i386/boot.txt +++ b/Documentation/i386/boot.txt @@ -2,7 +2,7 @@ ---------------------------- H. Peter Anvin - Last update 2002-01-01 + Last update 2005-09-02 On the i386 platform, the Linux kernel uses a rather complicated boot convention. This has evolved partially due to historical aspects, as @@ -34,6 +34,8 @@ Protocol 2.02: (Kernel 2.4.0-test3-pre3) New command line protocol. Protocol 2.03: (Kernel 2.4.18-pre1) Explicitly makes the highest possible initrd address available to the bootloader. +Protocol 2.04: (Kernel 2.6.14) Extend the syssize field to four bytes. + **** MEMORY LAYOUT @@ -103,10 +105,9 @@ The header looks like: Offset Proto Name Meaning /Size -01F1/1 ALL setup_sects The size of the setup in sectors +01F1/1 ALL(1 setup_sects The size of the setup in sectors 01F2/2 ALL root_flags If set, the root is mounted readonly -01F4/2 ALL syssize DO NOT USE - for bootsect.S use only -01F6/2 ALL swap_dev DO NOT USE - obsolete +01F4/4 2.04+(2 syssize The size of the 32-bit code in 16-byte paras 01F8/2 ALL ram_size DO NOT USE - for bootsect.S use only 01FA/2 ALL vid_mode Video mode control 01FC/2 ALL root_dev Default root device number @@ -129,8 +130,12 @@ Offset Proto Name Meaning 0228/4 2.02+ cmd_line_ptr 32-bit pointer to the kernel command line 022C/4 2.03+ initrd_addr_max Highest legal initrd address -For backwards compatibility, if the setup_sects field contains 0, the -real value is 4. +(1) For backwards compatibility, if the setup_sects field contains 0, the + real value is 4. + +(2) For boot protocol prior to 2.04, the upper two bytes of the syssize + field are unusable, which means the size of a bzImage kernel + cannot be determined. If the "HdrS" (0x53726448) magic number is not found at offset 0x202, the boot protocol version is "old". Loading an old kernel, the @@ -230,12 +235,16 @@ loader to communicate with the kernel. Some of its options are also relevant to the boot loader itself, see "special command line options" below. -The kernel command line is a null-terminated string up to 255 -characters long, plus the final null. +The kernel command line is a null-terminated string currently up to +255 characters long, plus the final null. A string that is too long +will be automatically truncated by the kernel, a boot loader may allow +a longer command line to be passed to permit future kernels to extend +this limit. If the boot protocol version is 2.02 or later, the address of the kernel command line is given by the header field cmd_line_ptr (see -above.) +above.) This address can be anywhere between the end of the setup +heap and 0xA0000. 
If the protocol version is *not* 2.02 or higher, the kernel command line is entered using the following protocol: @@ -255,7 +264,7 @@ command line is entered using the following protocol: **** SAMPLE BOOT CONFIGURATION As a sample configuration, assume the following layout of the real -mode segment: +mode segment (this is a typical, and recommended layout): 0x0000-0x7FFF Real mode kernel 0x8000-0x8FFF Stack and heap @@ -312,9 +321,9 @@ Such a boot loader should enter the following fields in the header: **** LOADING THE REST OF THE KERNEL -The non-real-mode kernel starts at offset (setup_sects+1)*512 in the -kernel file (again, if setup_sects == 0 the real value is 4.) It -should be loaded at address 0x10000 for Image/zImage kernels and +The 32-bit (non-real-mode) kernel starts at offset (setup_sects+1)*512 +in the kernel file (again, if setup_sects == 0 the real value is 4.) +It should be loaded at address 0x10000 for Image/zImage kernels and 0x100000 for bzImage kernels. The kernel is a bzImage kernel if the protocol >= 2.00 and the 0x01 diff --git a/arch/i386/boot/setup.S b/arch/i386/boot/setup.S index 8cb420f40c5..ca668d9df16 100644 --- a/arch/i386/boot/setup.S +++ b/arch/i386/boot/setup.S @@ -82,7 +82,7 @@ start: # This is the setup header, and it must start at %cs:2 (old 0x9020:2) .ascii "HdrS" # header signature - .word 0x0203 # header version number (>= 0x0105) + .word 0x0204 # header version number (>= 0x0105) # or else old loadlin-1.5 will fail) realmode_swtch: .word 0, 0 # default_switch, SETUPSEG start_sys_seg: .word SYSSEG diff --git a/arch/i386/boot/tools/build.c b/arch/i386/boot/tools/build.c index 6835f6d47c3..05798419a6a 100644 --- a/arch/i386/boot/tools/build.c +++ b/arch/i386/boot/tools/build.c @@ -177,7 +177,9 @@ int main(int argc, char ** argv) die("Output: seek failed"); buf[0] = (sys_size & 0xff); buf[1] = ((sys_size >> 8) & 0xff); - if (write(1, buf, 2) != 2) + buf[2] = ((sys_size >> 16) & 0xff); + buf[3] = ((sys_size >> 24) & 0xff); + if (write(1, buf, 4) != 4) die("Write of image length failed"); return 0; /* Everything is OK */ diff --git a/arch/x86_64/boot/setup.S b/arch/x86_64/boot/setup.S index ff58b2832b7..12ea0b6c52e 100644 --- a/arch/x86_64/boot/setup.S +++ b/arch/x86_64/boot/setup.S @@ -81,7 +81,7 @@ start: # This is the setup header, and it must start at %cs:2 (old 0x9020:2) .ascii "HdrS" # header signature - .word 0x0203 # header version number (>= 0x0105) + .word 0x0204 # header version number (>= 0x0105) # or else old loadlin-1.5 will fail) realmode_swtch: .word 0, 0 # default_switch, SETUPSEG start_sys_seg: .word SYSSEG diff --git a/arch/x86_64/boot/tools/build.c b/arch/x86_64/boot/tools/build.c index 18b5bac1c42..c44f5e2ec10 100644 --- a/arch/x86_64/boot/tools/build.c +++ b/arch/x86_64/boot/tools/build.c @@ -178,7 +178,9 @@ int main(int argc, char ** argv) die("Output: seek failed"); buf[0] = (sys_size & 0xff); buf[1] = ((sys_size >> 8) & 0xff); - if (write(1, buf, 2) != 2) + buf[2] = ((sys_size >> 16) & 0xff); + buf[3] = ((sys_size >> 24) & 0xff); + if (write(1, buf, 4) != 4) die("Write of image length failed"); return 0; /* Everything is OK */ -- cgit v1.2.3-70-g09d2 From 96d0821cacd095e25a39dfff5232a45b63ed18dd Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 6 Sep 2005 15:17:26 -0700 Subject: [PATCH] Fix function/macro name collision on i386 oprofile The i386 OProfile code has a function named nmi_exit(), which collides with the nmi_exit() macro in linux/hardirq.h. 
At the moment, we get away with it, because hardirq.h isn't included in the oprofile code. I hit this as a bug when working with a patch which (indirectly) adds a #include of hardirq.h to oprofile. Regardless, the name collision is probably not a good idea, so this patch fixes it, renaming the oprofile function to op_nmi_exit(). It also renames the nmi_init() and nmi_timer_init() functions similarly, for consistency. Signed-off-by: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/oprofile/init.c | 12 ++++++------ arch/i386/oprofile/nmi_int.c | 4 ++-- arch/i386/oprofile/nmi_timer_int.c | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/oprofile/init.c b/arch/i386/oprofile/init.c index c90332de582..5341d481d92 100644 --- a/arch/i386/oprofile/init.c +++ b/arch/i386/oprofile/init.c @@ -15,9 +15,9 @@ * with the NMI mode driver. */ -extern int nmi_init(struct oprofile_operations * ops); -extern int nmi_timer_init(struct oprofile_operations * ops); -extern void nmi_exit(void); +extern int op_nmi_init(struct oprofile_operations * ops); +extern int op_nmi_timer_init(struct oprofile_operations * ops); +extern void op_nmi_exit(void); extern void x86_backtrace(struct pt_regs * const regs, unsigned int depth); @@ -28,11 +28,11 @@ int __init oprofile_arch_init(struct oprofile_operations * ops) ret = -ENODEV; #ifdef CONFIG_X86_LOCAL_APIC - ret = nmi_init(ops); + ret = op_nmi_init(ops); #endif #ifdef CONFIG_X86_IO_APIC if (ret < 0) - ret = nmi_timer_init(ops); + ret = op_nmi_timer_init(ops); #endif ops->backtrace = x86_backtrace; @@ -43,6 +43,6 @@ int __init oprofile_arch_init(struct oprofile_operations * ops) void oprofile_arch_exit(void) { #ifdef CONFIG_X86_LOCAL_APIC - nmi_exit(); + op_nmi_exit(); #endif } diff --git a/arch/i386/oprofile/nmi_int.c b/arch/i386/oprofile/nmi_int.c index 255e4702d18..0493e8b8ec4 100644 --- a/arch/i386/oprofile/nmi_int.c +++ b/arch/i386/oprofile/nmi_int.c @@ -355,7 +355,7 @@ static int __init ppro_init(char ** cpu_type) /* in order to get driverfs right */ static int using_nmi; -int __init nmi_init(struct oprofile_operations *ops) +int __init op_nmi_init(struct oprofile_operations *ops) { __u8 vendor = boot_cpu_data.x86_vendor; __u8 family = boot_cpu_data.x86; @@ -420,7 +420,7 @@ int __init nmi_init(struct oprofile_operations *ops) } -void nmi_exit(void) +void op_nmi_exit(void) { if (using_nmi) exit_driverfs(); diff --git a/arch/i386/oprofile/nmi_timer_int.c b/arch/i386/oprofile/nmi_timer_int.c index c58d0c14f27..ad93cdd55d6 100644 --- a/arch/i386/oprofile/nmi_timer_int.c +++ b/arch/i386/oprofile/nmi_timer_int.c @@ -40,7 +40,7 @@ static void timer_stop(void) } -int __init nmi_timer_init(struct oprofile_operations * ops) +int __init op_nmi_timer_init(struct oprofile_operations * ops) { extern int nmi_active; -- cgit v1.2.3-70-g09d2 From 7f4bde9a3486cd7e70bedd2aff35b38667d50173 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 6 Sep 2005 15:17:39 -0700 Subject: [PATCH] remove the second arg of do_timer_interrupt() The second arg of do_timer_interrupt() is not used in the functions, and all callers pass NULL. 
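Schematically, the change is just this (sketch only; the actual hunks for i386, sh and sh64 follow):

	/* before: the dev_id argument was never used and always NULL */
	static inline void do_timer_interrupt(int irq, void *dev_id,
					      struct pt_regs *regs);

	/* after: callers go from do_timer_interrupt(irq, NULL, regs)
	 * to do_timer_interrupt(irq, regs) */
	static inline void do_timer_interrupt(int irq, struct pt_regs *regs);
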
Signed-off-by: Adrian Bunk Cc: Paul Mundt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/time.c | 5 ++--- arch/sh/kernel/time.c | 4 ++-- arch/sh64/kernel/time.c | 4 ++-- 3 files changed, 6 insertions(+), 7 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index b0c5ee2b344..9b94d84a6c3 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -252,8 +252,7 @@ EXPORT_SYMBOL(profile_pc); * timer_interrupt() needs to keep up the real-time clock, * as well as call the "do_timer()" routine every clocktick */ -static inline void do_timer_interrupt(int irq, void *dev_id, - struct pt_regs *regs) +static inline void do_timer_interrupt(int irq, struct pt_regs *regs) { #ifdef CONFIG_X86_IO_APIC if (timer_ack) { @@ -307,7 +306,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) cur_timer->mark_offset(); - do_timer_interrupt(irq, NULL, regs); + do_timer_interrupt(irq, regs); write_sequnlock(&xtime_lock); return IRQ_HANDLED; diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c index df7a9b9d4cb..d5f5aedde0a 100644 --- a/arch/sh/kernel/time.c +++ b/arch/sh/kernel/time.c @@ -234,7 +234,7 @@ static long last_rtc_update; * timer_interrupt() needs to keep up the real-time clock, * as well as call the "do_timer()" routine every clocktick */ -static inline void do_timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) +static inline void do_timer_interrupt(int irq, struct pt_regs *regs) { do_timer(regs); #ifndef CONFIG_SMP @@ -285,7 +285,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) * locally disabled. -arca */ write_seqlock(&xtime_lock); - do_timer_interrupt(irq, NULL, regs); + do_timer_interrupt(irq, regs); write_sequnlock(&xtime_lock); return IRQ_HANDLED; diff --git a/arch/sh64/kernel/time.c b/arch/sh64/kernel/time.c index 6c84da3efc7..926c6fc0619 100644 --- a/arch/sh64/kernel/time.c +++ b/arch/sh64/kernel/time.c @@ -303,7 +303,7 @@ static long last_rtc_update = 0; * timer_interrupt() needs to keep up the real-time clock, * as well as call the "do_timer()" routine every clocktick */ -static inline void do_timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) +static inline void do_timer_interrupt(int irq, struct pt_regs *regs) { unsigned long long current_ctc; asm ("getcon cr62, %0" : "=r" (current_ctc)); @@ -361,7 +361,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) * locally disabled. -arca */ write_lock(&xtime_lock); - do_timer_interrupt(irq, NULL, regs); + do_timer_interrupt(irq, regs); write_unlock(&xtime_lock); return IRQ_HANDLED; -- cgit v1.2.3-70-g09d2 From 6c231b7bab0aa6860cd9da2de8a064eddc34c146 Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Tue, 6 Sep 2005 15:17:45 -0700 Subject: [PATCH] Additions to .data.read_mostly section Mark variables which are usually accessed for reads with __readmostly. 
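For reference, the annotation is spelled __read_mostly in the tree; a minimal sketch of what it does, assuming the usual definition of the attribute:

	/* __read_mostly places a variable in the .data.read_mostly section,
	 * keeping read-hot data off cache lines that are written often: */
	#define __read_mostly __attribute__((__section__(".data.read_mostly")))

	static int example_tunable __read_mostly = 42;	/* hypothetical variable */
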
Signed-off-by: Alok N Kataria Signed-off-by: Shai Fultheim Signed-off-by: Ravikiran Thirumalai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/io_apic.c | 10 +++++----- arch/i386/kernel/timers/timer_hpet.c | 2 +- arch/i386/mm/discontig.c | 8 ++++---- arch/i386/mm/init.c | 2 +- arch/x86_64/kernel/genapic.c | 2 +- arch/x86_64/kernel/io_apic.c | 10 +++++----- arch/x86_64/kernel/setup.c | 2 +- arch/x86_64/kernel/setup64.c | 2 +- arch/x86_64/kernel/smpboot.c | 10 +++++----- arch/x86_64/mm/numa.c | 6 +++--- fs/namespace.c | 2 +- mm/page_alloc.c | 4 ++-- mm/shmem.c | 2 +- 13 files changed, 31 insertions(+), 31 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 4a594043157..0e727e6da5c 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -78,7 +78,7 @@ static struct irq_pin_list { int apic, pin, next; } irq_2_pin[PIN_MAP_SIZE]; -int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1}; +int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1}; #ifdef CONFIG_PCI_MSI #define vector_to_irq(vector) \ (platform_legacy_irq(vector) ? vector : vector_irq[vector]) @@ -1119,7 +1119,7 @@ static inline int IO_APIC_irq_trigger(int irq) } /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ -u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 }; +u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }; int assign_irq_vector(int irq) { @@ -1990,7 +1990,7 @@ static void set_ioapic_affinity_vector (unsigned int vector, * edge-triggered handler, without risking IRQ storms and other ugly * races. */ -static struct hw_interrupt_type ioapic_edge_type = { +static struct hw_interrupt_type ioapic_edge_type __read_mostly = { .typename = "IO-APIC-edge", .startup = startup_edge_ioapic, .shutdown = shutdown_edge_ioapic, @@ -2003,7 +2003,7 @@ static struct hw_interrupt_type ioapic_edge_type = { #endif }; -static struct hw_interrupt_type ioapic_level_type = { +static struct hw_interrupt_type ioapic_level_type __read_mostly = { .typename = "IO-APIC-level", .startup = startup_level_ioapic, .shutdown = shutdown_level_ioapic, @@ -2076,7 +2076,7 @@ static void ack_lapic_irq (unsigned int irq) static void end_lapic_irq (unsigned int i) { /* nothing */ } -static struct hw_interrupt_type lapic_irq_type = { +static struct hw_interrupt_type lapic_irq_type __read_mostly = { .typename = "local-APIC-edge", .startup = NULL, /* startup_irq() not used for IRQ0 */ .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */ diff --git a/arch/i386/kernel/timers/timer_hpet.c b/arch/i386/kernel/timers/timer_hpet.c index 6dbb29f834e..d973a8b681f 100644 --- a/arch/i386/kernel/timers/timer_hpet.c +++ b/arch/i386/kernel/timers/timer_hpet.c @@ -19,7 +19,7 @@ #include static unsigned long hpet_usec_quotient __read_mostly; /* convert hpet clks to usec */ -static unsigned long tsc_hpet_quotient; /* convert tsc to hpet clks */ +static unsigned long tsc_hpet_quotient __read_mostly; /* convert tsc to hpet clks */ static unsigned long hpet_last; /* hpet counter value at last tick*/ static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */ static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */ diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c index 6711ce3f691..244d8ec66be 100644 --- a/arch/i386/mm/discontig.c +++ b/arch/i386/mm/discontig.c @@ -37,7 +37,7 @@ #include #include -struct pglist_data *node_data[MAX_NUMNODES]; +struct 
pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); bootmem_data_t node0_bdata; @@ -49,8 +49,8 @@ bootmem_data_t node0_bdata; * 2) node_start_pfn - the starting page frame number for a node * 3) node_end_pfn - the ending page fram number for a node */ -unsigned long node_start_pfn[MAX_NUMNODES]; -unsigned long node_end_pfn[MAX_NUMNODES]; +unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly; +unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly; #ifdef CONFIG_DISCONTIGMEM @@ -66,7 +66,7 @@ unsigned long node_end_pfn[MAX_NUMNODES]; * physnode_map[4-7] = 1; * physnode_map[8- ] = -1; */ -s8 physnode_map[MAX_ELEMENTS] = { [0 ... (MAX_ELEMENTS - 1)] = -1}; +s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1}; EXPORT_SYMBOL(physnode_map); void memory_present(int nid, unsigned long start, unsigned long end) diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 9edfc058b89..2ebaf75f732 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -393,7 +393,7 @@ void zap_low_mappings (void) } static int disable_nx __initdata = 0; -u64 __supported_pte_mask = ~_PAGE_NX; +u64 __supported_pte_mask __read_mostly = ~_PAGE_NX; /* * noexec = on|off diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c index 30c843a5efd..f062aa03bab 100644 --- a/arch/x86_64/kernel/genapic.c +++ b/arch/x86_64/kernel/genapic.c @@ -25,7 +25,7 @@ #endif /* which logical CPU number maps to which CPU (physical APIC ID) */ -u8 x86_cpu_to_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; +u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; EXPORT_SYMBOL(x86_cpu_to_apicid); u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 76bcc4e6979..ba1a744e9bf 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -70,7 +70,7 @@ static struct irq_pin_list { short apic, pin, next; } irq_2_pin[PIN_MAP_SIZE]; -int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1}; +int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1}; #ifdef CONFIG_PCI_MSI #define vector_to_irq(vector) \ (platform_legacy_irq(vector) ? vector : vector_irq[vector]) @@ -683,7 +683,7 @@ static inline int IO_APIC_irq_trigger(int irq) } /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ -u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 }; +u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }; int assign_irq_vector(int irq) { @@ -1443,7 +1443,7 @@ static void set_ioapic_affinity_vector (unsigned int vector, * races. 
*/ -static struct hw_interrupt_type ioapic_edge_type = { +static struct hw_interrupt_type ioapic_edge_type __read_mostly = { .typename = "IO-APIC-edge", .startup = startup_edge_ioapic, .shutdown = shutdown_edge_ioapic, @@ -1456,7 +1456,7 @@ static struct hw_interrupt_type ioapic_edge_type = { #endif }; -static struct hw_interrupt_type ioapic_level_type = { +static struct hw_interrupt_type ioapic_level_type __read_mostly = { .typename = "IO-APIC-level", .startup = startup_level_ioapic, .shutdown = shutdown_level_ioapic, @@ -1529,7 +1529,7 @@ static void ack_lapic_irq (unsigned int irq) static void end_lapic_irq (unsigned int i) { /* nothing */ } -static struct hw_interrupt_type lapic_irq_type = { +static struct hw_interrupt_type lapic_irq_type __read_mostly = { .typename = "local-APIC-edge", .startup = NULL, /* startup_irq() not used for IRQ0 */ .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */ diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 116a491e296..b356f8e6adf 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -65,7 +65,7 @@ * Machine setup.. */ -struct cpuinfo_x86 boot_cpu_data; +struct cpuinfo_x86 boot_cpu_data __read_mostly; unsigned long mmu_cr4_features; diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index 34082c1cc41..e3ffcacc8c9 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -36,7 +36,7 @@ struct desc_ptr idt_descr = { 256 * 16, (unsigned long) idt_table }; char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); -unsigned long __supported_pte_mask = ~0UL; +unsigned long __supported_pte_mask __read_mostly = ~0UL; static int do_not_nx __initdata = 0; /* noexec=on|off diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index fa25e39fe54..90aeccd1519 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -62,13 +62,13 @@ /* Number of siblings per CPU package */ int smp_num_siblings = 1; /* Package ID of each logical CPU */ -u8 phys_proc_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; -u8 cpu_core_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; +u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; +u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; EXPORT_SYMBOL(phys_proc_id); EXPORT_SYMBOL(cpu_core_id); /* Bitmask of currently online CPUs */ -cpumask_t cpu_online_map; +cpumask_t cpu_online_map __read_mostly; EXPORT_SYMBOL(cpu_online_map); @@ -88,8 +88,8 @@ struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; /* Set when the idlers are all forked */ int smp_threads_ready; -cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned; -cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned; +cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly; +cpumask_t cpu_core_map[NR_CPUS] __read_mostly; EXPORT_SYMBOL(cpu_core_map); /* diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 6a156f5692a..04f7a33e144 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -22,14 +22,14 @@ #define Dprintk(x...) #endif -struct pglist_data *node_data[MAX_NUMNODES]; +struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; bootmem_data_t plat_node_bdata[MAX_NUMNODES]; int memnode_shift; u8 memnodemap[NODEMAPSIZE]; -unsigned char cpu_to_node[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE }; -cpumask_t node_to_cpumask[MAX_NUMNODES]; +unsigned char cpu_to_node[NR_CPUS] __read_mostly = { [0 ... 
NR_CPUS-1] = NUMA_NO_NODE }; +cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; int numa_off __initdata; diff --git a/fs/namespace.c b/fs/namespace.c index 79bd8a46e1e..29b70669435 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -40,7 +40,7 @@ static inline int sysfs_init(void) __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock); static struct list_head *mount_hashtable; -static int hash_mask, hash_bits; +static int hash_mask __read_mostly, hash_bits __read_mostly; static kmem_cache_t *mnt_cache; static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 34bba8f1144..14d7032c1d1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -47,8 +47,8 @@ EXPORT_SYMBOL(node_online_map); nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; EXPORT_SYMBOL(node_possible_map); struct pglist_data *pgdat_list __read_mostly; -unsigned long totalram_pages; -unsigned long totalhigh_pages; +unsigned long totalram_pages __read_mostly; +unsigned long totalhigh_pages __read_mostly; long nr_swap_pages; /* diff --git a/mm/shmem.c b/mm/shmem.c index bdc4bbb6ddb..db2c9e8d990 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -180,7 +180,7 @@ static struct inode_operations shmem_inode_operations; static struct inode_operations shmem_dir_inode_operations; static struct vm_operations_struct shmem_vm_ops; -static struct backing_dev_info shmem_backing_dev_info = { +static struct backing_dev_info shmem_backing_dev_info __read_mostly = { .ra_pages = 0, /* No readahead */ .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, .unplug_io_fn = default_unplug_io_fn, -- cgit v1.2.3-70-g09d2 From b149ee2233edf08fb59b11e879a2c5941929bcb8 Mon Sep 17 00:00:00 2001 From: john stultz Date: Tue, 6 Sep 2005 15:17:46 -0700 Subject: [PATCH] NTP: ntp-helper functions This patch cleans up a commonly repeated set of changes to the NTP state variables by adding two helper inline functions: ntp_clear(): Clears the ntp state variables ntp_synced(): Returns 1 if the system is synced with a time server. This was compile tested for alpha, arm, i386, x86-64, ppc64, s390, sparc, sparc64. 
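Every conversion in the diff below follows the same pattern; a representative before/after sketch (conditions abbreviated with "..."):

	/* before: open-coded in each architecture */
	time_adjust = 0;	/* stop active adjtime() */
	time_status |= STA_UNSYNC;
	time_maxerror = NTP_PHASE_LIMIT;
	time_esterror = NTP_PHASE_LIMIT;
	...
	if ((time_status & STA_UNSYNC) == 0 && ...)
		set_rtc_mmss(xtime.tv_sec);

	/* after: */
	ntp_clear();
	...
	if (ntp_synced() && ...)
		set_rtc_mmss(xtime.tv_sec);
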
Signed-off-by: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/time.c | 7 ++----- arch/arm/kernel/time.c | 7 ++----- arch/arm26/kernel/time.c | 7 ++----- arch/cris/arch-v10/kernel/time.c | 2 +- arch/cris/kernel/time.c | 5 +---- arch/frv/kernel/time.c | 7 ++----- arch/h8300/kernel/time.c | 5 +---- arch/i386/kernel/time.c | 7 ++----- arch/m32r/kernel/time.c | 7 ++----- arch/m68k/kernel/time.c | 5 +---- arch/m68knommu/kernel/time.c | 7 ++----- arch/mips/kernel/sysirix.c | 5 +---- arch/mips/kernel/time.c | 7 ++----- arch/mips/sgi-ip27/ip27-timer.c | 2 +- arch/parisc/kernel/time.c | 5 +---- arch/ppc/kernel/time.c | 7 ++----- arch/ppc64/kernel/time.c | 7 ++----- arch/s390/kernel/time.c | 5 +---- arch/sh/kernel/time.c | 7 ++----- arch/sh64/kernel/time.c | 7 ++----- arch/sparc/kernel/pcic.c | 5 +---- arch/sparc/kernel/time.c | 7 ++----- arch/sparc64/kernel/time.c | 2 +- arch/v850/kernel/time.c | 7 ++----- arch/x86_64/kernel/time.c | 7 ++----- arch/xtensa/kernel/time.c | 7 ++----- include/linux/timex.h | 23 +++++++++++++++++++++++ 27 files changed, 65 insertions(+), 111 deletions(-) (limited to 'arch/i386') diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c index 8226c5cd788..67be50b7d80 100644 --- a/arch/alpha/kernel/time.c +++ b/arch/alpha/kernel/time.c @@ -149,7 +149,7 @@ irqreturn_t timer_interrupt(int irq, void *dev, struct pt_regs * regs) * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be * called as close as possible to 500 ms before the new second starts. */ - if ((time_status & STA_UNSYNC) == 0 + if (ntp_synced() && xtime.tv_sec > state.last_rtc_update + 660 && xtime.tv_nsec >= 500000 - ((unsigned) TICK_SIZE) / 2 && xtime.tv_nsec <= 500000 + ((unsigned) TICK_SIZE) / 2) { @@ -502,10 +502,7 @@ do_settimeofday(struct timespec *tv) set_normalized_timespec(&xtime, sec, nsec); set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; + ntp_clear(); write_sequnlock_irq(&xtime_lock); clock_was_set(); diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c index 8880482dcbf..69449a818dc 100644 --- a/arch/arm/kernel/time.c +++ b/arch/arm/kernel/time.c @@ -102,7 +102,7 @@ static unsigned long next_rtc_update; */ static inline void do_set_rtc(void) { - if (time_status & STA_UNSYNC || set_rtc == NULL) + if (!ntp_synced() || set_rtc == NULL) return; if (next_rtc_update && @@ -292,10 +292,7 @@ int do_settimeofday(struct timespec *tv) set_normalized_timespec(&xtime, sec, nsec); set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; + ntp_clear(); write_sequnlock_irq(&xtime_lock); clock_was_set(); return 0; diff --git a/arch/arm26/kernel/time.c b/arch/arm26/kernel/time.c index 549a6b2e177..e66aedd02fa 100644 --- a/arch/arm26/kernel/time.c +++ b/arch/arm26/kernel/time.c @@ -114,7 +114,7 @@ static unsigned long next_rtc_update; */ static inline void do_set_rtc(void) { - if (time_status & STA_UNSYNC || set_rtc == NULL) + if (!ntp_synced() || set_rtc == NULL) return; //FIXME - timespec.tv_sec is a time_t not unsigned long @@ -189,10 +189,7 @@ int do_settimeofday(struct timespec *tv) xtime.tv_sec = tv->tv_sec; xtime.tv_nsec = tv->tv_nsec; - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror 
= NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; + ntp_clear(); write_sequnlock_irq(&xtime_lock); clock_was_set(); return 0; diff --git a/arch/cris/arch-v10/kernel/time.c b/arch/cris/arch-v10/kernel/time.c index 6b7b4e0802e..dc3dfe9b4a1 100644 --- a/arch/cris/arch-v10/kernel/time.c +++ b/arch/cris/arch-v10/kernel/time.c @@ -240,7 +240,7 @@ timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) * The division here is not time critical since it will run once in * 11 minutes */ - if ((time_status & STA_UNSYNC) == 0 && + if (ntp_synced() && xtime.tv_sec > last_rtc_update + 660 && (xtime.tv_nsec / 1000) >= 500000 - (tick_nsec / 1000) / 2 && (xtime.tv_nsec / 1000) <= 500000 + (tick_nsec / 1000) / 2) { diff --git a/arch/cris/kernel/time.c b/arch/cris/kernel/time.c index fa2d4323da2..a2d99b4aedc 100644 --- a/arch/cris/kernel/time.c +++ b/arch/cris/kernel/time.c @@ -114,10 +114,7 @@ int do_settimeofday(struct timespec *tv) set_normalized_timespec(&xtime, sec, nsec); set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; + ntp_clear(); write_sequnlock_irq(&xtime_lock); clock_was_set(); return 0; diff --git a/arch/frv/kernel/time.c b/arch/frv/kernel/time.c index 075db664469..8d6558b00e4 100644 --- a/arch/frv/kernel/time.c +++ b/arch/frv/kernel/time.c @@ -85,7 +85,7 @@ static irqreturn_t timer_interrupt(int irq, void *dummy, struct pt_regs * regs) * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be * called as close as possible to 500 ms before the new second starts. */ - if ((time_status & STA_UNSYNC) == 0 && + if (ntp_synced() && xtime.tv_sec > last_rtc_update + 660 && (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 && (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2 @@ -216,10 +216,7 @@ int do_settimeofday(struct timespec *tv) set_normalized_timespec(&xtime, sec, nsec); set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; + ntp_clear(); write_sequnlock_irq(&xtime_lock); clock_was_set(); return 0; diff --git a/arch/h8300/kernel/time.c b/arch/h8300/kernel/time.c index 8a600218334..af8c5d2057d 100644 --- a/arch/h8300/kernel/time.c +++ b/arch/h8300/kernel/time.c @@ -116,10 +116,7 @@ int do_settimeofday(struct timespec *tv) xtime.tv_sec = tv->tv_sec; xtime.tv_nsec = tv->tv_nsec; - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; + ntp_clear(); write_sequnlock_irq(&xtime_lock); clock_was_set(); return 0; diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index 9b94d84a6c3..eefea7c5500 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -194,10 +194,7 @@ int do_settimeofday(struct timespec *tv) set_normalized_timespec(&xtime, sec, nsec); set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; + ntp_clear(); write_sequnlock_irq(&xtime_lock); clock_was_set(); return 0; @@ -347,7 +344,7 @@ static void sync_cmos_clock(unsigned long dummy) * This code is run on a timer. If the clock is set, that timer * may not expire at the correct time. Thus, we adjust... 
*/ - if ((time_status & STA_UNSYNC) != 0) + if (!ntp_synced()) /* * Not synced, exit, do not restart a timer (if one is * running, let it run out). diff --git a/arch/m32r/kernel/time.c b/arch/m32r/kernel/time.c index 8a2b77bc574..539c562cd54 100644 --- a/arch/m32r/kernel/time.c +++ b/arch/m32r/kernel/time.c @@ -171,10 +171,7 @@ int do_settimeofday(struct timespec *tv) set_normalized_timespec(&xtime, sec, nsec); set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; + ntp_clear(); write_sequnlock_irq(&xtime_lock); clock_was_set(); @@ -221,7 +218,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) * called as close as possible to 500 ms before the new second starts. */ write_seqlock(&xtime_lock); - if ((time_status & STA_UNSYNC) == 0 + if (ntp_synced() && xtime.tv_sec > last_rtc_update + 660 && (xtime.tv_nsec / 1000) >= 500000 - ((unsigned)TICK_SIZE) / 2 && (xtime.tv_nsec / 1000) <= 500000 + ((unsigned)TICK_SIZE) / 2) diff --git a/arch/m68k/kernel/time.c b/arch/m68k/kernel/time.c index e47e1958852..4ec95e3cb87 100644 --- a/arch/m68k/kernel/time.c +++ b/arch/m68k/kernel/time.c @@ -166,10 +166,7 @@ int do_settimeofday(struct timespec *tv) set_normalized_timespec(&xtime, sec, nsec); set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; + ntp_clear(); write_sequnlock_irq(&xtime_lock); clock_was_set(); return 0; diff --git a/arch/m68knommu/kernel/time.c b/arch/m68knommu/kernel/time.c index 5c3ca671627..b17c1ecba96 100644 --- a/arch/m68knommu/kernel/time.c +++ b/arch/m68knommu/kernel/time.c @@ -68,7 +68,7 @@ static irqreturn_t timer_interrupt(int irq, void *dummy, struct pt_regs * regs) * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be * called as close as possible to 500 ms before the new second starts. 
*/ - if ((time_status & STA_UNSYNC) == 0 && + if (ntp_synced() && xtime.tv_sec > last_rtc_update + 660 && (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 && (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) { @@ -178,10 +178,7 @@ int do_settimeofday(struct timespec *tv) set_normalized_timespec(&xtime, sec, nsec); set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; + ntp_clear(); write_sequnlock_irq(&xtime_lock); clock_was_set(); return 0; diff --git a/arch/mips/kernel/sysirix.c b/arch/mips/kernel/sysirix.c index f3bf0e43b8b..b4659546271 100644 --- a/arch/mips/kernel/sysirix.c +++ b/arch/mips/kernel/sysirix.c @@ -632,10 +632,7 @@ asmlinkage int irix_stime(int value) write_seqlock_irq(&xtime_lock); xtime.tv_sec = value; xtime.tv_nsec = 0; - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; + ntp_clear(); write_sequnlock_irq(&xtime_lock); return 0; diff --git a/arch/mips/kernel/time.c b/arch/mips/kernel/time.c index 648c82292ed..0dd0df7a3b0 100644 --- a/arch/mips/kernel/time.c +++ b/arch/mips/kernel/time.c @@ -223,10 +223,7 @@ int do_settimeofday(struct timespec *tv) set_normalized_timespec(&xtime, sec, nsec); set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; + ntp_clear(); write_sequnlock_irq(&xtime_lock); clock_was_set(); @@ -442,7 +439,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) * called as close as possible to 500 ms before the new second starts. */ write_seqlock(&xtime_lock); - if ((time_status & STA_UNSYNC) == 0 && + if (ntp_synced() && xtime.tv_sec > last_rtc_update + 660 && (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 && (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) { diff --git a/arch/mips/sgi-ip27/ip27-timer.c b/arch/mips/sgi-ip27/ip27-timer.c index 8c1b96fffa7..cddf1cedf00 100644 --- a/arch/mips/sgi-ip27/ip27-timer.c +++ b/arch/mips/sgi-ip27/ip27-timer.c @@ -118,7 +118,7 @@ again: * RTC clock accordingly every ~11 minutes. Set_rtc_mmss() has to be * called as close as possible to when a second starts. 
*/ - if ((time_status & STA_UNSYNC) == 0 && + if (ntp_synced() && xtime.tv_sec > last_rtc_update + 660 && (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 && (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) { diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c index 6cf7407344b..7ff67f8e9f8 100644 --- a/arch/parisc/kernel/time.c +++ b/arch/parisc/kernel/time.c @@ -188,10 +188,7 @@ do_settimeofday (struct timespec *tv) set_normalized_timespec(&xtime, sec, nsec); set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; + ntp_clear(); } write_sequnlock_irq(&xtime_lock); clock_was_set(); diff --git a/arch/ppc/kernel/time.c b/arch/ppc/kernel/time.c index bf4ddca5e85..a3c5281a5d2 100644 --- a/arch/ppc/kernel/time.c +++ b/arch/ppc/kernel/time.c @@ -169,7 +169,7 @@ void timer_interrupt(struct pt_regs * regs) * We should have an rtc call that only sets the minutes and * seconds like on Intel to avoid problems with non UTC clocks. */ - if ( ppc_md.set_rtc_time && (time_status & STA_UNSYNC) == 0 && + if ( ppc_md.set_rtc_time && ntp_synced() && xtime.tv_sec - last_rtc_update >= 659 && abs((xtime.tv_nsec / 1000) - (1000000-1000000/HZ)) < 500000/HZ && jiffies - wall_jiffies == 1) { @@ -271,10 +271,7 @@ int do_settimeofday(struct timespec *tv) */ last_rtc_update = new_sec - 658; - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; + ntp_clear(); write_sequnlock_irqrestore(&xtime_lock, flags); clock_was_set(); return 0; diff --git a/arch/ppc64/kernel/time.c b/arch/ppc64/kernel/time.c index 91ef95ccda4..9939c206afa 100644 --- a/arch/ppc64/kernel/time.c +++ b/arch/ppc64/kernel/time.c @@ -128,7 +128,7 @@ static __inline__ void timer_check_rtc(void) * We should have an rtc call that only sets the minutes and * seconds like on Intel to avoid problems with non UTC clocks. 
*/ - if ( (time_status & STA_UNSYNC) == 0 && + if (ntp_synced() && xtime.tv_sec - last_rtc_update >= 659 && abs((xtime.tv_nsec/1000) - (1000000-1000000/HZ)) < 500000/HZ && jiffies - wall_jiffies == 1) { @@ -435,10 +435,7 @@ int do_settimeofday(struct timespec *tv) */ last_rtc_update = new_sec - 658; - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; + ntp_clear(); delta_xsec = mulhdu( (tb_last_stamp-do_gtod.varp->tb_orig_stamp), do_gtod.varp->tb_to_xs ); diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 8ca48567678..2fd75da1549 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -139,10 +139,7 @@ int do_settimeofday(struct timespec *tv) set_normalized_timespec(&xtime, sec, nsec); set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; + ntp_clear(); write_sequnlock_irq(&xtime_lock); clock_was_set(); return 0; diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c index d5f5aedde0a..02ca69918d7 100644 --- a/arch/sh/kernel/time.c +++ b/arch/sh/kernel/time.c @@ -215,10 +215,7 @@ int do_settimeofday(struct timespec *tv) set_normalized_timespec(&xtime, sec, nsec); set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; + ntp_clear(); write_sequnlock_irq(&xtime_lock); clock_was_set(); @@ -252,7 +249,7 @@ static inline void do_timer_interrupt(int irq, struct pt_regs *regs) * RTC clock accordingly every ~11 minutes. Set_rtc_mmss() has to be * called as close as possible to 500 ms before the new second starts. */ - if ((time_status & STA_UNSYNC) == 0 && + if (ntp_synced() && xtime.tv_sec > last_rtc_update + 660 && (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 && (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) { diff --git a/arch/sh64/kernel/time.c b/arch/sh64/kernel/time.c index 926c6fc0619..f4a62a10053 100644 --- a/arch/sh64/kernel/time.c +++ b/arch/sh64/kernel/time.c @@ -247,10 +247,7 @@ int do_settimeofday(struct timespec *tv) set_normalized_timespec(&xtime, sec, nsec); set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; + ntp_clear(); write_sequnlock_irq(&xtime_lock); clock_was_set(); @@ -328,7 +325,7 @@ static inline void do_timer_interrupt(int irq, struct pt_regs *regs) * RTC clock accordingly every ~11 minutes. Set_rtc_mmss() has to be * called as close as possible to 500 ms before the new second starts. 
*/ - if ((time_status & STA_UNSYNC) == 0 && + if (ntp_synced() && xtime.tv_sec > last_rtc_update + 660 && (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 && (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) { diff --git a/arch/sparc/kernel/pcic.c b/arch/sparc/kernel/pcic.c index 597d3ff6ad6..36a40697b8d 100644 --- a/arch/sparc/kernel/pcic.c +++ b/arch/sparc/kernel/pcic.c @@ -840,10 +840,7 @@ static int pci_do_settimeofday(struct timespec *tv) xtime.tv_sec = tv->tv_sec; xtime.tv_nsec = tv->tv_nsec; - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; + ntp_clear(); return 0; } diff --git a/arch/sparc/kernel/time.c b/arch/sparc/kernel/time.c index 3b759aefc17..bc015e98034 100644 --- a/arch/sparc/kernel/time.c +++ b/arch/sparc/kernel/time.c @@ -139,7 +139,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs * regs) /* Determine when to update the Mostek clock. */ - if ((time_status & STA_UNSYNC) == 0 && + if (ntp_synced() && xtime.tv_sec > last_rtc_update + 660 && (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 && (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) { @@ -554,10 +554,7 @@ static int sbus_do_settimeofday(struct timespec *tv) set_normalized_timespec(&xtime, sec, nsec); set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; + ntp_clear(); return 0; } diff --git a/arch/sparc64/kernel/time.c b/arch/sparc64/kernel/time.c index 362b9c26871..3f08a32f51a 100644 --- a/arch/sparc64/kernel/time.c +++ b/arch/sparc64/kernel/time.c @@ -449,7 +449,7 @@ static inline void timer_check_rtc(void) static long last_rtc_update; /* Determine when to update the Mostek clock. */ - if ((time_status & STA_UNSYNC) == 0 && + if (ntp_synced() && xtime.tv_sec > last_rtc_update + 660 && (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 && (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) { diff --git a/arch/v850/kernel/time.c b/arch/v850/kernel/time.c index f722a268238..ea3fd8844ff 100644 --- a/arch/v850/kernel/time.c +++ b/arch/v850/kernel/time.c @@ -66,7 +66,7 @@ static irqreturn_t timer_interrupt (int irq, void *dummy, struct pt_regs *regs) * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be * called as close as possible to 500 ms before the new second starts. 
*/ - if ((time_status & STA_UNSYNC) == 0 && + if (ntp_synced() && xtime.tv_sec > last_rtc_update + 660 && (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 && (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) { @@ -169,10 +169,7 @@ int do_settimeofday(struct timespec *tv) xtime.tv_sec = tv->tv_sec; xtime.tv_nsec = tv->tv_nsec; - time_adjust = 0; /* stop active adjtime () */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; + ntp_clear(); write_sequnlock_irq (&xtime_lock); clock_was_set(); diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 2b5d9da912a..7b6abe05825 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -176,10 +176,7 @@ int do_settimeofday(struct timespec *tv) set_normalized_timespec(&xtime, sec, nsec); set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; + ntp_clear(); write_sequnlock_irq(&xtime_lock); clock_was_set(); @@ -471,7 +468,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) * off) isn't likely to go away much sooner anyway. */ - if ((~time_status & STA_UNSYNC) && xtime.tv_sec > rtc_update && + if (ntp_synced() && xtime.tv_sec > rtc_update && abs(xtime.tv_nsec - 500000000) <= tick_nsec / 2) { set_rtc_mmss(xtime.tv_sec); rtc_update = xtime.tv_sec + 660; diff --git a/arch/xtensa/kernel/time.c b/arch/xtensa/kernel/time.c index e07287db5a4..1ac7d5ce745 100644 --- a/arch/xtensa/kernel/time.c +++ b/arch/xtensa/kernel/time.c @@ -122,10 +122,7 @@ int do_settimeofday(struct timespec *tv) set_normalized_timespec(&xtime, sec, nsec); set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; + ntp_clear(); write_sequnlock_irq(&xtime_lock); return 0; } @@ -184,7 +181,7 @@ again: next += CCOUNT_PER_JIFFY; do_timer (regs); /* Linux handler in kernel/timer.c */ - if ((time_status & STA_UNSYNC) == 0 && + if (ntp_synced() && xtime.tv_sec - last_rtc_update >= 659 && abs((xtime.tv_nsec/1000)-(1000000-1000000/HZ))<5000000/HZ && jiffies - wall_jiffies == 1) { diff --git a/include/linux/timex.h b/include/linux/timex.h index 74fdd07d379..7e050a2cc35 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -260,6 +260,29 @@ extern long pps_calcnt; /* calibration intervals */ extern long pps_errcnt; /* calibration errors */ extern long pps_stbcnt; /* stability limit exceeded */ +/** + * ntp_clear - Clears the NTP state variables + * + * Must be called while holding a write on the xtime_lock + */ +static inline void ntp_clear(void) +{ + time_adjust = 0; /* stop active adjtime() */ + time_status |= STA_UNSYNC; + time_maxerror = NTP_PHASE_LIMIT; + time_esterror = NTP_PHASE_LIMIT; +} + +/** + * ntp_synced - Returns 1 if the NTP status is not UNSYNC + * + */ +static inline int ntp_synced(void) +{ + return !(time_status & STA_UNSYNC); +} + + #ifdef CONFIG_TIME_INTERPOLATION #define TIME_SOURCE_CPU 0 -- cgit v1.2.3-70-g09d2 From 61e032fa2f659fada02ede5087b46963a1c7de34 Mon Sep 17 00:00:00 2001 From: Andrey Panin Date: Tue, 6 Sep 2005 15:18:26 -0700 Subject: [PATCH] dmi: remove unneeded function After elimination of the central DMI blacklist, the dmi_scan_machine() function became a wrapper for dmi_iterate().
This patch moves some code around to kill the unneeded function. Signed-off-by: Andrey Panin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/dmi_scan.c | 85 +++++++++++++++++++++------------------ 1 file changed, 40 insertions(+), 45 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/dmi_scan.c b/arch/i386/kernel/dmi_scan.c index a3cdf894302..6ae4ef472db 100644 --- a/arch/i386/kernel/dmi_scan.c +++ b/arch/i386/kernel/dmi_scan.c @@ -84,49 +84,6 @@ static int __init dmi_checksum(u8 *buf) return sum == 0; } -static int __init dmi_iterate(void (*decode)(struct dmi_header *)) -{ - u8 buf[15]; - char __iomem *p, *q; - - /* - * no iounmap() for that ioremap(); it would be a no-op, but it's - * so early in setup that sucker gets confused into doing what - * it shouldn't if we actually call it. - */ - p = ioremap(0xF0000, 0x10000); - if (p == NULL) - return -1; - - for (q = p; q < p + 0x10000; q += 16) { - memcpy_fromio(buf, q, 15); - if ((memcmp(buf, "_DMI_", 5) == 0) && dmi_checksum(buf)) { - u16 num = (buf[13] << 8) | buf[12]; - u16 len = (buf[7] << 8) | buf[6]; - u32 base = (buf[11] << 24) | (buf[10] << 16) | - (buf[9] << 8) | buf[8]; - - /* - * DMI version 0.0 means that the real version is taken from - * the SMBIOS version, which we don't know at this point. - */ - if (buf[14] != 0) - printk(KERN_INFO "DMI %d.%d present.\n", - buf[14] >> 4, buf[14] & 0xF); - else - printk(KERN_INFO "DMI present.\n"); - - dmi_printk((KERN_INFO "%d structures occupying %d bytes.\n", - num, len)); - dmi_printk((KERN_INFO "DMI table at 0x%08X.\n", base)); - - if (dmi_table(base,len, num, decode) == 0) - return 0; - } - } - return -1; -} - static char *dmi_ident[DMI_STRING_MAX]; /* @@ -190,8 +147,46 @@ static void __init dmi_decode(struct dmi_header *dm) void __init dmi_scan_machine(void) { - if (dmi_iterate(dmi_decode)) - printk(KERN_INFO "DMI not present.\n"); + u8 buf[15]; + char __iomem *p, *q; + + /* + * no iounmap() for that ioremap(); it would be a no-op, but it's + * so early in setup that sucker gets confused into doing what + * it shouldn't if we actually call it. + */ + p = ioremap(0xF0000, 0x10000); + if (p == NULL) + goto out; + + for (q = p; q < p + 0x10000; q += 16) { + memcpy_fromio(buf, q, 15); + if ((memcmp(buf, "_DMI_", 5) == 0) && dmi_checksum(buf)) { + u16 num = (buf[13] << 8) | buf[12]; + u16 len = (buf[7] << 8) | buf[6]; + u32 base = (buf[11] << 24) | (buf[10] << 16) | + (buf[9] << 8) | buf[8]; + + /* + * DMI version 0.0 means that the real version is taken from + * the SMBIOS version, which we don't know at this point. + */ + if (buf[14] != 0) + printk(KERN_INFO "DMI %d.%d present.\n", - buf[14] >> 4, buf[14] & 0xF); hmm
Signed-off-by: Andrey Panin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/dmi_scan.c | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/dmi_scan.c b/arch/i386/kernel/dmi_scan.c index 6ae4ef472db..ecdaae26275 100644 --- a/arch/i386/kernel/dmi_scan.c +++ b/arch/i386/kernel/dmi_scan.c @@ -12,13 +12,6 @@ struct dmi_header { u16 handle; }; -#undef DMI_DEBUG - -#ifdef DMI_DEBUG -#define dmi_printk(x) printk x -#else -#define dmi_printk(x) -#endif static char * __init dmi_string(struct dmi_header *dm, u8 s) { @@ -117,29 +110,19 @@ static void __init dmi_decode(struct dmi_header *dm) switch(dm->type) { case 0: - dmi_printk(("BIOS Vendor: %s\n", dmi_string(dm, data[4]))); dmi_save_ident(dm, DMI_BIOS_VENDOR, 4); - dmi_printk(("BIOS Version: %s\n", dmi_string(dm, data[5]))); dmi_save_ident(dm, DMI_BIOS_VERSION, 5); - dmi_printk(("BIOS Release: %s\n", dmi_string(dm, data[8]))); dmi_save_ident(dm, DMI_BIOS_DATE, 8); break; case 1: - dmi_printk(("System Vendor: %s\n", dmi_string(dm, data[4]))); dmi_save_ident(dm, DMI_SYS_VENDOR, 4); - dmi_printk(("Product Name: %s\n", dmi_string(dm, data[5]))); dmi_save_ident(dm, DMI_PRODUCT_NAME, 5); - dmi_printk(("Version: %s\n", dmi_string(dm, data[6]))); dmi_save_ident(dm, DMI_PRODUCT_VERSION, 6); - dmi_printk(("Serial Number: %s\n", dmi_string(dm, data[7]))); dmi_save_ident(dm, DMI_PRODUCT_SERIAL, 7); break; case 2: - dmi_printk(("Board Vendor: %s\n", dmi_string(dm, data[4]))); dmi_save_ident(dm, DMI_BOARD_VENDOR, 4); - dmi_printk(("Board Name: %s\n", dmi_string(dm, data[5]))); dmi_save_ident(dm, DMI_BOARD_NAME, 5); - dmi_printk(("Board Version: %s\n", dmi_string(dm, data[6]))); dmi_save_ident(dm, DMI_BOARD_VERSION, 6); break; } @@ -177,10 +160,6 @@ void __init dmi_scan_machine(void) else printk(KERN_INFO "DMI present.\n"); - dmi_printk((KERN_INFO "%d structures occupying %d bytes.\n", - num, len)); - dmi_printk((KERN_INFO "DMI table at 0x%08X.\n", base)); - if (dmi_table(base,len, num, dmi_decode) == 0) return; } -- cgit v1.2.3-70-g09d2 From c3c7120d552989be94c9137989be5abb6da8954f Mon Sep 17 00:00:00 2001 From: Andrey Panin Date: Tue, 6 Sep 2005 15:18:28 -0700 Subject: [PATCH] dmi: make dmi_string() behave like strdup() This patch changes the dmi_string() function to allocate the string copy itself, to avoid code duplication in the next patch.
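The resulting contract, as inferred from the hunks below:

	/*
	 * dmi_string() now behaves like a bootmem strdup():
	 *   - string index 0, or an empty string -> returns ""
	 *   - alloc_bootmem() failure            -> returns NULL (after a printk)
	 *   - otherwise                          -> returns a freshly allocated copy
	 * so callers such as dmi_save_ident() are left with a single NULL check.
	 */
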
Signed-off-by: Andrey Panin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/dmi_scan.c | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/dmi_scan.c b/arch/i386/kernel/dmi_scan.c index ecdaae26275..ae1a1aed2fc 100644 --- a/arch/i386/kernel/dmi_scan.c +++ b/arch/i386/kernel/dmi_scan.c @@ -16,15 +16,25 @@ struct dmi_header { static char * __init dmi_string(struct dmi_header *dm, u8 s) { u8 *bp = ((u8 *) dm) + dm->length; + char *str = ""; - if (!s) - return ""; - s--; - while (s > 0 && *bp) { - bp += strlen(bp) + 1; + if (s) { s--; - } - return bp; + while (s > 0 && *bp) { + bp += strlen(bp) + 1; + s--; + } + + if (*bp != 0) { + str = alloc_bootmem(strlen(bp) + 1); + if (str != NULL) + strcpy(str, bp); + else + printk(KERN_ERR "dmi_string: out of memory.\n"); + } + } + + return str; } /* @@ -84,19 +94,16 @@ static char *dmi_ident[DMI_STRING_MAX]; */ static void __init dmi_save_ident(struct dmi_header *dm, int slot, int string) { - char *d = (char*)dm; - char *p = dmi_string(dm, d[string]); + char *p, *d = (char*) dm; - if (p == NULL || *p == 0) - return; if (dmi_ident[slot]) return; - dmi_ident[slot] = alloc_bootmem(strlen(p) + 1); - if(dmi_ident[slot]) - strcpy(dmi_ident[slot], p); - else - printk(KERN_ERR "dmi_save_ident: out of memory.\n"); + p = dmi_string(dm, d[string]); + if (p == NULL) + return; + + dmi_ident[slot] = p; } /* -- cgit v1.2.3-70-g09d2 From ebad6a4230bdb5927495e28bc7837f515bf667a7 Mon Sep 17 00:00:00 2001 From: Andrey Panin Date: Tue, 6 Sep 2005 15:18:29 -0700 Subject: [PATCH] dmi: add onboard devices discovery This patch adds onboard devices and IPMI BMC discovery into DMI scan code. Drivers can use dmi_find_device() function to search for devices by type and name. 
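A sketch of the intended driver-side usage (the probe_bmc() helper is hypothetical, not part of this patch):

	struct dmi_device *dev = NULL;

	/* Walk every IPMI BMC that the DMI table advertised. */
	while ((dev = dmi_find_device(DMI_DEV_TYPE_IPMI, NULL, dev)) != NULL)
		probe_bmc(dev->device_data);	/* hypothetical driver helper */
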
Signed-off-by: Andrey Panin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/dmi_scan.c | 102 ++++++++++++++++++++++++++++++++++++++------ include/linux/dmi.h | 36 ++++++++++++++-- 2 files changed, 123 insertions(+), 15 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/dmi_scan.c b/arch/i386/kernel/dmi_scan.c index ae1a1aed2fc..c4a73855e38 100644 --- a/arch/i386/kernel/dmi_scan.c +++ b/arch/i386/kernel/dmi_scan.c @@ -6,13 +6,6 @@ #include -struct dmi_header { - u8 type; - u8 length; - u16 handle; -}; - - static char * __init dmi_string(struct dmi_header *dm, u8 s) { u8 *bp = ((u8 *) dm) + dm->length; @@ -88,6 +81,7 @@ static int __init dmi_checksum(u8 *buf) } static char *dmi_ident[DMI_STRING_MAX]; +static LIST_HEAD(dmi_devices); /* * Save a DMI string @@ -106,6 +100,58 @@ static void __init dmi_save_ident(struct dmi_header *dm, int slot, int string) dmi_ident[slot] = p; } +static void __init dmi_save_devices(struct dmi_header *dm) +{ + int i, count = (dm->length - sizeof(struct dmi_header)) / 2; + struct dmi_device *dev; + + for (i = 0; i < count; i++) { + char *d = ((char *) dm) + (i * 2); + + /* Skip disabled device */ + if ((*d & 0x80) == 0) + continue; + + dev = alloc_bootmem(sizeof(*dev)); + if (!dev) { + printk(KERN_ERR "dmi_save_devices: out of memory.\n"); + break; + } + + dev->type = *d++ & 0x7f; + dev->name = dmi_string(dm, *d); + dev->device_data = NULL; + + list_add(&dev->list, &dmi_devices); + } +} + +static void __init dmi_save_ipmi_device(struct dmi_header *dm) +{ + struct dmi_device *dev; + void * data; + + data = alloc_bootmem(dm->length); + if (data == NULL) { + printk(KERN_ERR "dmi_save_ipmi_device: out of memory.\n"); + return; + } + + memcpy(data, dm, dm->length); + + dev = alloc_bootmem(sizeof(*dev)); + if (!dev) { + printk(KERN_ERR "dmi_save_ipmi_device: out of memory.\n"); + return; + } + + dev->type = DMI_DEV_TYPE_IPMI; + dev->name = "IPMI controller"; + dev->device_data = data; + + list_add(&dev->list, &dmi_devices); +} + /* * Process a DMI table entry. Right now all we care about are the BIOS * and machine entries. For 2.5 we should pull the smbus controller info @@ -113,25 +159,28 @@ static void __init dmi_save_ident(struct dmi_header *dm, int slot, int string) */ static void __init dmi_decode(struct dmi_header *dm) { - u8 *data __attribute__((__unused__)) = (u8 *)dm; - switch(dm->type) { - case 0: + case 0: /* BIOS Information */ dmi_save_ident(dm, DMI_BIOS_VENDOR, 4); dmi_save_ident(dm, DMI_BIOS_VERSION, 5); dmi_save_ident(dm, DMI_BIOS_DATE, 8); break; - case 1: + case 1: /* System Information */ dmi_save_ident(dm, DMI_SYS_VENDOR, 4); dmi_save_ident(dm, DMI_PRODUCT_NAME, 5); dmi_save_ident(dm, DMI_PRODUCT_VERSION, 6); dmi_save_ident(dm, DMI_PRODUCT_SERIAL, 7); break; - case 2: + case 2: /* Base Board Information */ dmi_save_ident(dm, DMI_BOARD_VENDOR, 4); dmi_save_ident(dm, DMI_BOARD_NAME, 5); dmi_save_ident(dm, DMI_BOARD_VERSION, 6); break; + case 10: /* Onboard Devices Information */ + dmi_save_devices(dm); + break; + case 38: /* IPMI Device Information */ + dmi_save_ipmi_device(dm); } } @@ -221,3 +270,32 @@ char *dmi_get_system_info(int field) return dmi_ident[field]; } EXPORT_SYMBOL(dmi_get_system_info); + +/** + * dmi_find_device - find onboard device by type/name + * @type: device type or %DMI_DEV_TYPE_ANY to match all device types + * @desc: device name string or %NULL to match all + * @from: previous device found in search, or %NULL for new search. 
+ * + * Iterates through the list of known onboard devices. If a device is + * found with a matching @vendor and @device, a pointer to its device + * structure is returned. Otherwise, %NULL is returned. + * A new search is initiated by passing %NULL to the @from argument. + * If @from is not %NULL, searches continue from next device. + */ +struct dmi_device * dmi_find_device(int type, const char *name, + struct dmi_device *from) +{ + struct list_head *d, *head = from ? &from->list : &dmi_devices; + + for(d = head->next; d != &dmi_devices; d = d->next) { + struct dmi_device *dev = list_entry(d, struct dmi_device, list); + + if (((type == DMI_DEV_TYPE_ANY) || (dev->type == type)) && + ((name == NULL) || (strcmp(dev->name, name) == 0))) + return dev; + } + + return NULL; +} +EXPORT_SYMBOL(dmi_find_device); diff --git a/include/linux/dmi.h b/include/linux/dmi.h index 5e93e6dce9a..c30175e8dec 100644 --- a/include/linux/dmi.h +++ b/include/linux/dmi.h @@ -1,6 +1,8 @@ #ifndef __DMI_H__ #define __DMI_H__ +#include + enum dmi_field { DMI_NONE, DMI_BIOS_VENDOR, @@ -16,6 +18,24 @@ enum dmi_field { DMI_STRING_MAX, }; +enum dmi_device_type { + DMI_DEV_TYPE_ANY = 0, + DMI_DEV_TYPE_OTHER, + DMI_DEV_TYPE_UNKNOWN, + DMI_DEV_TYPE_VIDEO, + DMI_DEV_TYPE_SCSI, + DMI_DEV_TYPE_ETHERNET, + DMI_DEV_TYPE_TOKENRING, + DMI_DEV_TYPE_SOUND, + DMI_DEV_TYPE_IPMI = -1 +}; + +struct dmi_header { + u8 type; + u8 length; + u16 handle; +}; + /* * DMI callbacks for problem boards */ @@ -26,22 +46,32 @@ struct dmi_strmatch { struct dmi_system_id { int (*callback)(struct dmi_system_id *); - char *ident; + const char *ident; struct dmi_strmatch matches[4]; void *driver_data; }; -#define DMI_MATCH(a,b) { a, b } +#define DMI_MATCH(a, b) { a, b } + +struct dmi_device { + struct list_head list; + int type; + const char *name; + void *device_data; /* Type specific data */ +}; #if defined(CONFIG_X86) && !defined(CONFIG_X86_64) extern int dmi_check_system(struct dmi_system_id *list); extern char * dmi_get_system_info(int field); - +extern struct dmi_device * dmi_find_device(int type, const char *name, + struct dmi_device *from); #else static inline int dmi_check_system(struct dmi_system_id *list) { return 0; } static inline char * dmi_get_system_info(int field) { return NULL; } +static struct dmi_device * dmi_find_device(int type, const char *name, + struct dmi_device *from) { return NULL; } #endif -- cgit v1.2.3-70-g09d2 From 640e803376b9c4072f69fec42e304c974a631298 Mon Sep 17 00:00:00 2001 From: Robert Love Date: Tue, 6 Sep 2005 15:18:30 -0700 Subject: [PATCH] fix: dmi_check_system Background: 1) dmi_check_system() returns the count of the number of matches. Zero thus means no matches. 2) A match callback can return nonzero to stop the match checking. Bug: The count is incremented after we check for the nonzero return value, so it does not reflect the actual count. We could say this is intended, for some dumb reason, except that it means that a match on the first check returns zero--no matches--if the callback returns nonzero. Attached patch increments the count before calling the callback and thus before potentially short-circuiting.
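To make the corrected semantics concrete, a hypothetical caller (the table, callback and match strings are all made up for illustration):

	static int apply_quirk(struct dmi_system_id *d)
	{
		printk(KERN_INFO "enabling quirk for %s\n", d->ident);
		return 1;	/* nonzero stops the table scan */
	}

	static struct dmi_system_id quirk_table[] = {
		{ apply_quirk, "Example Board", { DMI_MATCH(DMI_BOARD_NAME, "Example") } },
		{ NULL, }
	};

	int matched = dmi_check_system(quirk_table);
	/* With the fix, a match on the first entry yields matched == 1 even
	 * though apply_quirk() short-circuits the scan; before, it was 0. */
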
Signed-off-by: Robert Love Cc: Andrey Panin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/dmi_scan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/dmi_scan.c b/arch/i386/kernel/dmi_scan.c index c4a73855e38..58516e2ac17 100644 --- a/arch/i386/kernel/dmi_scan.c +++ b/arch/i386/kernel/dmi_scan.c @@ -248,9 +248,9 @@ int dmi_check_system(struct dmi_system_id *list) /* No match */ goto fail; } + count++; if (d->callback && d->callback(d)) break; - count++; fail: d++; } -- cgit v1.2.3-70-g09d2 From 3d97ae5b958855ac007b6f56a0f94ab8ade09e9e Mon Sep 17 00:00:00 2001 From: Prasanna S Panchamukhi Date: Tue, 6 Sep 2005 15:19:27 -0700 Subject: [PATCH] kprobes: prevent possible race conditions i386 changes This patch contains the i386 architecture specific changes to prevent the possible race conditions. Signed-off-by: Prasanna S Panchamukhi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/entry.S | 13 ++++++++----- arch/i386/kernel/kprobes.c | 29 +++++++++++++++-------------- arch/i386/kernel/traps.c | 12 +++++++----- arch/i386/kernel/vmlinux.lds.S | 1 + arch/i386/mm/fault.c | 4 +++- 5 files changed, 34 insertions(+), 25 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index abb909793ef..3aad0383966 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -507,7 +507,7 @@ label: \ pushl $__KERNEL_CS; \ pushl $sysenter_past_esp -ENTRY(debug) +KPROBE_ENTRY(debug) cmpl $sysenter_entry,(%esp) jne debug_stack_correct FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) @@ -518,7 +518,7 @@ debug_stack_correct: movl %esp,%eax # pt_regs pointer call do_debug jmp ret_from_exception - + .previous .text /* * NMI is doubly nasty. 
It can happen _while_ we're handling * a debug fault, and the debug fault hasn't yet been able to @@ -591,13 +591,14 @@ nmi_16bit_stack: .long 1b,iret_exc .previous -ENTRY(int3) +KPROBE_ENTRY(int3) pushl $-1 # mark this as an int SAVE_ALL xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_int3 jmp ret_from_exception + .previous .text ENTRY(overflow) pushl $0 @@ -631,17 +632,19 @@ ENTRY(stack_segment) pushl $do_stack_segment jmp error_code -ENTRY(general_protection) +KPROBE_ENTRY(general_protection) pushl $do_general_protection jmp error_code + .previous .text ENTRY(alignment_check) pushl $do_alignment_check jmp error_code -ENTRY(page_fault) +KPROBE_ENTRY(page_fault) pushl $do_page_fault jmp error_code + .previous .text #ifdef CONFIG_X86_MCE ENTRY(machine_check) diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c index a6d8c45961d..7fb5a6f4a56 100644 --- a/arch/i386/kernel/kprobes.c +++ b/arch/i386/kernel/kprobes.c @@ -62,32 +62,32 @@ static inline int is_IF_modifier(kprobe_opcode_t opcode) return 0; } -int arch_prepare_kprobe(struct kprobe *p) +int __kprobes arch_prepare_kprobe(struct kprobe *p) { return 0; } -void arch_copy_kprobe(struct kprobe *p) +void __kprobes arch_copy_kprobe(struct kprobe *p) { memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); p->opcode = *p->addr; } -void arch_arm_kprobe(struct kprobe *p) +void __kprobes arch_arm_kprobe(struct kprobe *p) { *p->addr = BREAKPOINT_INSTRUCTION; flush_icache_range((unsigned long) p->addr, (unsigned long) p->addr + sizeof(kprobe_opcode_t)); } -void arch_disarm_kprobe(struct kprobe *p) +void __kprobes arch_disarm_kprobe(struct kprobe *p) { *p->addr = p->opcode; flush_icache_range((unsigned long) p->addr, (unsigned long) p->addr + sizeof(kprobe_opcode_t)); } -void arch_remove_kprobe(struct kprobe *p) +void __kprobes arch_remove_kprobe(struct kprobe *p) { } @@ -127,7 +127,8 @@ static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) regs->eip = (unsigned long)&p->ainsn.insn; } -void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) +void __kprobes arch_prepare_kretprobe(struct kretprobe *rp, + struct pt_regs *regs) { unsigned long *sara = (unsigned long *)®s->esp; struct kretprobe_instance *ri; @@ -150,7 +151,7 @@ void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) * Interrupts are disabled on entry as trap3 is an interrupt gate and they * remain disabled thorough out this function. */ -static int kprobe_handler(struct pt_regs *regs) +static int __kprobes kprobe_handler(struct pt_regs *regs) { struct kprobe *p; int ret = 0; @@ -259,7 +260,7 @@ no_kprobe: /* * Called when we hit the probe point at kretprobe_trampoline */ -int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) +int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) { struct kretprobe_instance *ri = NULL; struct hlist_head *head; @@ -338,7 +339,7 @@ int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) * that is atop the stack is the address following the copied instruction. * We need to make it the address following the original instruction. 
*/ -static void resume_execution(struct kprobe *p, struct pt_regs *regs) +static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs) { unsigned long *tos = (unsigned long *)®s->esp; unsigned long next_eip = 0; @@ -444,8 +445,8 @@ static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr) /* * Wrapper routine to for handling exceptions. */ -int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, - void *data) +int __kprobes kprobe_exceptions_notify(struct notifier_block *self, + unsigned long val, void *data) { struct die_args *args = (struct die_args *)data; switch (val) { @@ -473,7 +474,7 @@ int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, return NOTIFY_DONE; } -int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) +int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) { struct jprobe *jp = container_of(p, struct jprobe, kp); unsigned long addr; @@ -495,7 +496,7 @@ int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) return 1; } -void jprobe_return(void) +void __kprobes jprobe_return(void) { preempt_enable_no_resched(); asm volatile (" xchgl %%ebx,%%esp \n" @@ -506,7 +507,7 @@ void jprobe_return(void) (jprobe_saved_esp):"memory"); } -int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) +int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) { u8 *addr = (u8 *) (regs->eip - 1); unsigned long stack_addr = (unsigned long)jprobe_saved_esp; diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 029bf94cda7..09a58cb6daa 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -363,8 +363,9 @@ static inline void die_if_kernel(const char * str, struct pt_regs * regs, long e die(str, regs, err); } -static void do_trap(int trapnr, int signr, char *str, int vm86, - struct pt_regs * regs, long error_code, siginfo_t *info) +static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86, + struct pt_regs * regs, long error_code, + siginfo_t *info) { struct task_struct *tsk = current; tsk->thread.error_code = error_code; @@ -460,7 +461,8 @@ DO_ERROR(12, SIGBUS, "stack segment", stack_segment) DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0) -fastcall void do_general_protection(struct pt_regs * regs, long error_code) +fastcall void __kprobes do_general_protection(struct pt_regs * regs, + long error_code) { int cpu = get_cpu(); struct tss_struct *tss = &per_cpu(init_tss, cpu); @@ -676,7 +678,7 @@ void unset_nmi_callback(void) EXPORT_SYMBOL_GPL(unset_nmi_callback); #ifdef CONFIG_KPROBES -fastcall void do_int3(struct pt_regs *regs, long error_code) +fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code) { if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) @@ -710,7 +712,7 @@ fastcall void do_int3(struct pt_regs *regs, long error_code) * find every occurrence of the TF bit that could be saved away even * by user code) */ -fastcall void do_debug(struct pt_regs * regs, long error_code) +fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code) { unsigned int condition; struct task_struct *tsk = current; diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 761972f8cb6..13b9c62cbbb 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -22,6 +22,7 @@ SECTIONS *(.text) SCHED_TEXT LOCK_TEXT + KPROBES_TEXT 
*(.fixup) *(.gnu.warning) } = 0x9090 diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index 411b8500ad1..9edd4485b91 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c @@ -21,6 +21,7 @@ #include <linux/vt_kern.h> /* For unblank_screen() */ #include #include +#include <linux/kprobes.h> #include #include @@ -223,7 +224,8 @@ fastcall void do_invalid_op(struct pt_regs *, unsigned long); * bit 1 == 0 means read, 1 means write * bit 2 == 0 means kernel, 1 means user-mode */ -fastcall void do_page_fault(struct pt_regs *regs, unsigned long error_code) +fastcall void __kprobes do_page_fault(struct pt_regs *regs, + unsigned long error_code) { struct task_struct *tsk; struct mm_struct *mm; -- cgit v1.2.3-70-g09d2
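Everything this patch touches follows one pattern: functions on the probe-handling path are tagged __kprobes (and the relevant entry-point stubs become KPROBE_ENTRY), which collects them into the KPROBES_TEXT region declared in vmlinux.lds.S. With the handlers gathered into one contiguous range, the registration path can refuse to plant a probe inside the probe machinery itself with a plain address-range test. A user-space sketch of that range test, using the GNU ld __start_/__stop_ section-symbol convention rather than the kernel's linker script (the section name and function names below are made up for illustration):

/* Sketch: collect "handler" code into one section, then refuse to
 * "probe" addresses inside it. Builds with gcc on Linux; GNU ld
 * auto-defines __start_<sec>/__stop_<sec> for identifier-named
 * sections. The function-pointer casts are a GNU extension. */
#include <stdbool.h>
#include <stdio.h>

#define __fake_kprobes __attribute__((section("kprobes_text"), noinline))

extern const char __start_kprobes_text[], __stop_kprobes_text[];

static __fake_kprobes void handler_like(void)
{
	/* stands in for kprobe_handler(), do_int3(), ... */
}

static void ordinary(void)
{
	/* stands in for any normal kernel function */
}

static bool probe_refused(const char *addr)
{
	return addr >= __start_kprobes_text && addr < __stop_kprobes_text;
}

int main(void)
{
	printf("handler_like: %s\n",
	       probe_refused((const char *)handler_like) ? "refused" : "allowed");
	printf("ordinary:     %s\n",
	       probe_refused((const char *)ordinary) ? "refused" : "allowed");
	return 0;
}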
From bce0649417d6e71f6df8ab7b11103d247913b142 Mon Sep 17 00:00:00 2001 From: Jim Keniston Date: Tue, 6 Sep 2005 15:19:34 -0700 Subject: [PATCH] kprobes: fix handling of simultaneous probe hit/unregister This patch fixes a bug in kprobes's handling of a corner case on i386 and x86_64. On an SMP system, if one CPU unregisters a kprobe just after another CPU hits that probepoint, kprobe_handler() on the latter CPU sees that the kprobe has been unregistered, and attempts to let the CPU continue as if the probepoint hadn't been hit. The bug is that on i386 and x86_64, we were neglecting to set the IP back to the beginning of the probed instruction. This could cause an oops or crash. This bug doesn't exist on ppc64 and ia64, where a breakpoint instruction leaves the IP pointing to the beginning of the instruction. I don't know about sparc64. (Dave, could you please advise?) This fix has been tested on i386 and x86_64 SMP systems. To reproduce the problem, set one CPU to work registering and unregistering a kprobe repeatedly, and another CPU pounding the probepoint in a tight loop. Acked-by: Prasanna S Panchamukhi Signed-off-by: Jim Keniston Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/kprobes.c | 3 +++ arch/x86_64/kernel/kprobes.c | 3 +++ 2 files changed, 6 insertions(+) (limited to 'arch/i386') diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c index 7fb5a6f4a56..e5cec32018a 100644 --- a/arch/i386/kernel/kprobes.c +++ b/arch/i386/kernel/kprobes.c @@ -221,7 +221,10 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) * either a probepoint or a debugger breakpoint * at this address. In either case, no further * handling of this interrupt is appropriate. + * Back up over the (now missing) int3 and run + * the original instruction. */ + regs->eip -= sizeof(kprobe_opcode_t); ret = 1; } /* Not one of ours: let kernel handle it */ diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c index c21cceaea27..2d7658fbbb2 100644 --- a/arch/x86_64/kernel/kprobes.c +++ b/arch/x86_64/kernel/kprobes.c @@ -361,7 +361,10 @@ int __kprobes kprobe_handler(struct pt_regs *regs) * either a probepoint or a debugger breakpoint * at this address. In either case, no further * handling of this interrupt is appropriate. + * Back up over the (now missing) int3 and run + * the original instruction. */ + regs->rip = (unsigned long)addr; ret = 1; } /* Not one of ours: let kernel handle it */ -- cgit v1.2.3-70-g09d2
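The fix is one line per architecture because of how int3 reports: it is a trap, so by the time kprobe_handler() runs, the saved IP already points one byte past the breakpoint. When the handler discovers the probe has just been unregistered, the original opcode is back in place, and the only safe continuation is to rewind the IP and re-execute it. A compilable user-space sketch of that bookkeeping (fake_pt_regs and decline_breakpoint are illustrative stand-ins, not kernel structures):

#include <assert.h>
#include <stdio.h>

typedef unsigned char kprobe_opcode_t;	/* int3 is a single byte on x86 */

struct fake_pt_regs {
	unsigned long eip;		/* IP saved by the trap */
};

/* What the fixed handler does when the breakpoint turns out not to be
 * ours: back up over the (now missing) int3 so the CPU re-executes the
 * restored original instruction. */
static void decline_breakpoint(struct fake_pt_regs *regs)
{
	regs->eip -= sizeof(kprobe_opcode_t);
}

int main(void)
{
	unsigned long probed_insn = 0x1000;		/* pretend probe address */
	struct fake_pt_regs regs = { probed_insn + 1 };	/* past the int3 */

	decline_breakpoint(&regs);
	assert(regs.eip == probed_insn);
	printf("resume at %#lx, the original instruction\n", regs.eip);
	return 0;
}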
From deac66ae454cacf942c051b86d9232af546fb187 Mon Sep 17 00:00:00 2001 From: Keshavamurthy Anil S Date: Tue, 6 Sep 2005 15:19:35 -0700 Subject: [PATCH] kprobes: fix bug when probed on task and isr functions This patch fixes a race condition in which the system used to hang, or sometimes crash, within minutes when kprobes were inserted on an ISR routine and a task routine. The fix has been stress tested on i386, ia64, ppc64 and on x86_64. To reproduce the problem, insert kprobes on the schedule() and do_IRQ() functions; you should see a hang or a system crash. Signed-off-by: Anil S Keshavamurthy Signed-off-by: Ananth N Mavinakayanahalli Acked-by: Prasanna S Panchamukhi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/kprobes.c | 3 ++- arch/ia64/kernel/kprobes.c | 22 +++++++++++++++++++--- arch/ppc64/kernel/kprobes.c | 11 ++++++----- arch/x86_64/kernel/kprobes.c | 3 ++- include/asm-ia64/kprobes.h | 1 + include/asm-ppc64/kprobes.h | 3 +++ kernel/kprobes.c | 22 ++++++++++++++++++++++ 7 files changed, 55 insertions(+), 10 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c index e5cec32018a..6345b430b10 100644 --- a/arch/i386/kernel/kprobes.c +++ b/arch/i386/kernel/kprobes.c @@ -177,7 +177,8 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) Disarm the probe we just hit, and ignore it. */ p = get_kprobe(addr); if (p) { - if (kprobe_status == KPROBE_HIT_SS) { + if (kprobe_status == KPROBE_HIT_SS && + *p->ainsn.insn == BREAKPOINT_INSTRUCTION) { regs->eflags &= ~TF_MASK; regs->eflags |= kprobe_saved_eflags; unlock_kprobes(); diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index 4b1bd539ec4..471086b808a 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -95,6 +95,17 @@ static void __kprobes update_kprobe_inst_flag(uint template, uint slot, p->ainsn.inst_flag = 0; p->ainsn.target_br_reg = 0; + /* Check for Break instruction * Bits 37:40 Major opcode to be zero * Bits 27:32 X6 to be zero * Bits 32:35 X3 to be zero */ + if ((!major_opcode) && (!((kprobe_inst >> 27) & 0x1FF)) ) { + /* is a break instruction */ + p->ainsn.inst_flag |= INST_FLAG_BREAK_INST; + return; + } + if (bundle_encoding[template][slot] == B) { switch (major_opcode) { case INDIRECT_CALL_OPCODE: @@ -542,8 +553,11 @@ static void __kprobes prepare_ss(struct kprobe *p, struct pt_regs *regs) unsigned long bundle_addr = (unsigned long) &p->opcode.bundle; unsigned long slot = (unsigned long)p->addr & 0xf; - /* Update instruction pointer (IIP) and slot number (IPSR.ri) */ - regs->cr_iip = bundle_addr & ~0xFULL; + /* single step inline if break instruction */ + if (p->ainsn.inst_flag == INST_FLAG_BREAK_INST) + regs->cr_iip = (unsigned long)p->addr & ~0xFULL; + else + regs->cr_iip = bundle_addr & ~0xFULL; if (slot > 2) slot = 0; @@ -599,7 +613,9 @@ static int __kprobes pre_kprobes_handler(struct die_args *args) if (kprobe_running()) { p = get_kprobe(addr); if (p) { - if (kprobe_status == KPROBE_HIT_SS) { + if ( (kprobe_status == KPROBE_HIT_SS) && + (p->ainsn.inst_flag == INST_FLAG_BREAK_INST)) { + ia64_psr(regs)->ss = 0; unlock_kprobes(); goto no_kprobe; } diff --git a/arch/ppc64/kernel/kprobes.c b/arch/ppc64/kernel/kprobes.c index 591e4b67b5a..7e80d49c589 100644 --- a/arch/ppc64/kernel/kprobes.c +++ b/arch/ppc64/kernel/kprobes.c @@ -102,7 +102,7 @@ static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) regs->msr |= MSR_SE; /* single step inline if it is a trap variant */ - if (IS_TW(insn) || IS_TD(insn) || IS_TWI(insn) || IS_TDI(insn)) + if (is_trap(insn)) regs->nip = (unsigned long)p->addr; else regs->nip = (unsigned long)p->ainsn.insn; @@ -152,7 +152,9 @@ static inline int kprobe_handler(struct pt_regs *regs) Disarm the probe we just hit, and ignore it. */ p = get_kprobe(addr); if (p) { - if (kprobe_status == KPROBE_HIT_SS) { + kprobe_opcode_t insn = *p->ainsn.insn; + if (kprobe_status == KPROBE_HIT_SS && + is_trap(insn)) { regs->msr &= ~MSR_SE; regs->msr |= kprobe_saved_msr; unlock_kprobes(); @@ -192,8 +194,7 @@ static inline int kprobe_handler(struct pt_regs *regs) * trap variant, it could belong to someone else */ kprobe_opcode_t cur_insn = *addr; - if (IS_TW(cur_insn) || IS_TD(cur_insn) || - IS_TWI(cur_insn) || IS_TDI(cur_insn)) + if (is_trap(cur_insn)) goto no_kprobe; /* * The breakpoint instruction was removed right @@ -403,7 +404,7 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, default: break; } - preempt_enable(); + preempt_enable_no_resched(); return ret; } diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c index 2d7658fbbb2..df08c43276a 100644 --- a/arch/x86_64/kernel/kprobes.c +++ b/arch/x86_64/kernel/kprobes.c @@ -311,7 +311,8 @@ int __kprobes kprobe_handler(struct pt_regs *regs) Disarm the probe we just hit, and ignore it. */ p = get_kprobe(addr); if (p) { - if (kprobe_status == KPROBE_HIT_SS) { + if (kprobe_status == KPROBE_HIT_SS && + *p->ainsn.insn == BREAKPOINT_INSTRUCTION) { regs->eflags &= ~TF_MASK; regs->eflags |= kprobe_saved_rflags; unlock_kprobes(); diff --git a/include/asm-ia64/kprobes.h b/include/asm-ia64/kprobes.h index bf36a32e37e..573a3574a24 100644 --- a/include/asm-ia64/kprobes.h +++ b/include/asm-ia64/kprobes.h @@ -92,6 +92,7 @@ struct arch_specific_insn { kprobe_opcode_t insn; #define INST_FLAG_FIX_RELATIVE_IP_ADDR 1 #define INST_FLAG_FIX_BRANCH_REG 2 + #define INST_FLAG_BREAK_INST 4 unsigned long inst_flag; unsigned short target_br_reg; }; diff --git a/include/asm-ppc64/kprobes.h b/include/asm-ppc64/kprobes.h index 0802919c323..d9129d2b038 100644 --- a/include/asm-ppc64/kprobes.h +++ b/include/asm-ppc64/kprobes.h @@ -42,6 +42,9 @@ typedef unsigned int kprobe_opcode_t; #define JPROBE_ENTRY(pentry) (kprobe_opcode_t *)((func_descr_t *)pentry) +#define is_trap(instr) (IS_TW(instr) || IS_TD(instr) || \ + IS_TWI(instr) || IS_TDI(instr)) + #define ARCH_SUPPORTS_KRETPROBES void kretprobe_trampoline(void); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 3b7653f2e7a..f3ea492ab44 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -155,14 +155,36 @@ void __kprobes free_insn_slot(kprobe_opcode_t *slot) /* Locks kprobe: irqs must be disabled */ void __kprobes lock_kprobes(void) { + unsigned long flags = 0; + + /* Avoiding local interrupts to happen right after we take the kprobe_lock + * and before we get a chance to update kprobe_cpu, this to prevent + * deadlock when we have a kprobe on ISR routine and a kprobe on task + * routine + */ + local_irq_save(flags); + spin_lock(&kprobe_lock); kprobe_cpu = smp_processor_id(); + + local_irq_restore(flags); } void __kprobes unlock_kprobes(void) { + unsigned long flags = 0; + + /* Avoiding local interrupts to happen right after we update + * kprobe_cpu and before we get a chance to release kprobe_lock, + * this to prevent deadlock when we have a kprobe on ISR routine and + * a kprobe on task routine + */ + local_irq_save(flags); +
kprobe_cpu = NR_CPUS; spin_unlock(&kprobe_lock); + + local_irq_restore(flags); } /* You have to be holding the kprobe_lock */ -- cgit v1.2.3-70-g09d2
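The hang this closes is CPU-local: a task-level probe calls lock_kprobes() and takes kprobe_lock, an interrupt arrives in the window before kprobe_cpu records the owner, and a probe on the ISR path re-enters the kprobe core on the same CPU, which then spins forever on a lock its own CPU holds. Masking local interrupts makes the lock acquisition and the ownership update atomic with respect to local ISRs, and likewise on the release path. A self-contained sketch of the pattern, with user-space stand-ins for the kernel primitives (every stub below is illustrative, not the real implementation):

#include <stdbool.h>
#include <stdio.h>

/* User-space stand-ins for the kernel primitives; none of these are
 * the real implementations. */
static bool irqs_on = true;	/* models the local CPU's interrupt flag */
static int  kprobe_lock;	/* models the global spinlock (1 = held) */
static int  kprobe_cpu = -1;	/* -1 plays the role of NR_CPUS (no owner) */

static void local_irq_save(bool *flags)   { *flags = irqs_on; irqs_on = false; }
static void local_irq_restore(bool flags) { irqs_on = flags; }
static void spin_lock(int *lock)          { *lock = 1; /* would spin on SMP */ }
static void spin_unlock(int *lock)        { *lock = 0; }

/* Mirrors the patched lock_kprobes(): with interrupts masked, no ISR can
 * fire between taking the lock and recording its owner, so an ISR probe
 * can never see "lock held, owner unknown" and deadlock on its own CPU. */
static void lock_kprobes(void)
{
	bool flags;

	local_irq_save(&flags);
	spin_lock(&kprobe_lock);
	kprobe_cpu = 0;		/* smp_processor_id() in the kernel */
	local_irq_restore(flags);
}

static void unlock_kprobes(void)
{
	bool flags;

	local_irq_save(&flags);
	kprobe_cpu = -1;	/* clear ownership before dropping the lock */
	spin_unlock(&kprobe_lock);
	local_irq_restore(flags);
}

int main(void)
{
	lock_kprobes();
	printf("held: lock=%d owner=%d irqs_on=%d\n", kprobe_lock, kprobe_cpu, irqs_on);
	unlock_kprobes();
	printf("free: lock=%d owner=%d irqs_on=%d\n", kprobe_lock, kprobe_cpu, irqs_on);
	return 0;
}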
From a08b6b7968e7a6afc75e365ac31830867275abdc Mon Sep 17 00:00:00 2001 From: "viro@ZenIV.linux.org.uk" Date: Tue, 6 Sep 2005 01:48:42 +0100 Subject: [PATCH] Kconfig fix (BLK_DEV_FD dependencies) Sanitized and fixed floppy dependencies: split the messy dependencies for BLK_DEV_FD by introducing a new symbol (ARCH_MAY_HAVE_PC_FDC), making BLK_DEV_FD depend on that one and taking declarations of ARCH_MAY_HAVE_PC_FDC to arch/*/Kconfig. While we are at it, fixed several obvious cases where BLK_DEV_FD should have been excluded (architectures lacking asm/floppy.h are *not* going to have floppy.c compile, let alone work). If you can come up with a better name for that ("this architecture might have working PC-compatible floppy disk controller"), you are more than welcome - just s/ARCH_MAY_HAVE_PC_FDC/your_prefered_name/g in the patch below... Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- arch/alpha/Kconfig | 3 +++ arch/arm/Kconfig | 4 ++++ arch/arm/mach-footbridge/Kconfig | 1 + arch/arm26/Kconfig | 4 ++++ arch/i386/Kconfig | 4 ++++ arch/m68k/Kconfig | 5 +++++ arch/mips/Kconfig | 5 +++++ arch/parisc/Kconfig | 4 ++++ arch/ppc/Kconfig | 4 ++++ arch/ppc64/Kconfig | 4 ++++ arch/sh/Kconfig | 4 ++++ arch/sparc/Kconfig | 4 ++++ arch/sparc64/Kconfig | 4 ++++ arch/x86_64/Kconfig | 4 ++++ drivers/block/Kconfig | 2 +- 15 files changed, 55 insertions(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 189d5eababa..786491f9ceb 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -479,6 +479,9 @@ config EISA depends on ALPHA_GENERIC || ALPHA_JENSEN || ALPHA_ALCOR || ALPHA_MIKASA || ALPHA_SABLE || ALPHA_LYNX || ALPHA_NORITAKE || ALPHA_RAWHIDE default y +config ARCH_MAY_HAVE_PC_FDC + def_bool y + config SMP bool "Symmetric multi-processing support" depends on ALPHA_SABLE || ALPHA_LYNX || ALPHA_RAWHIDE || ALPHA_DP264 || ALPHA_WILDFIRE || ALPHA_TITAN || ALPHA_GENERIC || ALPHA_SHARK || ALPHA_MARVEL diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 68dfdba71d7..0f2899b4159 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -64,6 +64,9 @@ config GENERIC_CALIBRATE_DELAY config GENERIC_BUST_SPINLOCK bool +config ARCH_MAY_HAVE_PC_FDC + bool + config GENERIC_ISA_DMA bool @@ -150,6 +153,7 @@ config ARCH_RPC select ARCH_ACORN select FIQ select TIMER_ACORN + select ARCH_MAY_HAVE_PC_FDC help On the Acorn Risc-PC, Linux can support the internal IDE disk and CD-ROM interface, serial and parallel port, and the floppy drive. diff --git a/arch/arm/mach-footbridge/Kconfig b/arch/arm/mach-footbridge/Kconfig index 324d9edeec3..bdd257921cf 100644 --- a/arch/arm/mach-footbridge/Kconfig +++ b/arch/arm/mach-footbridge/Kconfig @@ -87,6 +87,7 @@ config FOOTBRIDGE_ADDIN # EBSA285 board in either host or addin mode config ARCH_EBSA285 + select ARCH_MAY_HAVE_PC_FDC bool endif diff --git a/arch/arm26/Kconfig b/arch/arm26/Kconfig index 1f037326730..1f00b3d03a0 100644 --- a/arch/arm26/Kconfig +++ b/arch/arm26/Kconfig @@ -55,6 +55,10 @@ config GENERIC_BUST_SPINLOCK config GENERIC_ISA_DMA bool +config ARCH_MAY_HAVE_PC_FDC + bool + default y + source "init/Kconfig" diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 4b7de3e1e57..5d51b38bd70 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -37,6 +37,10 @@ config GENERIC_IOMAP bool default y +config ARCH_MAY_HAVE_PC_FDC + bool + default y + source "init/Kconfig" menu "Processor type and features" diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 178c4a3fbb7..ba960bbc8e6 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -25,6 +25,11 @@ config GENERIC_CALIBRATE_DELAY bool default y +config ARCH_MAY_HAVE_PC_FDC + bool + depends on Q40 || (BROKEN && SUN3X) + default y + mainmenu "Linux/68k Kernel Configuration" source "init/Kconfig" diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index d79fba0aa8b..8d76eb1ff29 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -4,6 +4,11 @@ config MIPS # Horrible source of confusion. Die, die, die ... select EMBEDDED +# shouldn't it be per-subarchitecture? +config ARCH_MAY_HAVE_PC_FDC + bool + default y + mainmenu "Linux/MIPS Kernel Configuration" source "init/Kconfig" diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 1c2d8743523..0b07922a2ac 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -49,6 +49,10 @@ config ISA_DMA_API bool default y +config ARCH_MAY_HAVE_PC_FDC + bool + default y + source "init/Kconfig" diff --git a/arch/ppc/Kconfig b/arch/ppc/Kconfig index 36dee0ff5ca..6ab7e5ea5fc 100644 --- a/arch/ppc/Kconfig +++ b/arch/ppc/Kconfig @@ -47,6 +47,10 @@ config SCHED_NO_NO_OMIT_FRAME_POINTER bool default y +config ARCH_MAY_HAVE_PC_FDC + bool + default y + source "init/Kconfig" menu "Processor" diff --git a/arch/ppc64/Kconfig b/arch/ppc64/Kconfig index 13b262f1021..deca68ad644 100644 --- a/arch/ppc64/Kconfig +++ b/arch/ppc64/Kconfig @@ -44,6 +44,10 @@ config SCHED_NO_NO_OMIT_FRAME_POINTER bool default y +config ARCH_MAY_HAVE_PC_FDC + bool + default y + # We optimistically allocate largepages from the VM, so make the limit # large enough (16MB).
This badly named config option is actually # max order + 1 diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index adc8109f8b7..3e804c736e6 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -37,6 +37,10 @@ config GENERIC_CALIBRATE_DELAY bool default y +config ARCH_MAY_HAVE_PC_FDC + bool + default y + source "init/Kconfig" menu "System type" diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index aca028aa29b..aba05394d30 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -211,6 +211,10 @@ config GENERIC_CALIBRATE_DELAY bool default y +config ARCH_MAY_HAVE_PC_FDC + bool + default y + config SUN_PM bool default y diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig index 73ec6aec5ed..1e9d8638a28 100644 --- a/arch/sparc64/Kconfig +++ b/arch/sparc64/Kconfig @@ -26,6 +26,10 @@ config TIME_INTERPOLATION bool default y +config ARCH_MAY_HAVE_PC_FDC + bool + default y + choice prompt "Kernel page size" default SPARC64_PAGE_SIZE_8KB diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 251ce7cf1a3..8f868b67ef0 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -65,6 +65,10 @@ config GENERIC_IOMAP bool default y +config ARCH_MAY_HAVE_PC_FDC + bool + default y + source "init/Kconfig" diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 6b736364cc5..51b0af1cebe 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -6,7 +6,7 @@ menu "Block devices" config BLK_DEV_FD tristate "Normal floppy disk support" - depends on (!ARCH_S390 && !M68K && !IA64 && !UML && !ARM) || Q40 || (SUN3X && BROKEN) || ARCH_RPC || ARCH_EBSA285 + depends on ARCH_MAY_HAVE_PC_FDC ---help--- If you want to use the floppy disk drive(s) of your PC under Linux, say Y. Information about this driver, especially important for IBM -- cgit v1.2.3-70-g09d2
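The new symbol replaces the old depends expression — (!ARCH_S390 && !M68K && !IA64 && !UML && !ARM) || Q40 || (SUN3X && BROKEN) || ARCH_RPC || ARCH_EBSA285 — with a per-architecture opt-in: an architecture either declares ARCH_MAY_HAVE_PC_FDC (defaulting it on, or selecting it from the right sub-machine) or stays silent and never sees BLK_DEV_FD. That equivalence can be spot-checked mechanically; the table-driven sketch below does so for a few representative configurations (the struct and sample table are illustrative only — Kconfig evaluates the real expressions itself):

#include <stdbool.h>
#include <stdio.h>

struct cfg {
	const char *name;
	/* inputs to the old BLK_DEV_FD depends expression */
	bool s390, m68k, ia64, uml, arm;
	bool q40, sun3x, broken, arch_rpc, arch_ebsa285;
	/* new scheme: does ARCH_MAY_HAVE_PC_FDC end up set? */
	bool may_have_fdc;
};

/* The pre-patch rule, transcribed from drivers/block/Kconfig. */
static bool old_rule(const struct cfg *c)
{
	return (!c->s390 && !c->m68k && !c->ia64 && !c->uml && !c->arm) ||
	       c->q40 || (c->sun3x && c->broken) ||
	       c->arch_rpc || c->arch_ebsa285;
}

int main(void)
{
	const struct cfg table[] = {
		/* i386: not excluded before; 'default y' now            */
		{ "i386",     false, false, false, false, false,
		              false, false, false, false, false, true  },
		/* generic ARM: excluded before; nothing selects it now  */
		{ "arm",      false, false, false, false, true,
		              false, false, false, false, false, false },
		/* Acorn RPC: ARCH_RPC now selects the symbol            */
		{ "arm-rpc",  false, false, false, false, true,
		              false, false, false, true,  false, true  },
		/* m68k/Q40: the one m68k machine that keeps the driver  */
		{ "m68k-q40", false, true,  false, false, false,
		              true,  false, false, false, false, true  },
	};

	for (unsigned i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
		const struct cfg *c = &table[i];
		printf("%-8s old=%d new=%d %s\n", c->name,
		       old_rule(c), c->may_have_fdc,
		       old_rule(c) == c->may_have_fdc ? "ok" : "MISMATCH");
	}
	return 0;
}

Architectures the old expression excluded simply never define the symbol, so the driver prompt disappears there without drivers/block/Kconfig carrying a per-architecture list.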