diff options
Diffstat (limited to 'arch/i386')
51 files changed, 532 insertions, 519 deletions
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 619d843ba23..3b3b017e1c1 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -14,6 +14,10 @@ config X86 486, 586, Pentiums, and various instruction-set-compatible chips by AMD, Cyrix, and others. +config SEMAPHORE_SLEEPERS + bool + default y + config MMU bool default y @@ -754,6 +758,7 @@ config NUMA depends on SMP && HIGHMEM64G && (X86_NUMAQ || X86_GENERICARCH || (X86_SUMMIT && ACPI)) default n if X86_PC default y if (X86_NUMAQ || X86_SUMMIT) + select SPARSEMEM_STATIC # Need comments to help the hapless user trying to turn on NUMA support comment "NUMA (NUMA-Q) requires SMP, 64GB highmem support" diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 4cc83b322b3..64682a0edac 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -7,7 +7,7 @@ extra-y := head.o init_task.o vmlinux.lds obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \ pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \ - doublefault.o quirks.o + doublefault.o quirks.o i8237.o obj-y += cpu/ obj-y += timers/ diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index b7808a89d94..34ee500c26e 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -833,6 +833,9 @@ acpi_process_madt(void) if (!error) { acpi_lapic = 1; +#ifdef CONFIG_X86_GENERICARCH + generic_bigsmp_probe(); +#endif /* * Parse MADT IO-APIC entries */ diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 4553ffd94b1..46ce9b248f5 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -613,8 +613,8 @@ void __devinit cpu_init(void) memcpy(thread->tls_array, &per_cpu(cpu_gdt_table, cpu), GDT_ENTRY_TLS_ENTRIES * 8); - __asm__ __volatile__("lgdt %0" : : "m" (cpu_gdt_descr[cpu])); - __asm__ __volatile__("lidt %0" : : "m" (idt_descr)); + load_gdt(&cpu_gdt_descr[cpu]); + load_idt(&idt_descr); /* * Delete NT @@ -642,12 +642,12 @@ void __devinit cpu_init(void) asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs"); /* Clear all 6 debug registers: */ - -#define CD(register) set_debugreg(0, register) - - CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7); - -#undef CD + set_debugreg(0, 0); + set_debugreg(0, 1); + set_debugreg(0, 2); + set_debugreg(0, 3); + set_debugreg(0, 6); + set_debugreg(0, 7); /* * Force FPU initialization: diff --git a/arch/i386/kernel/cpu/cpufreq/longhaul.c b/arch/i386/kernel/cpu/cpufreq/longhaul.c index 04e3563da4f..bf02b5026e6 100644 --- a/arch/i386/kernel/cpu/cpufreq/longhaul.c +++ b/arch/i386/kernel/cpu/cpufreq/longhaul.c @@ -64,8 +64,6 @@ static int dont_scale_voltage; #define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "longhaul", msg) -#define __hlt() __asm__ __volatile__("hlt": : :"memory") - /* Clock ratios multiplied by 10 */ static int clock_ratio[32]; static int eblcr_table[32]; @@ -168,11 +166,9 @@ static void do_powersaver(union msr_longhaul *longhaul, outb(0xFE,0x21); /* TMR0 only */ outb(0xFF,0x80); /* delay */ - local_irq_enable(); - - __hlt(); + safe_halt(); wrmsrl(MSR_VIA_LONGHAUL, longhaul->val); - __hlt(); + halt(); local_irq_disable(); @@ -251,9 +247,7 @@ static void longhaul_setstate(unsigned int clock_ratio_index) bcr2.bits.CLOCKMUL = clock_ratio_index; local_irq_disable(); wrmsrl (MSR_VIA_BCR2, bcr2.val); - local_irq_enable(); - - __hlt(); + safe_halt(); /* Disable software clock multiplier */ rdmsrl (MSR_VIA_BCR2, bcr2.val); diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c index ba4b01138c8..ff87cc22b32 100644 --- a/arch/i386/kernel/cpu/cyrix.c +++ b/arch/i386/kernel/cpu/cyrix.c @@ -132,11 +132,7 @@ static void __init set_cx86_memwb(void) setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04); /* set 'Not Write-through' */ cr0 = 0x20000000; - __asm__("movl %%cr0,%%eax\n\t" - "orl %0,%%eax\n\t" - "movl %%eax,%%cr0\n" - : : "r" (cr0) - :"ax"); + write_cr0(read_cr0() | cr0); /* CCR2 bit 2: lock NW bit and set WT1 */ setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14 ); } diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c index a2c33c1a46c..43601de0f63 100644 --- a/arch/i386/kernel/cpu/intel.c +++ b/arch/i386/kernel/cpu/intel.c @@ -82,16 +82,13 @@ static void __devinit Intel_errata_workarounds(struct cpuinfo_x86 *c) */ static int __devinit num_cpu_cores(struct cpuinfo_x86 *c) { - unsigned int eax; + unsigned int eax, ebx, ecx, edx; if (c->cpuid_level < 4) return 1; - __asm__("cpuid" - : "=a" (eax) - : "0" (4), "c" (0) - : "bx", "dx"); - + /* Intel has a non-standard dependency on %ecx for this CPUID level. */ + cpuid_count(4, 0, &eax, &ebx, &ecx, &edx); if (eax & 0x1f) return ((eax >> 26) + 1); else diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c index 6c55b50cf04..9e0d5f83cb9 100644 --- a/arch/i386/kernel/cpu/intel_cacheinfo.c +++ b/arch/i386/kernel/cpu/intel_cacheinfo.c @@ -305,6 +305,9 @@ static void __devinit cache_shared_cpu_map_setup(unsigned int cpu, int index) { struct _cpuid4_info *this_leaf; unsigned long num_threads_sharing; +#ifdef CONFIG_X86_HT + struct cpuinfo_x86 *c = cpu_data + cpu; +#endif this_leaf = CPUID4_INFO_IDX(cpu, index); num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing; @@ -314,10 +317,12 @@ static void __devinit cache_shared_cpu_map_setup(unsigned int cpu, int index) #ifdef CONFIG_X86_HT else if (num_threads_sharing == smp_num_siblings) this_leaf->shared_cpu_map = cpu_sibling_map[cpu]; -#endif + else if (num_threads_sharing == (c->x86_num_cores * smp_num_siblings)) + this_leaf->shared_cpu_map = cpu_core_map[cpu]; else - printk(KERN_INFO "Number of CPUs sharing cache didn't match " + printk(KERN_DEBUG "Number of CPUs sharing cache didn't match " "any known set of CPUs\n"); +#endif } #else static void __init cache_shared_cpu_map_setup(unsigned int cpu, int index) {} diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c index 764cac64e21..dd4ebd6af7e 100644 --- a/arch/i386/kernel/cpu/mtrr/main.c +++ b/arch/i386/kernel/cpu/mtrr/main.c @@ -561,7 +561,7 @@ struct mtrr_value { static struct mtrr_value * mtrr_state; -static int mtrr_save(struct sys_device * sysdev, u32 state) +static int mtrr_save(struct sys_device * sysdev, pm_message_t state) { int i; int size = num_var_ranges * sizeof(struct mtrr_value); diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c index e5fab12f792..913be77bb84 100644 --- a/arch/i386/kernel/crash.c +++ b/arch/i386/kernel/crash.c @@ -153,7 +153,7 @@ static int crash_nmi_callback(struct pt_regs *regs, int cpu) disable_local_APIC(); atomic_dec(&waiting_for_crash_ipi); /* Assume hlt works */ - __asm__("hlt"); + halt(); for(;;); return 1; diff --git a/arch/i386/kernel/doublefault.c b/arch/i386/kernel/doublefault.c index 789af3e9fb1..5edb1d379ad 100644 --- a/arch/i386/kernel/doublefault.c +++ b/arch/i386/kernel/doublefault.c @@ -20,7 +20,7 @@ static void doublefault_fn(void) struct Xgt_desc_struct gdt_desc = {0, 0}; unsigned long gdt, tss; - __asm__ __volatile__("sgdt %0": "=m" (gdt_desc): :"memory"); + store_gdt(&gdt_desc); gdt = gdt_desc.address; printk("double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size); diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c index 385883ea8c1..ecad519fd39 100644 --- a/arch/i386/kernel/efi.c +++ b/arch/i386/kernel/efi.c @@ -79,7 +79,7 @@ static void efi_call_phys_prelog(void) * directory. If I have PSE, I just need to duplicate one entry in * page directory. */ - __asm__ __volatile__("movl %%cr4, %0":"=r"(cr4)); + cr4 = read_cr4(); if (cr4 & X86_CR4_PSE) { efi_bak_pg_dir_pointer[0].pgd = @@ -104,8 +104,7 @@ static void efi_call_phys_prelog(void) local_flush_tlb(); cpu_gdt_descr[0].address = __pa(cpu_gdt_descr[0].address); - __asm__ __volatile__("lgdt %0":"=m" - (*(struct Xgt_desc_struct *) __pa(&cpu_gdt_descr[0]))); + load_gdt((struct Xgt_desc_struct *) __pa(&cpu_gdt_descr[0])); } static void efi_call_phys_epilog(void) @@ -114,8 +113,8 @@ static void efi_call_phys_epilog(void) cpu_gdt_descr[0].address = (unsigned long) __va(cpu_gdt_descr[0].address); - __asm__ __volatile__("lgdt %0":"=m"(cpu_gdt_descr)); - __asm__ __volatile__("movl %%cr4, %0":"=r"(cr4)); + load_gdt(&cpu_gdt_descr[0]); + cr4 = read_cr4(); if (cr4 & X86_CR4_PSE) { swapper_pg_dir[pgd_index(0)].pgd = @@ -233,22 +232,23 @@ void __init efi_map_memmap(void) { memmap.map = NULL; - memmap.map = (efi_memory_desc_t *) - bt_ioremap((unsigned long) memmap.phys_map, - (memmap.nr_map * sizeof(efi_memory_desc_t))); - + memmap.map = bt_ioremap((unsigned long) memmap.phys_map, + (memmap.nr_map * memmap.desc_size)); if (memmap.map == NULL) printk(KERN_ERR PFX "Could not remap the EFI memmap!\n"); + + memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); } #if EFI_DEBUG static void __init print_efi_memmap(void) { efi_memory_desc_t *md; + void *p; int i; - for (i = 0; i < memmap.nr_map; i++) { - md = &memmap.map[i]; + for (p = memmap.map, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) { + md = p; printk(KERN_INFO "mem%02u: type=%u, attr=0x%llx, " "range=[0x%016llx-0x%016llx) (%lluMB)\n", i, md->type, md->attribute, md->phys_addr, @@ -271,10 +271,10 @@ void efi_memmap_walk(efi_freemem_callback_t callback, void *arg) } prev, curr; efi_memory_desc_t *md; unsigned long start, end; - int i; + void *p; - for (i = 0; i < memmap.nr_map; i++) { - md = &memmap.map[i]; + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; if ((md->num_pages == 0) || (!is_available_memory(md))) continue; @@ -325,6 +325,7 @@ void __init efi_init(void) memmap.phys_map = EFI_MEMMAP; memmap.nr_map = EFI_MEMMAP_SIZE/EFI_MEMDESC_SIZE; memmap.desc_version = EFI_MEMDESC_VERSION; + memmap.desc_size = EFI_MEMDESC_SIZE; efi.systab = (efi_system_table_t *) boot_ioremap((unsigned long) efi_phys.systab, @@ -428,22 +429,30 @@ void __init efi_init(void) printk(KERN_ERR PFX "Could not map the runtime service table!\n"); /* Map the EFI memory map for use until paging_init() */ - - memmap.map = (efi_memory_desc_t *) - boot_ioremap((unsigned long) EFI_MEMMAP, EFI_MEMMAP_SIZE); - + memmap.map = boot_ioremap((unsigned long) EFI_MEMMAP, EFI_MEMMAP_SIZE); if (memmap.map == NULL) printk(KERN_ERR PFX "Could not map the EFI memory map!\n"); - if (EFI_MEMDESC_SIZE != sizeof(efi_memory_desc_t)) { - printk(KERN_WARNING PFX "Warning! Kernel-defined memdesc doesn't " - "match the one from EFI!\n"); - } + memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); + #if EFI_DEBUG print_efi_memmap(); #endif } +static inline void __init check_range_for_systab(efi_memory_desc_t *md) +{ + if (((unsigned long)md->phys_addr <= (unsigned long)efi_phys.systab) && + ((unsigned long)efi_phys.systab < md->phys_addr + + ((unsigned long)md->num_pages << EFI_PAGE_SHIFT))) { + unsigned long addr; + + addr = md->virt_addr - md->phys_addr + + (unsigned long)efi_phys.systab; + efi.systab = (efi_system_table_t *)addr; + } +} + /* * This function will switch the EFI runtime services to virtual mode. * Essentially, look through the EFI memmap and map every region that @@ -457,43 +466,32 @@ void __init efi_enter_virtual_mode(void) { efi_memory_desc_t *md; efi_status_t status; - int i; + void *p; efi.systab = NULL; - for (i = 0; i < memmap.nr_map; i++) { - md = &memmap.map[i]; + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; - if (md->attribute & EFI_MEMORY_RUNTIME) { - md->virt_addr = - (unsigned long)ioremap(md->phys_addr, - md->num_pages << EFI_PAGE_SHIFT); - if (!(unsigned long)md->virt_addr) { - printk(KERN_ERR PFX "ioremap of 0x%lX failed\n", - (unsigned long)md->phys_addr); - } + if (!(md->attribute & EFI_MEMORY_RUNTIME)) + continue; - if (((unsigned long)md->phys_addr <= - (unsigned long)efi_phys.systab) && - ((unsigned long)efi_phys.systab < - md->phys_addr + - ((unsigned long)md->num_pages << - EFI_PAGE_SHIFT))) { - unsigned long addr; - - addr = md->virt_addr - md->phys_addr + - (unsigned long)efi_phys.systab; - efi.systab = (efi_system_table_t *)addr; - } + md->virt_addr = (unsigned long)ioremap(md->phys_addr, + md->num_pages << EFI_PAGE_SHIFT); + if (!(unsigned long)md->virt_addr) { + printk(KERN_ERR PFX "ioremap of 0x%lX failed\n", + (unsigned long)md->phys_addr); } + /* update the virtual address of the EFI system table */ + check_range_for_systab(md); } if (!efi.systab) BUG(); status = phys_efi_set_virtual_address_map( - sizeof(efi_memory_desc_t) * memmap.nr_map, - sizeof(efi_memory_desc_t), + memmap.desc_size * memmap.nr_map, + memmap.desc_size, memmap.desc_version, memmap.phys_map); @@ -533,10 +531,10 @@ efi_initialize_iomem_resources(struct resource *code_resource, { struct resource *res; efi_memory_desc_t *md; - int i; + void *p; - for (i = 0; i < memmap.nr_map; i++) { - md = &memmap.map[i]; + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; if ((md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) > 0x100000000ULL) @@ -613,10 +611,10 @@ efi_initialize_iomem_resources(struct resource *code_resource, u32 efi_mem_type(unsigned long phys_addr) { efi_memory_desc_t *md; - int i; + void *p; - for (i = 0; i < memmap.nr_map; i++) { - md = &memmap.map[i]; + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; if ((md->phys_addr <= phys_addr) && (phys_addr < (md->phys_addr + (md-> num_pages << EFI_PAGE_SHIFT)) )) return md->type; @@ -627,10 +625,10 @@ u32 efi_mem_type(unsigned long phys_addr) u64 efi_mem_attributes(unsigned long phys_addr) { efi_memory_desc_t *md; - int i; + void *p; - for (i = 0; i < memmap.nr_map; i++) { - md = &memmap.map[i]; + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; if ((md->phys_addr <= phys_addr) && (phys_addr < (md->phys_addr + (md-> num_pages << EFI_PAGE_SHIFT)) )) return md->attribute; diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index a991d4e5edd..abb909793ef 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -203,7 +203,7 @@ sysenter_past_esp: GET_THREAD_INFO(%ebp) /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ - testw $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_flags(%ebp) + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) jnz syscall_trace_entry cmpl $(nr_syscalls), %eax jae syscall_badsys @@ -226,9 +226,9 @@ ENTRY(system_call) pushl %eax # save orig_eax SAVE_ALL GET_THREAD_INFO(%ebp) - # system call tracing in operation + # system call tracing in operation / emulation /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ - testw $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_flags(%ebp) + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) jnz syscall_trace_entry cmpl $(nr_syscalls), %eax jae syscall_badsys @@ -338,6 +338,9 @@ syscall_trace_entry: movl %esp, %eax xorl %edx,%edx call do_syscall_trace + cmpl $0, %eax + jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU, + # so must skip actual syscall movl ORIG_EAX(%esp), %eax cmpl $(nr_syscalls), %eax jnae syscall_call diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index 4477bb10709..0480ca9e9e5 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -77,6 +77,32 @@ ENTRY(startup_32) subl %edi,%ecx shrl $2,%ecx rep ; stosl +/* + * Copy bootup parameters out of the way. + * Note: %esi still has the pointer to the real-mode data. + * With the kexec as boot loader, parameter segment might be loaded beyond + * kernel image and might not even be addressable by early boot page tables. + * (kexec on panic case). Hence copy out the parameters before initializing + * page tables. + */ + movl $(boot_params - __PAGE_OFFSET),%edi + movl $(PARAM_SIZE/4),%ecx + cld + rep + movsl + movl boot_params - __PAGE_OFFSET + NEW_CL_POINTER,%esi + andl %esi,%esi + jnz 2f # New command line protocol + cmpw $(OLD_CL_MAGIC),OLD_CL_MAGIC_ADDR + jne 1f + movzwl OLD_CL_OFFSET,%esi + addl $(OLD_CL_BASE_ADDR),%esi +2: + movl $(saved_command_line - __PAGE_OFFSET),%edi + movl $(COMMAND_LINE_SIZE/4),%ecx + rep + movsl +1: /* * Initialize page tables. This creates a PDE and a set of page @@ -214,28 +240,6 @@ ENTRY(startup_32_smp) */ call setup_idt -/* - * Copy bootup parameters out of the way. - * Note: %esi still has the pointer to the real-mode data. - */ - movl $boot_params,%edi - movl $(PARAM_SIZE/4),%ecx - cld - rep - movsl - movl boot_params+NEW_CL_POINTER,%esi - andl %esi,%esi - jnz 2f # New command line protocol - cmpw $(OLD_CL_MAGIC),OLD_CL_MAGIC_ADDR - jne 1f - movzwl OLD_CL_OFFSET,%esi - addl $(OLD_CL_BASE_ADDR),%esi -2: - movl $saved_command_line,%edi - movl $(COMMAND_LINE_SIZE/4),%ecx - rep - movsl -1: checkCPUtype: movl $-1,X86_CPUID # -1 for no CPUID initially diff --git a/arch/i386/kernel/i8237.c b/arch/i386/kernel/i8237.c new file mode 100644 index 00000000000..c36d1c006c2 --- /dev/null +++ b/arch/i386/kernel/i8237.c @@ -0,0 +1,67 @@ +/* + * i8237.c: 8237A DMA controller suspend functions. + * + * Written by Pierre Ossman, 2005. + */ + +#include <linux/init.h> +#include <linux/sysdev.h> + +#include <asm/dma.h> + +/* + * This module just handles suspend/resume issues with the + * 8237A DMA controller (used for ISA and LPC). + * Allocation is handled in kernel/dma.c and normal usage is + * in asm/dma.h. + */ + +static int i8237A_resume(struct sys_device *dev) +{ + unsigned long flags; + int i; + + flags = claim_dma_lock(); + + dma_outb(DMA1_RESET_REG, 0); + dma_outb(DMA2_RESET_REG, 0); + + for (i = 0;i < 8;i++) { + set_dma_addr(i, 0x000000); + /* DMA count is a bit weird so this is not 0 */ + set_dma_count(i, 1); + } + + /* Enable cascade DMA or channel 0-3 won't work */ + enable_dma(4); + + release_dma_lock(flags); + + return 0; +} + +static int i8237A_suspend(struct sys_device *dev, pm_message_t state) +{ + return 0; +} + +static struct sysdev_class i8237_sysdev_class = { + set_kset_name("i8237"), + .suspend = i8237A_suspend, + .resume = i8237A_resume, +}; + +static struct sys_device device_i8237A = { + .id = 0, + .cls = &i8237_sysdev_class, +}; + +static int __init i8237A_init_sysfs(void) +{ + int error = sysdev_class_register(&i8237_sysdev_class); + if (!error) + error = sysdev_register(&device_i8237A); + return error; +} + +device_initcall(i8237A_init_sysfs); diff --git a/arch/i386/kernel/ioport.c b/arch/i386/kernel/ioport.c index 8b25160393c..f2b37654777 100644 --- a/arch/i386/kernel/ioport.c +++ b/arch/i386/kernel/ioport.c @@ -132,6 +132,7 @@ asmlinkage long sys_iopl(unsigned long unused) volatile struct pt_regs * regs = (struct pt_regs *) &unused; unsigned int level = regs->ebx; unsigned int old = (regs->eflags >> 12) & 3; + struct thread_struct *t = ¤t->thread; if (level > 3) return -EINVAL; @@ -140,8 +141,8 @@ asmlinkage long sys_iopl(unsigned long unused) if (!capable(CAP_SYS_RAWIO)) return -EPERM; } - regs->eflags = (regs->eflags &~ 0x3000UL) | (level << 12); - /* Make sure we return the long way (not sysenter) */ - set_thread_flag(TIF_IRET); + t->iopl = level << 12; + regs->eflags = (regs->eflags & ~X86_EFLAGS_IOPL) | t->iopl; + set_iopl_mask(t->iopl); return 0; } diff --git a/arch/i386/kernel/ldt.c b/arch/i386/kernel/ldt.c index bb50afbee92..fe1ffa55587 100644 --- a/arch/i386/kernel/ldt.c +++ b/arch/i386/kernel/ldt.c @@ -177,7 +177,7 @@ static int read_default_ldt(void __user * ptr, unsigned long bytecount) static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) { struct mm_struct * mm = current->mm; - __u32 entry_1, entry_2, *lp; + __u32 entry_1, entry_2; int error; struct user_desc ldt_info; @@ -205,8 +205,6 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) goto out_unlock; } - lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt); - /* Allow LDTs to be cleared by the user. */ if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { if (oldmode || LDT_empty(&ldt_info)) { @@ -223,8 +221,7 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) /* Install the new entry ... */ install: - *lp = entry_1; - *(lp+1) = entry_2; + write_ldt_entry(mm->context.ldt, ldt_info.entry_number, entry_1, entry_2); error = 0; out_unlock: diff --git a/arch/i386/kernel/machine_kexec.c b/arch/i386/kernel/machine_kexec.c index cb699a2aa1f..a912fed4848 100644 --- a/arch/i386/kernel/machine_kexec.c +++ b/arch/i386/kernel/machine_kexec.c @@ -17,13 +17,7 @@ #include <asm/apic.h> #include <asm/cpufeature.h> #include <asm/desc.h> - -static inline unsigned long read_cr3(void) -{ - unsigned long cr3; - asm volatile("movl %%cr3,%0": "=r"(cr3)); - return cr3; -} +#include <asm/system.h> #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) @@ -99,10 +93,7 @@ static void set_idt(void *newidt, __u16 limit) curidt.size = limit; curidt.address = (unsigned long)newidt; - __asm__ __volatile__ ( - "lidtl %0\n" - : : "m" (curidt) - ); + load_idt(&curidt); }; @@ -114,10 +105,7 @@ static void set_gdt(void *newgdt, __u16 limit) curgdt.size = limit; curgdt.address = (unsigned long)newgdt; - __asm__ __volatile__ ( - "lgdtl %0\n" - : : "m" (curgdt) - ); + load_gdt(&curgdt); }; static void load_segments(void) diff --git a/arch/i386/kernel/microcode.c b/arch/i386/kernel/microcode.c index a77c612aad0..165f13158c6 100644 --- a/arch/i386/kernel/microcode.c +++ b/arch/i386/kernel/microcode.c @@ -164,7 +164,8 @@ static void collect_cpu_info (void *unused) } wrmsr(MSR_IA32_UCODE_REV, 0, 0); - __asm__ __volatile__ ("cpuid" : : : "ax", "bx", "cx", "dx"); + /* see notes above for revision 1.07. Apparent chip bug */ + serialize_cpu(); /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], uci->rev); pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", @@ -377,7 +378,9 @@ static void do_update_one (void * unused) (unsigned long) uci->mc->bits >> 16 >> 16); wrmsr(MSR_IA32_UCODE_REV, 0, 0); - __asm__ __volatile__ ("cpuid" : : : "ax", "bx", "cx", "dx"); + /* see notes above for revision 1.07. Apparent chip bug */ + serialize_cpu(); + /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c index ce838abb27d..5d0b9a8fc43 100644 --- a/arch/i386/kernel/mpparse.c +++ b/arch/i386/kernel/mpparse.c @@ -65,6 +65,8 @@ int nr_ioapics; int pic_mode; unsigned long mp_lapic_addr; +unsigned int def_to_bigsmp = 0; + /* Processor that is doing the boot up */ unsigned int boot_cpu_physical_apicid = -1U; /* Internal processor count */ @@ -120,7 +122,7 @@ static int MP_valid_apicid(int apicid, int version) static void __init MP_processor_info (struct mpc_config_processor *m) { - int ver, apicid; + int ver, apicid, cpu, found_bsp = 0; physid_mask_t tmp; if (!(m->mpc_cpuflag & CPU_ENABLED)) @@ -179,6 +181,7 @@ static void __init MP_processor_info (struct mpc_config_processor *m) if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { Dprintk(" Bootup CPU\n"); boot_cpu_physical_apicid = m->mpc_apicid; + found_bsp = 1; } if (num_processors >= NR_CPUS) { @@ -202,6 +205,11 @@ static void __init MP_processor_info (struct mpc_config_processor *m) return; } + if (found_bsp) + cpu = 0; + else + cpu = num_processors - 1; + cpu_set(cpu, cpu_possible_map); tmp = apicid_to_cpu_present(apicid); physids_or(phys_cpu_present_map, phys_cpu_present_map, tmp); @@ -213,6 +221,13 @@ static void __init MP_processor_info (struct mpc_config_processor *m) ver = 0x10; } apic_version[m->mpc_apicid] = ver; + if ((num_processors > 8) && + APIC_XAPIC(ver) && + (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)) + def_to_bigsmp = 1; + else + def_to_bigsmp = 0; + bios_cpu_apicid[num_processors - 1] = m->mpc_apicid; } diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c index b2f03c39a6f..03100d6fc5d 100644 --- a/arch/i386/kernel/msr.c +++ b/arch/i386/kernel/msr.c @@ -46,23 +46,13 @@ static struct class *msr_class; -/* Note: "err" is handled in a funny way below. Otherwise one version - of gcc or another breaks. */ - static inline int wrmsr_eio(u32 reg, u32 eax, u32 edx) { int err; - asm volatile ("1: wrmsr\n" - "2:\n" - ".section .fixup,\"ax\"\n" - "3: movl %4,%0\n" - " jmp 2b\n" - ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" " .long 1b,3b\n" ".previous":"=&bDS" (err) - :"a"(eax), "d"(edx), "c"(reg), "i"(-EIO), "0"(0)); - + err = wrmsr_safe(reg, eax, edx); + if (err) + err = -EIO; return err; } @@ -70,18 +60,9 @@ static inline int rdmsr_eio(u32 reg, u32 *eax, u32 *edx) { int err; - asm volatile ("1: rdmsr\n" - "2:\n" - ".section .fixup,\"ax\"\n" - "3: movl %4,%0\n" - " jmp 2b\n" - ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 1b,3b\n" - ".previous":"=&bDS" (err), "=a"(*eax), "=d"(*edx) - :"c"(reg), "i"(-EIO), "0"(0)); - + err = rdmsr_safe(reg, eax, edx); + if (err) + err = -EIO; return err; } diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 8c242bb1ef4..8bbdbda07a2 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -501,8 +501,11 @@ void nmi_watchdog_tick (struct pt_regs * regs) */ alert_counter[cpu]++; if (alert_counter[cpu] == 5*nmi_hz) + /* + * die_nmi will return ONLY if NOTIFY_STOP happens.. + */ die_nmi(regs, "NMI Watchdog detected LOCKUP"); - } else { + last_irq_sums[cpu] = sum; alert_counter[cpu] = 0; } diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index e3f362e8af5..b45cbf93d43 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -164,7 +164,7 @@ static inline void play_dead(void) */ local_irq_disable(); while (1) - __asm__ __volatile__("hlt":::"memory"); + halt(); } #else static inline void play_dead(void) @@ -313,16 +313,12 @@ void show_regs(struct pt_regs * regs) printk(" DS: %04x ES: %04x\n", 0xffff & regs->xds,0xffff & regs->xes); - __asm__("movl %%cr0, %0": "=r" (cr0)); - __asm__("movl %%cr2, %0": "=r" (cr2)); - __asm__("movl %%cr3, %0": "=r" (cr3)); - /* This could fault if %cr4 does not exist */ - __asm__("1: movl %%cr4, %0 \n" - "2: \n" - ".section __ex_table,\"a\" \n" - ".long 1b,2b \n" - ".previous \n" - : "=r" (cr4): "0" (0)); + cr0 = read_cr0(); + cr2 = read_cr2(); + cr3 = read_cr3(); + if (current_cpu_data.x86 > 4) { + cr4 = read_cr4(); + } printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); show_trace(NULL, ®s->esp); } @@ -682,21 +678,26 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas __unlazy_fpu(prev_p); /* - * Reload esp0, LDT and the page table pointer: + * Reload esp0. */ load_esp0(tss, next); /* - * Load the per-thread Thread-Local Storage descriptor. + * Save away %fs and %gs. No need to save %es and %ds, as + * those are always kernel segments while inside the kernel. + * Doing this before setting the new TLS descriptors avoids + * the situation where we temporarily have non-reloadable + * segments in %fs and %gs. This could be an issue if the + * NMI handler ever used %fs or %gs (it does not today), or + * if the kernel is running inside of a hypervisor layer. */ - load_TLS(next, cpu); + savesegment(fs, prev->fs); + savesegment(gs, prev->gs); /* - * Save away %fs and %gs. No need to save %es and %ds, as - * those are always kernel segments while inside the kernel. + * Load the per-thread Thread-Local Storage descriptor. */ - asm volatile("mov %%fs,%0":"=m" (prev->fs)); - asm volatile("mov %%gs,%0":"=m" (prev->gs)); + load_TLS(next, cpu); /* * Restore %fs and %gs if needed. @@ -711,6 +712,12 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas loadsegment(gs, next->gs); /* + * Restore IOPL if needed. + */ + if (unlikely(prev->iopl != next->iopl)) + set_iopl_mask(next->iopl); + + /* * Now maybe reload the debug registers */ if (unlikely(next->debugreg[7])) { diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c index 0da59b42843..340980203b0 100644 --- a/arch/i386/kernel/ptrace.c +++ b/arch/i386/kernel/ptrace.c @@ -271,6 +271,8 @@ static void clear_singlestep(struct task_struct *child) void ptrace_disable(struct task_struct *child) { clear_singlestep(child); + clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); } /* @@ -509,15 +511,20 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data) } break; + case PTRACE_SYSEMU: /* continue and stop at next syscall, which will not be executed */ case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ case PTRACE_CONT: /* restart after signal. */ ret = -EIO; if (!valid_signal(data)) break; - if (request == PTRACE_SYSCALL) { + if (request == PTRACE_SYSEMU) { + set_tsk_thread_flag(child, TIF_SYSCALL_EMU); + clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + } else if (request == PTRACE_SYSCALL) { set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - } - else { + clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); + } else { + clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); } child->exit_code = data; @@ -542,10 +549,17 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data) wake_up_process(child); break; + case PTRACE_SYSEMU_SINGLESTEP: /* Same as SYSEMU, but singlestep if not syscall */ case PTRACE_SINGLESTEP: /* set the trap flag. */ ret = -EIO; if (!valid_signal(data)) break; + + if (request == PTRACE_SYSEMU_SINGLESTEP) + set_tsk_thread_flag(child, TIF_SYSCALL_EMU); + else + clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); + clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); set_singlestep(child); child->exit_code = data; @@ -678,26 +692,52 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) * - triggered by current->work.syscall_trace */ __attribute__((regparm(3))) -void do_syscall_trace(struct pt_regs *regs, int entryexit) +int do_syscall_trace(struct pt_regs *regs, int entryexit) { + int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU), ret = 0; + /* With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall + * interception. */ + int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP); + /* do the secure computing check first */ secure_computing(regs->orig_eax); - if (unlikely(current->audit_context) && entryexit) - audit_syscall_exit(current, AUDITSC_RESULT(regs->eax), regs->eax); + if (unlikely(current->audit_context)) { + if (entryexit) + audit_syscall_exit(current, AUDITSC_RESULT(regs->eax), regs->eax); + /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only + * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is + * not used, entry.S will call us only on syscall exit, not + * entry; so when TIF_SYSCALL_AUDIT is used we must avoid + * calling send_sigtrap() on syscall entry. + * + * Note that when PTRACE_SYSEMU_SINGLESTEP is used, + * is_singlestep is false, despite his name, so we will still do + * the correct thing. + */ + else if (is_singlestep) + goto out; + } if (!(current->ptrace & PT_PTRACED)) goto out; + /* If a process stops on the 1st tracepoint with SYSCALL_TRACE + * and then is resumed with SYSEMU_SINGLESTEP, it will come in + * here. We have to check this and return */ + if (is_sysemu && entryexit) + return 0; + /* Fake a debug trap */ - if (test_thread_flag(TIF_SINGLESTEP)) + if (is_singlestep) send_sigtrap(current, regs, 0); - if (!test_thread_flag(TIF_SYSCALL_TRACE)) + if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu) goto out; /* the 0x80 provides a way for the tracing parent to distinguish between a syscall stop and SIGTRAP delivery */ + /* Note that the debugger could change the result of test_thread_flag!*/ ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80 : 0)); /* @@ -709,9 +749,16 @@ void do_syscall_trace(struct pt_regs *regs, int entryexit) send_sig(current->exit_code, current, 1); current->exit_code = 0; } + ret = is_sysemu; out: if (unlikely(current->audit_context) && !entryexit) audit_syscall_entry(current, AUDIT_ARCH_I386, regs->orig_eax, regs->ebx, regs->ecx, regs->edx, regs->esi); + if (ret == 0) + return 0; + regs->orig_eax = -1; /* force skip of syscall restarting */ + if (unlikely(current->audit_context)) + audit_syscall_exit(current, AUDITSC_RESULT(regs->eax), regs->eax); + return 1; } diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c index c71fef31dc4..1cbb9c0f470 100644 --- a/arch/i386/kernel/reboot.c +++ b/arch/i386/kernel/reboot.c @@ -13,6 +13,7 @@ #include <linux/dmi.h> #include <asm/uaccess.h> #include <asm/apic.h> +#include <asm/desc.h> #include "mach_reboot.h" #include <linux/reboot_fixups.h> @@ -242,13 +243,13 @@ void machine_real_restart(unsigned char *code, int length) /* Set up the IDT for real mode. */ - __asm__ __volatile__ ("lidt %0" : : "m" (real_mode_idt)); + load_idt(&real_mode_idt); /* Set up a GDT from which we can load segment descriptors for real mode. The GDT is not used in real mode; it is just needed here to prepare the descriptors. */ - __asm__ __volatile__ ("lgdt %0" : : "m" (real_mode_gdt)); + load_gdt(&real_mode_gdt); /* Load the data segment registers, and thus the descriptors ready for real mode. The base address of each segment is 0x100, 16 times the @@ -316,7 +317,7 @@ void machine_emergency_restart(void) if (!reboot_thru_bios) { if (efi_enabled) { efi.reset_system(EFI_RESET_COLD, EFI_SUCCESS, 0, NULL); - __asm__ __volatile__("lidt %0": :"m" (no_idt)); + load_idt(&no_idt); __asm__ __volatile__("int3"); } /* rebooting needs to touch the page at absolute addr 0 */ @@ -325,7 +326,7 @@ void machine_emergency_restart(void) mach_reboot_fixups(); /* for board specific fixups */ mach_reboot(); /* That didn't work - force a triple fault.. */ - __asm__ __volatile__("lidt %0": :"m" (no_idt)); + load_idt(&no_idt); __asm__ __volatile__("int3"); } } diff --git a/arch/i386/kernel/semaphore.c b/arch/i386/kernel/semaphore.c index 469f496e55c..7455ab64394 100644 --- a/arch/i386/kernel/semaphore.c +++ b/arch/i386/kernel/semaphore.c @@ -13,171 +13,9 @@ * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org> */ #include <linux/config.h> -#include <linux/sched.h> -#include <linux/err.h> -#include <linux/init.h> #include <asm/semaphore.h> /* - * Semaphores are implemented using a two-way counter: - * The "count" variable is decremented for each process - * that tries to acquire the semaphore, while the "sleeping" - * variable is a count of such acquires. - * - * Notably, the inline "up()" and "down()" functions can - * efficiently test if they need to do any extra work (up - * needs to do something only if count was negative before - * the increment operation. - * - * "sleeping" and the contention routine ordering is protected - * by the spinlock in the semaphore's waitqueue head. - * - * Note that these functions are only called when there is - * contention on the lock, and as such all this is the - * "non-critical" part of the whole semaphore business. The - * critical part is the inline stuff in <asm/semaphore.h> - * where we want to avoid any extra jumps and calls. - */ - -/* - * Logic: - * - only on a boundary condition do we need to care. When we go - * from a negative count to a non-negative, we wake people up. - * - when we go from a non-negative count to a negative do we - * (a) synchronize with the "sleeper" count and (b) make sure - * that we're on the wakeup list before we synchronize so that - * we cannot lose wakeup events. - */ - -static fastcall void __attribute_used__ __up(struct semaphore *sem) -{ - wake_up(&sem->wait); -} - -static fastcall void __attribute_used__ __sched __down(struct semaphore * sem) -{ - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - unsigned long flags; - - tsk->state = TASK_UNINTERRUPTIBLE; - spin_lock_irqsave(&sem->wait.lock, flags); - add_wait_queue_exclusive_locked(&sem->wait, &wait); - - sem->sleepers++; - for (;;) { - int sleepers = sem->sleepers; - - /* - * Add "everybody else" into it. They aren't - * playing, because we own the spinlock in - * the wait_queue_head. - */ - if (!atomic_add_negative(sleepers - 1, &sem->count)) { - sem->sleepers = 0; - break; - } - sem->sleepers = 1; /* us - see -1 above */ - spin_unlock_irqrestore(&sem->wait.lock, flags); - - schedule(); - - spin_lock_irqsave(&sem->wait.lock, flags); - tsk->state = TASK_UNINTERRUPTIBLE; - } - remove_wait_queue_locked(&sem->wait, &wait); - wake_up_locked(&sem->wait); - spin_unlock_irqrestore(&sem->wait.lock, flags); - tsk->state = TASK_RUNNING; -} - -static fastcall int __attribute_used__ __sched __down_interruptible(struct semaphore * sem) -{ - int retval = 0; - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - unsigned long flags; - - tsk->state = TASK_INTERRUPTIBLE; - spin_lock_irqsave(&sem->wait.lock, flags); - add_wait_queue_exclusive_locked(&sem->wait, &wait); - - sem->sleepers++; - for (;;) { - int sleepers = sem->sleepers; - - /* - * With signals pending, this turns into - * the trylock failure case - we won't be - * sleeping, and we* can't get the lock as - * it has contention. Just correct the count - * and exit. - */ - if (signal_pending(current)) { - retval = -EINTR; - sem->sleepers = 0; - atomic_add(sleepers, &sem->count); - break; - } - - /* - * Add "everybody else" into it. They aren't - * playing, because we own the spinlock in - * wait_queue_head. The "-1" is because we're - * still hoping to get the semaphore. - */ - if (!atomic_add_negative(sleepers - 1, &sem->count)) { - sem->sleepers = 0; - break; - } - sem->sleepers = 1; /* us - see -1 above */ - spin_unlock_irqrestore(&sem->wait.lock, flags); - - schedule(); - - spin_lock_irqsave(&sem->wait.lock, flags); - tsk->state = TASK_INTERRUPTIBLE; - } - remove_wait_queue_locked(&sem->wait, &wait); - wake_up_locked(&sem->wait); - spin_unlock_irqrestore(&sem->wait.lock, flags); - - tsk->state = TASK_RUNNING; - return retval; -} - -/* - * Trylock failed - make sure we correct for - * having decremented the count. - * - * We could have done the trylock with a - * single "cmpxchg" without failure cases, - * but then it wouldn't work on a 386. - */ -static fastcall int __attribute_used__ __down_trylock(struct semaphore * sem) -{ - int sleepers; - unsigned long flags; - - spin_lock_irqsave(&sem->wait.lock, flags); - sleepers = sem->sleepers + 1; - sem->sleepers = 0; - - /* - * Add "everybody else" and us into it. They aren't - * playing, because we own the spinlock in the - * wait_queue_head. - */ - if (!atomic_add_negative(sleepers, &sem->count)) { - wake_up_locked(&sem->wait); - } - - spin_unlock_irqrestore(&sem->wait.lock, flags); - return 1; -} - - -/* * The semaphore operations have a special calling sequence that * allow us to do a simpler in-line version of them. These routines * need to convert that sequence back into the C sequence when diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index af4de58cab5..294bcca985a 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -370,12 +370,16 @@ static void __init limit_regions(unsigned long long size) int i; if (efi_enabled) { - for (i = 0; i < memmap.nr_map; i++) { - current_addr = memmap.map[i].phys_addr + - (memmap.map[i].num_pages << 12); - if (memmap.map[i].type == EFI_CONVENTIONAL_MEMORY) { + efi_memory_desc_t *md; + void *p; + + for (p = memmap.map, i = 0; p < memmap.map_end; + p += memmap.desc_size, i++) { + md = p; + current_addr = md->phys_addr + (md->num_pages << 12); + if (md->type == EFI_CONVENTIONAL_MEMORY) { if (current_addr >= size) { - memmap.map[i].num_pages -= + md->num_pages -= (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT); memmap.nr_map = i + 1; return; @@ -1581,8 +1585,14 @@ void __init setup_arch(char **cmdline_p) */ acpi_boot_table_init(); acpi_boot_init(); -#endif +#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC) + if (def_to_bigsmp) + printk(KERN_WARNING "More than 8 CPUs detected and " + "CONFIG_X86_PC cannot handle it.\nUse " + "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n"); +#endif +#endif #ifdef CONFIG_X86_LOCAL_APIC if (smp_found_config) get_smp_config(); diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c index 140e340569c..61eb0c8a6e4 100644 --- a/arch/i386/kernel/signal.c +++ b/arch/i386/kernel/signal.c @@ -278,9 +278,9 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, int tmp, err = 0; tmp = 0; - __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp)); + savesegment(gs, tmp); err |= __put_user(tmp, (unsigned int __user *)&sc->gs); - __asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp)); + savesegment(fs, tmp); err |= __put_user(tmp, (unsigned int __user *)&sc->fs); err |= __put_user(regs->xes, (unsigned int __user *)&sc->es); @@ -604,7 +604,9 @@ int fastcall do_signal(struct pt_regs *regs, sigset_t *oldset) * We want the common case to go fast, which * is why we may in certain cases get here from * kernel mode. Just return without doing anything - * if so. + * if so. vm86 regs switched out by assembly code + * before reaching here, so testing against kernel + * CS suffices. */ if (!user_mode(regs)) return 1; diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index cec4bde6716..48b55db3680 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -576,7 +576,7 @@ static void stop_this_cpu (void * dummy) local_irq_disable(); disable_local_APIC(); if (cpu_data[smp_processor_id()].hlt_works_ok) - for(;;) __asm__("hlt"); + for(;;) halt(); for (;;); } diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 8ac8e9fd561..5e4893d2b9f 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -88,6 +88,8 @@ EXPORT_SYMBOL(cpu_online_map); cpumask_t cpu_callin_map; cpumask_t cpu_callout_map; EXPORT_SYMBOL(cpu_callout_map); +cpumask_t cpu_possible_map; +EXPORT_SYMBOL(cpu_possible_map); static cpumask_t smp_commenced_mask; /* TSC's upper 32 bits can't be written in eariler CPU (before prescott), there @@ -1017,8 +1019,8 @@ int __devinit smp_prepare_cpu(int cpu) tsc_sync_disabled = 1; /* init low mem mapping */ - memcpy(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, - sizeof(swapper_pg_dir[0]) * KERNEL_PGD_PTRS); + clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, + KERNEL_PGD_PTRS); flush_tlb_all(); schedule_work(&task); wait_for_completion(&done); @@ -1265,6 +1267,7 @@ void __devinit smp_prepare_boot_cpu(void) cpu_set(smp_processor_id(), cpu_online_map); cpu_set(smp_processor_id(), cpu_callout_map); cpu_set(smp_processor_id(), cpu_present_map); + cpu_set(smp_processor_id(), cpu_possible_map); per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; } diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index 0ee9dee8af0..6f794a78ee1 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -383,6 +383,7 @@ void notify_arch_cmos_timer(void) static long clock_cmos_diff, sleep_start; +static struct timer_opts *last_timer; static int timer_suspend(struct sys_device *dev, pm_message_t state) { /* @@ -391,6 +392,10 @@ static int timer_suspend(struct sys_device *dev, pm_message_t state) clock_cmos_diff = -get_cmos_time(); clock_cmos_diff += get_seconds(); sleep_start = get_cmos_time(); + last_timer = cur_timer; + cur_timer = &timer_none; + if (last_timer->suspend) + last_timer->suspend(state); return 0; } @@ -404,6 +409,7 @@ static int timer_resume(struct sys_device *dev) if (is_hpet_enabled()) hpet_reenable(); #endif + setup_pit_timer(); sec = get_cmos_time() + clock_cmos_diff; sleep_length = (get_cmos_time() - sleep_start) * HZ; write_seqlock_irqsave(&xtime_lock, flags); @@ -412,6 +418,10 @@ static int timer_resume(struct sys_device *dev) write_sequnlock_irqrestore(&xtime_lock, flags); jiffies += sleep_length; wall_jiffies += sleep_length; + if (last_timer->resume) + last_timer->resume(); + cur_timer = last_timer; + last_timer = NULL; return 0; } diff --git a/arch/i386/kernel/timers/timer_hpet.c b/arch/i386/kernel/timers/timer_hpet.c index ef8dac5dd33..001de97c9e4 100644 --- a/arch/i386/kernel/timers/timer_hpet.c +++ b/arch/i386/kernel/timers/timer_hpet.c @@ -136,6 +136,8 @@ static void delay_hpet(unsigned long loops) } while ((hpet_end - hpet_start) < (loops)); } +static struct timer_opts timer_hpet; + static int __init init_hpet(char* override) { unsigned long result, remain; @@ -163,6 +165,8 @@ static int __init init_hpet(char* override) } set_cyc2ns_scale(cpu_khz/1000); } + /* set this only when cpu_has_tsc */ + timer_hpet.read_timer = read_timer_tsc; } /* @@ -177,6 +181,19 @@ static int __init init_hpet(char* override) return 0; } +static int hpet_resume(void) +{ + write_seqlock(&monotonic_lock); + /* Assume this is the last mark offset time */ + rdtsc(last_tsc_low, last_tsc_high); + + if (hpet_use_timer) + hpet_last = hpet_readl(HPET_T0_CMP) - hpet_tick; + else + hpet_last = hpet_readl(HPET_COUNTER); + write_sequnlock(&monotonic_lock); + return 0; +} /************************************************************/ /* tsc timer_opts struct */ @@ -186,7 +203,7 @@ static struct timer_opts timer_hpet __read_mostly = { .get_offset = get_offset_hpet, .monotonic_clock = monotonic_clock_hpet, .delay = delay_hpet, - .read_timer = read_timer_tsc, + .resume = hpet_resume, }; struct init_timer_opts __initdata timer_hpet_init = { diff --git a/arch/i386/kernel/timers/timer_pit.c b/arch/i386/kernel/timers/timer_pit.c index 06de036a820..eddb6403823 100644 --- a/arch/i386/kernel/timers/timer_pit.c +++ b/arch/i386/kernel/timers/timer_pit.c @@ -175,30 +175,3 @@ void setup_pit_timer(void) outb(LATCH >> 8 , PIT_CH0); /* MSB */ spin_unlock_irqrestore(&i8253_lock, flags); } - -static int timer_resume(struct sys_device *dev) -{ - setup_pit_timer(); - return 0; -} - -static struct sysdev_class timer_sysclass = { - set_kset_name("timer_pit"), - .resume = timer_resume, -}; - -static struct sys_device device_timer = { - .id = 0, - .cls = &timer_sysclass, -}; - -static int __init init_timer_sysfs(void) -{ - int error = sysdev_class_register(&timer_sysclass); - if (!error) - error = sysdev_register(&device_timer); - return error; -} - -device_initcall(init_timer_sysfs); - diff --git a/arch/i386/kernel/timers/timer_pm.c b/arch/i386/kernel/timers/timer_pm.c index 4ef20e66349..264edaaac31 100644 --- a/arch/i386/kernel/timers/timer_pm.c +++ b/arch/i386/kernel/timers/timer_pm.c @@ -186,6 +186,14 @@ static void mark_offset_pmtmr(void) } } +static int pmtmr_resume(void) +{ + write_seqlock(&monotonic_lock); + /* Assume this is the last mark offset time */ + offset_tick = read_pmtmr(); + write_sequnlock(&monotonic_lock); + return 0; +} static unsigned long long monotonic_clock_pmtmr(void) { @@ -247,6 +255,7 @@ static struct timer_opts timer_pmtmr = { .monotonic_clock = monotonic_clock_pmtmr, .delay = delay_pmtmr, .read_timer = read_timer_tsc, + .resume = pmtmr_resume, }; struct init_timer_opts __initdata timer_pmtmr_init = { diff --git a/arch/i386/kernel/timers/timer_tsc.c b/arch/i386/kernel/timers/timer_tsc.c index 8f4e4d5bc56..6dd470cc9f7 100644 --- a/arch/i386/kernel/timers/timer_tsc.c +++ b/arch/i386/kernel/timers/timer_tsc.c @@ -543,6 +543,19 @@ static int __init init_tsc(char* override) return -ENODEV; } +static int tsc_resume(void) +{ + write_seqlock(&monotonic_lock); + /* Assume this is the last mark offset time */ + rdtsc(last_tsc_low, last_tsc_high); +#ifdef CONFIG_HPET_TIMER + if (is_hpet_enabled() && hpet_use_timer) + hpet_last = hpet_readl(HPET_COUNTER); +#endif + write_sequnlock(&monotonic_lock); + return 0; +} + #ifndef CONFIG_X86_TSC /* disable flag for tsc. Takes effect by clearing the TSC cpu flag * in cpu/common.c */ @@ -573,6 +586,7 @@ static struct timer_opts timer_tsc = { .monotonic_clock = monotonic_clock_tsc, .delay = delay_tsc, .read_timer = read_timer_tsc, + .resume = tsc_resume, }; struct init_timer_opts __initdata timer_tsc_init = { diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index cd2d5d5514f..54629bb5893 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -210,7 +210,7 @@ void show_registers(struct pt_regs *regs) unsigned short ss; esp = (unsigned long) (®s->esp); - ss = __KERNEL_DS; + savesegment(ss, ss); if (user_mode(regs)) { in_kernel = 0; esp = regs->esp; @@ -267,9 +267,6 @@ static void handle_BUG(struct pt_regs *regs) char c; unsigned long eip; - if (user_mode(regs)) - goto no_bug; /* Not in kernel */ - eip = regs->eip; if (eip < PAGE_OFFSET) @@ -568,6 +565,10 @@ static DEFINE_SPINLOCK(nmi_print_lock); void die_nmi (struct pt_regs *regs, const char *msg) { + if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 0, SIGINT) == + NOTIFY_STOP) + return; + spin_lock(&nmi_print_lock); /* * We are in trouble anyway, lets at least try @@ -1008,7 +1009,7 @@ void __init trap_init_f00f_bug(void) * it uses the read-only mapped virtual address. */ idt_descr.address = fix_to_virt(FIX_F00F_IDT); - __asm__ __volatile__("lidt %0" : : "m" (idt_descr)); + load_idt(&idt_descr); } #endif diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c index ec0f68ce688..16b48500962 100644 --- a/arch/i386/kernel/vm86.c +++ b/arch/i386/kernel/vm86.c @@ -294,8 +294,8 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk */ info->regs32->eax = 0; tsk->thread.saved_esp0 = tsk->thread.esp0; - asm volatile("mov %%fs,%0":"=m" (tsk->thread.saved_fs)); - asm volatile("mov %%gs,%0":"=m" (tsk->thread.saved_gs)); + savesegment(fs, tsk->thread.saved_fs); + savesegment(gs, tsk->thread.saved_gs); tss = &per_cpu(init_tss, get_cpu()); tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; @@ -542,7 +542,7 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code) unsigned char opcode; unsigned char __user *csp; unsigned char __user *ssp; - unsigned short ip, sp; + unsigned short ip, sp, orig_flags; int data32, pref_done; #define CHECK_IF_IN_TRAP \ @@ -551,8 +551,12 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code) #define VM86_FAULT_RETURN do { \ if (VMPI.force_return_for_pic && (VEFLAGS & (IF_MASK | VIF_MASK))) \ return_to_32bit(regs, VM86_PICRETURN); \ + if (orig_flags & TF_MASK) \ + handle_vm86_trap(regs, 0, 1); \ return; } while (0) + orig_flags = *(unsigned short *)®s->eflags; + csp = (unsigned char __user *) (regs->cs << 4); ssp = (unsigned char __user *) (regs->ss << 4); sp = SP(regs); diff --git a/arch/i386/kernel/vsyscall-sigreturn.S b/arch/i386/kernel/vsyscall-sigreturn.S index c8fcf75b9be..68afa50dd7c 100644 --- a/arch/i386/kernel/vsyscall-sigreturn.S +++ b/arch/i386/kernel/vsyscall-sigreturn.S @@ -15,7 +15,7 @@ */ .text - .org __kernel_vsyscall+32 + .org __kernel_vsyscall+32,0x90 .globl __kernel_sigreturn .type __kernel_sigreturn,@function __kernel_sigreturn: @@ -35,6 +35,7 @@ __kernel_rt_sigreturn: int $0x80 .LEND_rt_sigreturn: .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn + .balign 32 .previous .section .eh_frame,"a",@progbits diff --git a/arch/i386/mach-es7000/es7000.h b/arch/i386/mach-es7000/es7000.h index 70691f0c4ce..898ed905e11 100644 --- a/arch/i386/mach-es7000/es7000.h +++ b/arch/i386/mach-es7000/es7000.h @@ -104,7 +104,8 @@ struct mip_reg { #define MIP_SW_APIC 0x1020b #define MIP_FUNC(VALUE) (VALUE & 0xff) -extern int parse_unisys_oem (char *oemptr, int oem_entries); -extern int find_unisys_acpi_oem_table(unsigned long *oem_addr, int *length); +extern int parse_unisys_oem (char *oemptr); +extern int find_unisys_acpi_oem_table(unsigned long *oem_addr); +extern void setup_unisys (); extern int es7000_start_cpu(int cpu, unsigned long eip); extern void es7000_sw_apic(void); diff --git a/arch/i386/mach-es7000/es7000plat.c b/arch/i386/mach-es7000/es7000plat.c index d5936d50047..2000bdca2fc 100644 --- a/arch/i386/mach-es7000/es7000plat.c +++ b/arch/i386/mach-es7000/es7000plat.c @@ -75,12 +75,29 @@ es7000_rename_gsi(int ioapic, int gsi) #endif // (CONFIG_X86_IO_APIC) && (CONFIG_ACPI_INTERPRETER || CONFIG_ACPI_BOOT) +void __init +setup_unisys () +{ + /* + * Determine the generation of the ES7000 currently running. + * + * es7000_plat = 1 if the machine is a 5xx ES7000 box + * es7000_plat = 2 if the machine is a x86_64 ES7000 box + * + */ + if (!(boot_cpu_data.x86 <= 15 && boot_cpu_data.x86_model <= 2)) + es7000_plat = 2; + else + es7000_plat = 1; + ioapic_renumber_irq = es7000_rename_gsi; +} + /* * Parse the OEM Table */ int __init -parse_unisys_oem (char *oemptr, int oem_entries) +parse_unisys_oem (char *oemptr) { int i; int success = 0; @@ -95,7 +112,7 @@ parse_unisys_oem (char *oemptr, int oem_entries) tp += 8; - for (i=0; i <= oem_entries; i++) { + for (i=0; i <= 6; i++) { type = *tp++; size = *tp++; tp -= 2; @@ -130,34 +147,18 @@ parse_unisys_oem (char *oemptr, int oem_entries) default: break; } - if (i == 6) break; tp += size; } if (success < 2) { es7000_plat = 0; - } else { - printk("\nEnabling ES7000 specific features...\n"); - /* - * Determine the generation of the ES7000 currently running. - * - * es7000_plat = 0 if the machine is NOT a Unisys ES7000 box - * es7000_plat = 1 if the machine is a 5xx ES7000 box - * es7000_plat = 2 if the machine is a x86_64 ES7000 box - * - */ - if (!(boot_cpu_data.x86 <= 15 && boot_cpu_data.x86_model <= 2)) - es7000_plat = 2; - else - es7000_plat = 1; - - ioapic_renumber_irq = es7000_rename_gsi; - } + } else + setup_unisys(); return es7000_plat; } int __init -find_unisys_acpi_oem_table(unsigned long *oem_addr, int *length) +find_unisys_acpi_oem_table(unsigned long *oem_addr) { struct acpi_table_rsdp *rsdp = NULL; unsigned long rsdp_phys = 0; @@ -201,13 +202,11 @@ find_unisys_acpi_oem_table(unsigned long *oem_addr, int *length) acpi_table_print(header, sdt.entry[i].pa); t = (struct oem_table *) __acpi_map_table(sdt.entry[i].pa, header->length); addr = (void *) __acpi_map_table(t->OEMTableAddr, t->OEMTableSize); - *length = header->length; *oem_addr = (unsigned long) addr; return 0; } } } - Dprintk("ES7000: did not find Unisys ACPI OEM table!\n"); return -1; } diff --git a/arch/i386/mach-generic/bigsmp.c b/arch/i386/mach-generic/bigsmp.c index 25883b44f62..037b2af1a1f 100644 --- a/arch/i386/mach-generic/bigsmp.c +++ b/arch/i386/mach-generic/bigsmp.c @@ -47,7 +47,10 @@ static struct dmi_system_id __initdata bigsmp_dmi_table[] = { static __init int probe_bigsmp(void) { - dmi_check_system(bigsmp_dmi_table); + if (def_to_bigsmp) + dmi_bigsmp = 1; + else + dmi_check_system(bigsmp_dmi_table); return dmi_bigsmp; } diff --git a/arch/i386/mach-generic/probe.c b/arch/i386/mach-generic/probe.c index 5497c65a879..cea5b3ce4b5 100644 --- a/arch/i386/mach-generic/probe.c +++ b/arch/i386/mach-generic/probe.c @@ -30,6 +30,25 @@ struct genapic *apic_probe[] __initdata = { NULL, }; +static int cmdline_apic; + +void __init generic_bigsmp_probe(void) +{ + /* + * This routine is used to switch to bigsmp mode when + * - There is no apic= option specified by the user + * - generic_apic_probe() has choosen apic_default as the sub_arch + * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support + */ + + if (!cmdline_apic && genapic == &apic_default) + if (apic_bigsmp.probe()) { + genapic = &apic_bigsmp; + printk(KERN_INFO "Overriding APIC driver with %s\n", + genapic->name); + } +} + void __init generic_apic_probe(char *command_line) { char *s; @@ -52,6 +71,7 @@ void __init generic_apic_probe(char *command_line) if (!changed) printk(KERN_ERR "Unknown genapic `%s' specified.\n", s); *p = old; + cmdline_apic = changed; } for (i = 0; !changed && apic_probe[i]; i++) { if (apic_probe[i]->probe()) { diff --git a/arch/i386/mach-voyager/voyager_basic.c b/arch/i386/mach-voyager/voyager_basic.c index c6384061328..cc69875d979 100644 --- a/arch/i386/mach-voyager/voyager_basic.c +++ b/arch/i386/mach-voyager/voyager_basic.c @@ -234,10 +234,9 @@ voyager_power_off(void) #endif } /* and wait for it to happen */ - for(;;) { - __asm("cli"); - __asm("hlt"); - } + local_irq_disable(); + for(;;) + halt(); } /* copied from process.c */ @@ -278,10 +277,9 @@ machine_restart(char *cmd) outb(basebd | 0x08, VOYAGER_MC_SETUP); outb(0x02, catbase + 0x21); } - for(;;) { - asm("cli"); - asm("hlt"); - } + local_irq_disable(); + for(;;) + halt(); } void diff --git a/arch/i386/mach-voyager/voyager_smp.c b/arch/i386/mach-voyager/voyager_smp.c index 0e1f4208b07..46b0cf4a31e 100644 --- a/arch/i386/mach-voyager/voyager_smp.c +++ b/arch/i386/mach-voyager/voyager_smp.c @@ -242,6 +242,8 @@ static cpumask_t smp_commenced_mask = CPU_MASK_NONE; cpumask_t cpu_callin_map = CPU_MASK_NONE; cpumask_t cpu_callout_map = CPU_MASK_NONE; EXPORT_SYMBOL(cpu_callout_map); +cpumask_t cpu_possible_map = CPU_MASK_ALL; +EXPORT_SYMBOL(cpu_possible_map); /* The per processor IRQ masks (these are usually kept in sync) */ static __u16 vic_irq_mask[NR_CPUS] __cacheline_aligned; @@ -1015,7 +1017,7 @@ smp_stop_cpu_function(void *dummy) cpu_clear(smp_processor_id(), cpu_online_map); local_irq_disable(); for(;;) - __asm__("hlt"); + halt(); } static DEFINE_SPINLOCK(call_lock); @@ -1910,6 +1912,7 @@ void __devinit smp_prepare_boot_cpu(void) { cpu_set(smp_processor_id(), cpu_online_map); cpu_set(smp_processor_id(), cpu_callout_map); + cpu_set(smp_processor_id(), cpu_possible_map); } int __devinit diff --git a/arch/i386/math-emu/get_address.c b/arch/i386/math-emu/get_address.c index 91175738e94..9819b705efa 100644 --- a/arch/i386/math-emu/get_address.c +++ b/arch/i386/math-emu/get_address.c @@ -155,7 +155,6 @@ static long pm_address(u_char FPU_modrm, u_char segment, { struct desc_struct descriptor; unsigned long base_address, limit, address, seg_top; - unsigned short selector; segment--; @@ -173,17 +172,11 @@ static long pm_address(u_char FPU_modrm, u_char segment, /* fs and gs aren't used by the kernel, so they still have their user-space values. */ case PREFIX_FS_-1: - /* The cast is needed here to get gcc 2.8.0 to use a 16 bit register - in the assembler statement. */ - - __asm__("mov %%fs,%0":"=r" (selector)); - addr->selector = selector; + /* N.B. - movl %seg, mem is a 2 byte write regardless of prefix */ + savesegment(fs, addr->selector); break; case PREFIX_GS_-1: - /* The cast is needed here to get gcc 2.8.0 to use a 16 bit register - in the assembler statement. */ - __asm__("mov %%gs,%0":"=r" (selector)); - addr->selector = selector; + savesegment(gs, addr->selector); break; default: addr->selector = PM_REG_(segment); diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index 8e90339d6ea..411b8500ad1 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c @@ -199,6 +199,18 @@ static inline int is_prefetch(struct pt_regs *regs, unsigned long addr, return 0; } +static noinline void force_sig_info_fault(int si_signo, int si_code, + unsigned long address, struct task_struct *tsk) +{ + siginfo_t info; + + info.si_signo = si_signo; + info.si_errno = 0; + info.si_code = si_code; + info.si_addr = (void __user *)address; + force_sig_info(si_signo, &info, tsk); +} + fastcall void do_invalid_op(struct pt_regs *, unsigned long); /* @@ -218,11 +230,10 @@ fastcall void do_page_fault(struct pt_regs *regs, unsigned long error_code) struct vm_area_struct * vma; unsigned long address; unsigned long page; - int write; - siginfo_t info; + int write, si_code; /* get the address */ - __asm__("movl %%cr2,%0":"=r" (address)); + address = read_cr2(); if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, SIGSEGV) == NOTIFY_STOP) @@ -233,7 +244,7 @@ fastcall void do_page_fault(struct pt_regs *regs, unsigned long error_code) tsk = current; - info.si_code = SEGV_MAPERR; + si_code = SEGV_MAPERR; /* * We fault-in kernel-space virtual memory on-demand. The @@ -313,7 +324,7 @@ fastcall void do_page_fault(struct pt_regs *regs, unsigned long error_code) * we can handle it.. */ good_area: - info.si_code = SEGV_ACCERR; + si_code = SEGV_ACCERR; write = 0; switch (error_code & 3) { default: /* 3: write, present */ @@ -387,11 +398,7 @@ bad_area_nosemaphore: /* Kernel addresses are always protection faults */ tsk->thread.error_code = error_code | (address >= TASK_SIZE); tsk->thread.trap_no = 14; - info.si_signo = SIGSEGV; - info.si_errno = 0; - /* info.si_code has been set above */ - info.si_addr = (void __user *)address; - force_sig_info(SIGSEGV, &info, tsk); + force_sig_info_fault(SIGSEGV, si_code, address, tsk); return; } @@ -446,7 +453,7 @@ no_context: printk(" at virtual address %08lx\n",address); printk(KERN_ALERT " printing eip:\n"); printk("%08lx\n", regs->eip); - asm("movl %%cr3,%0":"=r" (page)); + page = read_cr3(); page = ((unsigned long *) __va(page))[address >> 22]; printk(KERN_ALERT "*pde = %08lx\n", page); /* @@ -500,11 +507,7 @@ do_sigbus: tsk->thread.cr2 = address; tsk->thread.error_code = error_code; tsk->thread.trap_no = 14; - info.si_signo = SIGBUS; - info.si_errno = 0; - info.si_code = BUS_ADRERR; - info.si_addr = (void __user *)address; - force_sig_info(SIGBUS, &info, tsk); + force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); return; vmalloc_fault: @@ -523,7 +526,7 @@ vmalloc_fault: pmd_t *pmd, *pmd_k; pte_t *pte_k; - asm("movl %%cr3,%0":"=r" (pgd_paddr)); + pgd_paddr = read_cr3(); pgd = index + (pgd_t *)__va(pgd_paddr); pgd_k = init_mm.pgd + index; diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c index 3b099f32b94..d524127c9af 100644 --- a/arch/i386/mm/hugetlbpage.c +++ b/arch/i386/mm/hugetlbpage.c @@ -22,12 +22,15 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; pud_t *pud; - pmd_t *pmd = NULL; + pte_t *pte = NULL; pgd = pgd_offset(mm, addr); pud = pud_alloc(mm, pgd, addr); - pmd = pmd_alloc(mm, pud, addr); - return (pte_t *) pmd; + if (pud) + pte = (pte_t *) pmd_alloc(mm, pud, addr); + BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); + + return pte; } pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) @@ -37,8 +40,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) pmd_t *pmd = NULL; pgd = pgd_offset(mm, addr); - pud = pud_offset(pgd, addr); - pmd = pmd_offset(pud, addr); + if (pgd_present(*pgd)) { + pud = pud_offset(pgd, addr); + if (pud_present(*pud)) + pmd = pmd_offset(pud, addr); + } return (pte_t *) pmd; } @@ -118,17 +124,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, } #endif -void hugetlb_clean_stale_pgtable(pte_t *pte) -{ - pmd_t *pmd = (pmd_t *) pte; - struct page *page; - - page = pmd_page(*pmd); - pmd_clear(pmd); - dec_page_state(nr_page_table_pages); - page_cache_release(page); -} - /* x86_64 also uses this file */ #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 12216b52e28..9edfc058b89 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -198,9 +198,10 @@ int page_is_ram(unsigned long pagenr) if (efi_enabled) { efi_memory_desc_t *md; + void *p; - for (i = 0; i < memmap.nr_map; i++) { - md = &memmap.map[i]; + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; if (!is_available_memory(md)) continue; addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT; @@ -348,7 +349,7 @@ static void __init pagetable_init (void) * All user-space mappings are explicitly cleared after * SMP startup. */ - pgd_base[0] = pgd_base[USER_PTRS_PER_PGD]; + set_pgd(&pgd_base[0], pgd_base[USER_PTRS_PER_PGD]); #endif } diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c index cb3da6baa70..f600fc244f0 100644 --- a/arch/i386/mm/pageattr.c +++ b/arch/i386/mm/pageattr.c @@ -12,6 +12,7 @@ #include <asm/uaccess.h> #include <asm/processor.h> #include <asm/tlbflush.h> +#include <asm/pgalloc.h> static DEFINE_SPINLOCK(cpa_lock); static struct list_head df_list = LIST_HEAD_INIT(df_list); @@ -52,8 +53,8 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot) addr = address & LARGE_PAGE_MASK; pbase = (pte_t *)page_address(base); for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { - pbase[i] = pfn_pte(addr >> PAGE_SHIFT, - addr == address ? prot : PAGE_KERNEL); + set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, + addr == address ? prot : PAGE_KERNEL)); } return base; } @@ -62,7 +63,7 @@ static void flush_kernel_map(void *dummy) { /* Could use CLFLUSH here if the CPU supports it (Hammer,P4) */ if (boot_cpu_data.x86_model >= 4) - asm volatile("wbinvd":::"memory"); + wbinvd(); /* Flush all to work around Errata in early athlons regarding * large page flushing. */ diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index bd2f7afc7a2..dcdce2c6c53 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -207,19 +207,19 @@ void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) { unsigned long flags; - if (PTRS_PER_PMD == 1) + if (PTRS_PER_PMD == 1) { + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); spin_lock_irqsave(&pgd_lock, flags); + } - memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD, + clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, - (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); - + KERNEL_PGD_PTRS); if (PTRS_PER_PMD > 1) return; pgd_list_add(pgd); spin_unlock_irqrestore(&pgd_lock, flags); - memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); } /* never called when PTRS_PER_PMD > 1 */ diff --git a/arch/i386/power/cpu.c b/arch/i386/power/cpu.c index c547c1af6fa..7b0b9ad848e 100644 --- a/arch/i386/power/cpu.c +++ b/arch/i386/power/cpu.c @@ -42,25 +42,25 @@ void __save_processor_state(struct saved_context *ctxt) /* * descriptor tables */ - asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit)); - asm volatile ("sidt %0" : "=m" (ctxt->idt_limit)); - asm volatile ("str %0" : "=m" (ctxt->tr)); + store_gdt(&ctxt->gdt_limit); + store_idt(&ctxt->idt_limit); + store_tr(ctxt->tr); /* * segment registers */ - asm volatile ("movw %%es, %0" : "=m" (ctxt->es)); - asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs)); - asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs)); - asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss)); + savesegment(es, ctxt->es); + savesegment(fs, ctxt->fs); + savesegment(gs, ctxt->gs); + savesegment(ss, ctxt->ss); /* * control registers */ - asm volatile ("movl %%cr0, %0" : "=r" (ctxt->cr0)); - asm volatile ("movl %%cr2, %0" : "=r" (ctxt->cr2)); - asm volatile ("movl %%cr3, %0" : "=r" (ctxt->cr3)); - asm volatile ("movl %%cr4, %0" : "=r" (ctxt->cr4)); + ctxt->cr0 = read_cr0(); + ctxt->cr2 = read_cr2(); + ctxt->cr3 = read_cr3(); + ctxt->cr4 = read_cr4(); } void save_processor_state(void) @@ -84,7 +84,6 @@ static void fix_processor_context(void) struct tss_struct * t = &per_cpu(init_tss, cpu); set_tss_desc(cpu,t); /* This just modifies memory; should not be necessary. But... This is necessary, because 386 hardware has concept of busy TSS or some similar stupidity. */ - per_cpu(cpu_gdt_table, cpu)[GDT_ENTRY_TSS].b &= 0xfffffdff; load_TR_desc(); /* This does ltr */ load_LDT(¤t->active_mm->context); /* This does lldt */ @@ -109,25 +108,25 @@ void __restore_processor_state(struct saved_context *ctxt) /* * control registers */ - asm volatile ("movl %0, %%cr4" :: "r" (ctxt->cr4)); - asm volatile ("movl %0, %%cr3" :: "r" (ctxt->cr3)); - asm volatile ("movl %0, %%cr2" :: "r" (ctxt->cr2)); - asm volatile ("movl %0, %%cr0" :: "r" (ctxt->cr0)); + write_cr4(ctxt->cr4); + write_cr3(ctxt->cr3); + write_cr2(ctxt->cr2); + write_cr2(ctxt->cr0); /* * now restore the descriptor tables to their proper values * ltr is done i fix_processor_context(). */ - asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit)); - asm volatile ("lidt %0" :: "m" (ctxt->idt_limit)); + load_gdt(&ctxt->gdt_limit); + load_idt(&ctxt->idt_limit); /* * segment registers */ - asm volatile ("movw %0, %%es" :: "r" (ctxt->es)); - asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs)); - asm volatile ("movw %0, %%gs" :: "r" (ctxt->gs)); - asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss)); + loadsegment(es, ctxt->es); + loadsegment(fs, ctxt->fs); + loadsegment(gs, ctxt->gs); + loadsegment(ss, ctxt->ss); /* * sysenter MSRs |