diff options
author | David Woodhouse <dwmw2@infradead.org> | 2007-07-23 10:20:10 +0100 |
---|---|---|
committer | David Woodhouse <dwmw2@infradead.org> | 2007-07-23 10:20:10 +0100 |
commit | 39fe5434cb9de5da40510028b17b96bc4eb312b3 (patch) | |
tree | 7a02a317b9ad57da51ca99887c119e779ccf3f13 /arch/i386 | |
parent | 0fc72b81d3111d114ab378935b1cf07680ca1289 (diff) | |
parent | f695baf2df9e0413d3521661070103711545207a (diff) |
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'arch/i386')
139 files changed, 10184 insertions, 4685 deletions
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index b1b2b30b1b8..abb582bc218 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -18,6 +18,10 @@ config GENERIC_TIME bool default y +config GENERIC_CMOS_UPDATE + bool + default y + config CLOCKSOURCE_WATCHDOG bool default y @@ -222,6 +226,8 @@ config PARAVIRT However, when run without a hypervisor the kernel is theoretically slower. If in doubt, say N. +source "arch/i386/xen/Kconfig" + config VMI bool "VMI Paravirt-ops support" depends on PARAVIRT @@ -542,6 +548,7 @@ config HIGHMEM4G config HIGHMEM64G bool "64GB" depends on !M386 && !M486 + select X86_PAE help Select this if you have a 32-bit processor and more than 4 gigabytes of physical RAM. @@ -571,12 +578,12 @@ choice config VMSPLIT_3G bool "3G/1G user/kernel split" config VMSPLIT_3G_OPT - depends on !HIGHMEM + depends on !X86_PAE bool "3G/1G user/kernel split (for full 1G low memory)" config VMSPLIT_2G bool "2G/2G user/kernel split" config VMSPLIT_2G_OPT - depends on !HIGHMEM + depends on !X86_PAE bool "2G/2G user/kernel split (for full 2G low memory)" config VMSPLIT_1G bool "1G/3G user/kernel split" @@ -596,10 +603,15 @@ config HIGHMEM default y config X86_PAE - bool - depends on HIGHMEM64G - default y + bool "PAE (Physical Address Extension) Support" + default n + depends on !HIGHMEM4G select RESOURCES_64BIT + help + PAE is required for NX support, and furthermore enables + larger swapspace support for non-overcommit purposes. It + has the cost of more pagetable lookup overhead, and also + consumes more pagetable space per process. # Common NUMA Features config NUMA @@ -815,6 +827,7 @@ config CRASH_DUMP config PHYSICAL_START hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) + default "0x1000000" if X86_NUMAQ default "0x100000" help This gives the physical address where the kernel is loaded. @@ -1212,21 +1225,26 @@ source "drivers/Kconfig" source "fs/Kconfig" -menu "Instrumentation Support" +menuconfig INSTRUMENTATION + bool "Instrumentation Support" depends on EXPERIMENTAL + default y + +if INSTRUMENTATION source "arch/i386/oprofile/Kconfig" config KPROBES - bool "Kprobes (EXPERIMENTAL)" - depends on KALLSYMS && EXPERIMENTAL && MODULES + bool "Kprobes" + depends on KALLSYMS && MODULES help Kprobes allows you to trap at almost any kernel address and execute a callback function. register_kprobe() establishes a probepoint and specifies the callback. Kprobes is useful for kernel debugging, non-intrusive instrumentation and testing. If in doubt, say "N". -endmenu + +endif # INSTRUMENTATION source "arch/i386/Kconfig.debug" diff --git a/arch/i386/Kconfig.cpu b/arch/i386/Kconfig.cpu index 5c95ceb7f12..11a24d54f27 100644 --- a/arch/i386/Kconfig.cpu +++ b/arch/i386/Kconfig.cpu @@ -297,11 +297,6 @@ config X86_POPAD_OK depends on !M386 default y -config X86_CMPXCHG64 - bool - depends on X86_PAE - default y - config X86_ALIGNMENT_16 bool depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 @@ -344,8 +339,8 @@ config X86_CMOV depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7) default y -config X86_MINIMUM_CPU_MODEL +config X86_MINIMUM_CPU_FAMILY int - default "4" if X86_XADD || X86_CMPXCHG || X86_BSWAP - default "0" + default "4" if X86_XADD || X86_CMPXCHG || X86_BSWAP || X86_WP_WORKS_OK + default "3" diff --git a/arch/i386/Makefile b/arch/i386/Makefile index bd28f9f9b4b..01f0ff0daaf 100644 --- a/arch/i386/Makefile +++ b/arch/i386/Makefile @@ -93,6 +93,9 @@ mflags-$(CONFIG_X86_ES7000) := -Iinclude/asm-i386/mach-es7000 mcore-$(CONFIG_X86_ES7000) := mach-default core-$(CONFIG_X86_ES7000) := arch/i386/mach-es7000/ +# Xen paravirtualization support +core-$(CONFIG_XEN) += arch/i386/xen/ + # default subarch .h files mflags-y += -Iinclude/asm-i386/mach-default @@ -108,6 +111,7 @@ drivers-$(CONFIG_PCI) += arch/i386/pci/ # must be linked after kernel/ drivers-$(CONFIG_OPROFILE) += arch/i386/oprofile/ drivers-$(CONFIG_PM) += arch/i386/power/ +drivers-$(CONFIG_FB) += arch/i386/video/ CFLAGS += $(mflags-y) AFLAGS += $(mflags-y) diff --git a/arch/i386/boot/.gitignore b/arch/i386/boot/.gitignore index 495f20c085d..18465143cfa 100644 --- a/arch/i386/boot/.gitignore +++ b/arch/i386/boot/.gitignore @@ -1,3 +1,5 @@ bootsect bzImage setup +setup.bin +setup.elf diff --git a/arch/i386/boot/Makefile b/arch/i386/boot/Makefile index bfbc32098a4..93386a4e40b 100644 --- a/arch/i386/boot/Makefile +++ b/arch/i386/boot/Makefile @@ -25,27 +25,56 @@ SVGA_MODE := -DSVGA_MODE=NORMAL_VGA #RAMDISK := -DRAMDISK=512 -targets := vmlinux.bin bootsect bootsect.o \ - setup setup.o zImage bzImage +targets := vmlinux.bin setup.bin setup.elf zImage bzImage subdir- := compressed +setup-y += a20.o apm.o cmdline.o copy.o cpu.o cpucheck.o edd.o +setup-y += header.o main.o mca.o memory.o pm.o pmjump.o +setup-y += printf.o string.o tty.o video.o version.o voyager.o + +# The link order of the video-*.o modules can matter. In particular, +# video-vga.o *must* be listed first, followed by video-vesa.o. +# Hardware-specific drivers should follow in the order they should be +# probed, and video-bios.o should typically be last. +setup-y += video-vga.o +setup-y += video-vesa.o +setup-y += video-bios.o +targets += $(setup-y) hostprogs-y := tools/build HOSTCFLAGS_build.o := $(LINUXINCLUDE) # --------------------------------------------------------------------------- +# How to compile the 16-bit code. Note we always compile for -march=i386, +# that way we can complain to the user if the CPU is insufficient. +cflags-i386 := +cflags-x86_64 := -m32 +CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \ + $(cflags-$(ARCH)) \ + -Wall -Wstrict-prototypes \ + -march=i386 -mregparm=3 \ + -include $(srctree)/$(src)/code16gcc.h \ + -fno-strict-aliasing -fomit-frame-pointer \ + $(call cc-option, -ffreestanding) \ + $(call cc-option, -fno-toplevel-reorder,\ + $(call cc-option, -fno-unit-at-a-time)) \ + $(call cc-option, -fno-stack-protector) \ + $(call cc-option, -mpreferred-stack-boundary=2) +AFLAGS := $(CFLAGS) -D__ASSEMBLY__ + $(obj)/zImage: IMAGE_OFFSET := 0x1000 $(obj)/zImage: EXTRA_AFLAGS := $(SVGA_MODE) $(RAMDISK) $(obj)/bzImage: IMAGE_OFFSET := 0x100000 +$(obj)/bzImage: EXTRA_CFLAGS := -D__BIG_KERNEL__ $(obj)/bzImage: EXTRA_AFLAGS := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__ $(obj)/bzImage: BUILDFLAGS := -b quiet_cmd_image = BUILD $@ -cmd_image = $(obj)/tools/build $(BUILDFLAGS) $(obj)/bootsect $(obj)/setup \ +cmd_image = $(obj)/tools/build $(BUILDFLAGS) $(obj)/setup.bin \ $(obj)/vmlinux.bin $(ROOT_DEV) > $@ -$(obj)/zImage $(obj)/bzImage: $(obj)/bootsect $(obj)/setup \ +$(obj)/zImage $(obj)/bzImage: $(obj)/setup.bin \ $(obj)/vmlinux.bin $(obj)/tools/build FORCE $(call if_changed,image) @echo 'Kernel: $@ is ready' ' (#'`cat .version`')' @@ -53,12 +82,17 @@ $(obj)/zImage $(obj)/bzImage: $(obj)/bootsect $(obj)/setup \ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE $(call if_changed,objcopy) -LDFLAGS_bootsect := -Ttext 0x0 -s --oformat binary -LDFLAGS_setup := -Ttext 0x0 -s --oformat binary -e begtext +SETUP_OBJS = $(addprefix $(obj)/,$(setup-y)) -$(obj)/setup $(obj)/bootsect: %: %.o FORCE +LDFLAGS_setup.elf := -T +$(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE $(call if_changed,ld) +OBJCOPYFLAGS_setup.bin := -O binary + +$(obj)/setup.bin: $(obj)/setup.elf FORCE + $(call if_changed,objcopy) + $(obj)/compressed/vmlinux: FORCE $(Q)$(MAKE) $(build)=$(obj)/compressed IMAGE_OFFSET=$(IMAGE_OFFSET) $@ diff --git a/arch/i386/boot/a20.c b/arch/i386/boot/a20.c new file mode 100644 index 00000000000..31348d054fc --- /dev/null +++ b/arch/i386/boot/a20.c @@ -0,0 +1,161 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright 2007 rPath, Inc. - All Rights Reserved + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + * + * ----------------------------------------------------------------------- */ + +/* + * arch/i386/boot/a20.c + * + * Enable A20 gate (return -1 on failure) + */ + +#include "boot.h" + +#define MAX_8042_LOOPS 100000 + +static int empty_8042(void) +{ + u8 status; + int loops = MAX_8042_LOOPS; + + while (loops--) { + io_delay(); + + status = inb(0x64); + if (status & 1) { + /* Read and discard input data */ + io_delay(); + (void)inb(0x60); + } else if (!(status & 2)) { + /* Buffers empty, finished! */ + return 0; + } + } + + return -1; +} + +/* Returns nonzero if the A20 line is enabled. The memory address + used as a test is the int $0x80 vector, which should be safe. */ + +#define A20_TEST_ADDR (4*0x80) +#define A20_TEST_SHORT 32 +#define A20_TEST_LONG 2097152 /* 2^21 */ + +static int a20_test(int loops) +{ + int ok = 0; + int saved, ctr; + + set_fs(0x0000); + set_gs(0xffff); + + saved = ctr = rdfs32(A20_TEST_ADDR); + + while (loops--) { + wrfs32(++ctr, A20_TEST_ADDR); + io_delay(); /* Serialize and make delay constant */ + ok = rdgs32(A20_TEST_ADDR+0x10) ^ ctr; + if (ok) + break; + } + + wrfs32(saved, A20_TEST_ADDR); + return ok; +} + +/* Quick test to see if A20 is already enabled */ +static int a20_test_short(void) +{ + return a20_test(A20_TEST_SHORT); +} + +/* Longer test that actually waits for A20 to come on line; this + is useful when dealing with the KBC or other slow external circuitry. */ +static int a20_test_long(void) +{ + return a20_test(A20_TEST_LONG); +} + +static void enable_a20_bios(void) +{ + asm volatile("pushfl; int $0x15; popfl" + : : "a" ((u16)0x2401)); +} + +static void enable_a20_kbc(void) +{ + empty_8042(); + + outb(0xd1, 0x64); /* Command write */ + empty_8042(); + + outb(0xdf, 0x60); /* A20 on */ + empty_8042(); +} + +static void enable_a20_fast(void) +{ + u8 port_a; + + port_a = inb(0x92); /* Configuration port A */ + port_a |= 0x02; /* Enable A20 */ + port_a &= ~0x01; /* Do not reset machine */ + outb(port_a, 0x92); +} + +/* + * Actual routine to enable A20; return 0 on ok, -1 on failure + */ + +#define A20_ENABLE_LOOPS 255 /* Number of times to try */ + +int enable_a20(void) +{ + int loops = A20_ENABLE_LOOPS; + +#if defined(CONFIG_X86_ELAN) + /* Elan croaks if we try to touch the KBC */ + enable_a20_fast(); + while (!a20_test_long()) + ; + return 0; +#elif defined(CONFIG_X86_VOYAGER) + /* On Voyager, a20_test() is unsafe? */ + enable_a20_kbc(); + return 0; +#else + while (loops--) { + /* First, check to see if A20 is already enabled + (legacy free, etc.) */ + if (a20_test_short()) + return 0; + + /* Next, try the BIOS (INT 0x15, AX=0x2401) */ + enable_a20_bios(); + if (a20_test_short()) + return 0; + + /* Try enabling A20 through the keyboard controller */ + empty_8042(); + if (a20_test_short()) + return 0; /* BIOS worked, but with delayed reaction */ + + enable_a20_kbc(); + if (a20_test_long()) + return 0; + + /* Finally, try enabling the "fast A20 gate" */ + enable_a20_fast(); + if (a20_test_long()) + return 0; + } + + return -1; +#endif +} diff --git a/arch/i386/boot/apm.c b/arch/i386/boot/apm.c new file mode 100644 index 00000000000..a34087c370c --- /dev/null +++ b/arch/i386/boot/apm.c @@ -0,0 +1,97 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright 2007 rPath, Inc. - All Rights Reserved + * + * Original APM BIOS checking by Stephen Rothwell, May 1994 + * (sfr@canb.auug.org.au) + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + * + * ----------------------------------------------------------------------- */ + +/* + * arch/i386/boot/apm.c + * + * Get APM BIOS information + */ + +#include "boot.h" + +#if defined(CONFIG_APM) || defined(CONFIG_APM_MODULE) + +int query_apm_bios(void) +{ + u16 ax, bx, cx, dx, di; + u32 ebx, esi; + u8 err; + + /* APM BIOS installation check */ + ax = 0x5300; + bx = cx = 0; + asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0" + : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx) + : : "esi", "edi"); + + if (err) + return -1; /* No APM BIOS */ + + if (bx != 0x504d) /* "PM" signature */ + return -1; + + if (cx & 0x02) /* 32 bits supported? */ + return -1; + + /* Disconnect first, just in case */ + ax = 0x5304; + asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp" + : "+a" (ax) + : : "ebx", "ecx", "edx", "esi", "edi"); + + /* Paranoia */ + ebx = esi = 0; + cx = dx = di = 0; + + /* 32-bit connect */ + asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %6" + : "=a" (ax), "+b" (ebx), "+c" (cx), "+d" (dx), + "+S" (esi), "+D" (di), "=m" (err) + : "a" (0x5303)); + + boot_params.apm_bios_info.cseg = ax; + boot_params.apm_bios_info.offset = ebx; + boot_params.apm_bios_info.cseg_16 = cx; + boot_params.apm_bios_info.dseg = dx; + boot_params.apm_bios_info.cseg_len = (u16)esi; + boot_params.apm_bios_info.cseg_16_len = esi >> 16; + boot_params.apm_bios_info.dseg_len = di; + + if (err) + return -1; + + /* Redo the installation check as the 32-bit connect; + some BIOSes return different flags this way... */ + + ax = 0x5300; + bx = cx = 0; + asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0" + : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx) + : : "esi", "edi"); + + if (err || bx != 0x504d) { + /* Failure with 32-bit connect, try to disconect and ignore */ + ax = 0x5304; + bx = 0; + asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp" + : "+a" (ax), "+b" (bx) + : : "ecx", "edx", "esi", "edi"); + return -1; + } + + boot_params.apm_bios_info.version = ax; + boot_params.apm_bios_info.flags = cx; + return 0; +} + +#endif diff --git a/arch/i386/boot/bitops.h b/arch/i386/boot/bitops.h new file mode 100644 index 00000000000..8dcc8dc7db8 --- /dev/null +++ b/arch/i386/boot/bitops.h @@ -0,0 +1,45 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright 2007 rPath, Inc. - All Rights Reserved + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + * + * ----------------------------------------------------------------------- */ + +/* + * arch/i386/boot/bitops.h + * + * Very simple bitops for the boot code. + */ + +#ifndef BOOT_BITOPS_H +#define BOOT_BITOPS_H +#define _LINUX_BITOPS_H /* Inhibit inclusion of <linux/bitops.h> */ + +static inline int constant_test_bit(int nr, const void *addr) +{ + const u32 *p = (const u32 *)addr; + return ((1UL << (nr & 31)) & (p[nr >> 5])) != 0; +} +static inline int variable_test_bit(int nr, const void *addr) +{ + u8 v; + const u32 *p = (const u32 *)addr; + + asm("btl %2,%1; setc %0" : "=qm" (v) : "m" (*p), "Ir" (nr)); + return v; +} + +#define test_bit(nr,addr) \ +(__builtin_constant_p(nr) ? \ + constant_test_bit((nr),(addr)) : \ + variable_test_bit((nr),(addr))) + +static inline void set_bit(int nr, void *addr) +{ + asm("btsl %1,%0" : "+m" (*(u32 *)addr) : "Ir" (nr)); +} + +#endif /* BOOT_BITOPS_H */ diff --git a/arch/i386/boot/boot.h b/arch/i386/boot/boot.h new file mode 100644 index 00000000000..dec70c9b605 --- /dev/null +++ b/arch/i386/boot/boot.h @@ -0,0 +1,296 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright 2007 rPath, Inc. - All Rights Reserved + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + * + * ----------------------------------------------------------------------- */ + +/* + * arch/i386/boot/boot.h + * + * Header file for the real-mode kernel code + */ + +#ifndef BOOT_BOOT_H +#define BOOT_BOOT_H + +#ifndef __ASSEMBLY__ + +#include <stdarg.h> +#include <linux/types.h> +#include <linux/edd.h> +#include <asm/boot.h> +#include <asm/bootparam.h> + +/* Useful macros */ +#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) + +extern struct setup_header hdr; +extern struct boot_params boot_params; + +/* Basic port I/O */ +static inline void outb(u8 v, u16 port) +{ + asm volatile("outb %0,%1" : : "a" (v), "dN" (port)); +} +static inline u8 inb(u16 port) +{ + u8 v; + asm volatile("inb %1,%0" : "=a" (v) : "dN" (port)); + return v; +} + +static inline void outw(u16 v, u16 port) +{ + asm volatile("outw %0,%1" : : "a" (v), "dN" (port)); +} +static inline u16 inw(u16 port) +{ + u16 v; + asm volatile("inw %1,%0" : "=a" (v) : "dN" (port)); + return v; +} + +static inline void outl(u32 v, u16 port) +{ + asm volatile("outl %0,%1" : : "a" (v), "dN" (port)); +} +static inline u32 inl(u32 port) +{ + u32 v; + asm volatile("inl %1,%0" : "=a" (v) : "dN" (port)); + return v; +} + +static inline void io_delay(void) +{ + const u16 DELAY_PORT = 0x80; + asm volatile("outb %%al,%0" : : "dN" (DELAY_PORT)); +} + +/* These functions are used to reference data in other segments. */ + +static inline u16 ds(void) +{ + u16 seg; + asm("movw %%ds,%0" : "=rm" (seg)); + return seg; +} + +static inline void set_fs(u16 seg) +{ + asm volatile("movw %0,%%fs" : : "rm" (seg)); +} +static inline u16 fs(void) +{ + u16 seg; + asm("movw %%fs,%0" : "=rm" (seg)); + return seg; +} + +static inline void set_gs(u16 seg) +{ + asm volatile("movw %0,%%gs" : : "rm" (seg)); +} +static inline u16 gs(void) +{ + u16 seg; + asm("movw %%gs,%0" : "=rm" (seg)); + return seg; +} + +typedef unsigned int addr_t; + +static inline u8 rdfs8(addr_t addr) +{ + u8 v; + asm("movb %%fs:%1,%0" : "=r" (v) : "m" (*(u8 *)addr)); + return v; +} +static inline u16 rdfs16(addr_t addr) +{ + u16 v; + asm("movw %%fs:%1,%0" : "=r" (v) : "m" (*(u16 *)addr)); + return v; +} +static inline u32 rdfs32(addr_t addr) +{ + u32 v; + asm("movl %%fs:%1,%0" : "=r" (v) : "m" (*(u32 *)addr)); + return v; +} + +static inline void wrfs8(u8 v, addr_t addr) +{ + asm volatile("movb %1,%%fs:%0" : "+m" (*(u8 *)addr) : "r" (v)); +} +static inline void wrfs16(u16 v, addr_t addr) +{ + asm volatile("movw %1,%%fs:%0" : "+m" (*(u16 *)addr) : "r" (v)); +} +static inline void wrfs32(u32 v, addr_t addr) +{ + asm volatile("movl %1,%%fs:%0" : "+m" (*(u32 *)addr) : "r" (v)); +} + +static inline u8 rdgs8(addr_t addr) +{ + u8 v; + asm("movb %%gs:%1,%0" : "=r" (v) : "m" (*(u8 *)addr)); + return v; +} +static inline u16 rdgs16(addr_t addr) +{ + u16 v; + asm("movw %%gs:%1,%0" : "=r" (v) : "m" (*(u16 *)addr)); + return v; +} +static inline u32 rdgs32(addr_t addr) +{ + u32 v; + asm("movl %%gs:%1,%0" : "=r" (v) : "m" (*(u32 *)addr)); + return v; +} + +static inline void wrgs8(u8 v, addr_t addr) +{ + asm volatile("movb %1,%%gs:%0" : "+m" (*(u8 *)addr) : "r" (v)); +} +static inline void wrgs16(u16 v, addr_t addr) +{ + asm volatile("movw %1,%%gs:%0" : "+m" (*(u16 *)addr) : "r" (v)); +} +static inline void wrgs32(u32 v, addr_t addr) +{ + asm volatile("movl %1,%%gs:%0" : "+m" (*(u32 *)addr) : "r" (v)); +} + +/* Note: these only return true/false, not a signed return value! */ +static inline int memcmp(const void *s1, const void *s2, size_t len) +{ + u8 diff; + asm("repe; cmpsb; setnz %0" + : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len)); + return diff; +} + +static inline int memcmp_fs(const void *s1, addr_t s2, size_t len) +{ + u8 diff; + asm("fs; repe; cmpsb; setnz %0" + : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len)); + return diff; +} +static inline int memcmp_gs(const void *s1, addr_t s2, size_t len) +{ + u8 diff; + asm("gs; repe; cmpsb; setnz %0" + : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len)); + return diff; +} + +static inline int isdigit(int ch) +{ + return (ch >= '0') && (ch <= '9'); +} + +/* Heap -- available for dynamic lists. */ +#define STACK_SIZE 512 /* Minimum number of bytes for stack */ + +extern char _end[]; +extern char *HEAP; +extern char *heap_end; +#define RESET_HEAP() ((void *)( HEAP = _end )) +static inline char *__get_heap(size_t s, size_t a, size_t n) +{ + char *tmp; + + HEAP = (char *)(((size_t)HEAP+(a-1)) & ~(a-1)); + tmp = HEAP; + HEAP += s*n; + return tmp; +} +#define GET_HEAP(type, n) \ + ((type *)__get_heap(sizeof(type),__alignof__(type),(n))) + +static inline int heap_free(void) +{ + return heap_end-HEAP; +} + +/* copy.S */ + +void copy_to_fs(addr_t dst, void *src, size_t len); +void *copy_from_fs(void *dst, addr_t src, size_t len); +void copy_to_gs(addr_t dst, void *src, size_t len); +void *copy_from_gs(void *dst, addr_t src, size_t len); +void *memcpy(void *dst, void *src, size_t len); +void *memset(void *dst, int c, size_t len); + +#define memcpy(d,s,l) __builtin_memcpy(d,s,l) +#define memset(d,c,l) __builtin_memset(d,c,l) + +/* a20.c */ +int enable_a20(void); + +/* apm.c */ +int query_apm_bios(void); + +/* cmdline.c */ +int cmdline_find_option(const char *option, char *buffer, int bufsize); + +/* cpu.c, cpucheck.c */ +int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr); +int validate_cpu(void); + +/* edd.c */ +void query_edd(void); + +/* header.S */ +void __attribute__((noreturn)) die(void); + +/* mca.c */ +int query_mca(void); + +/* memory.c */ +int detect_memory(void); + +/* pm.c */ +void __attribute__((noreturn)) go_to_protected_mode(void); + +/* pmjump.S */ +void __attribute__((noreturn)) + protected_mode_jump(u32 entrypoint, u32 bootparams); + +/* printf.c */ +int sprintf(char *buf, const char *fmt, ...); +int vsprintf(char *buf, const char *fmt, va_list args); +int printf(const char *fmt, ...); + +/* string.c */ +int strcmp(const char *str1, const char *str2); +size_t strnlen(const char *s, size_t maxlen); +unsigned int atou(const char *s); + +/* tty.c */ +void puts(const char *); +void putchar(int); +int getchar(void); +void kbd_flush(void); +int getchar_timeout(void); + +/* video.c */ +void set_video(void); + +/* video-vesa.c */ +void vesa_store_edid(void); + +/* voyager.c */ +int query_voyager(void); + +#endif /* __ASSEMBLY__ */ + +#endif /* BOOT_BOOT_H */ diff --git a/arch/i386/boot/bootsect.S b/arch/i386/boot/bootsect.S deleted file mode 100644 index 011b7a4993d..00000000000 --- a/arch/i386/boot/bootsect.S +++ /dev/null @@ -1,98 +0,0 @@ -/* - * bootsect.S Copyright (C) 1991, 1992 Linus Torvalds - * - * modified by Drew Eckhardt - * modified by Bruce Evans (bde) - * modified by Chris Noe (May 1999) (as86 -> gas) - * gutted by H. Peter Anvin (Jan 2003) - * - * BIG FAT NOTE: We're in real mode using 64k segments. Therefore segment - * addresses must be multiplied by 16 to obtain their respective linear - * addresses. To avoid confusion, linear addresses are written using leading - * hex while segment addresses are written as segment:offset. - * - */ - -#include <asm/boot.h> - -SETUPSECTS = 4 /* default nr of setup-sectors */ -BOOTSEG = 0x07C0 /* original address of boot-sector */ -INITSEG = DEF_INITSEG /* we move boot here - out of the way */ -SETUPSEG = DEF_SETUPSEG /* setup starts here */ -SYSSEG = DEF_SYSSEG /* system loaded at 0x10000 (65536) */ -SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */ - /* to be loaded */ -ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */ -SWAP_DEV = 0 /* SWAP_DEV is now written by "build" */ - -#ifndef SVGA_MODE -#define SVGA_MODE ASK_VGA -#endif - -#ifndef RAMDISK -#define RAMDISK 0 -#endif - -#ifndef ROOT_RDONLY -#define ROOT_RDONLY 1 -#endif - -.code16 -.text - -.global _start -_start: - - # Normalize the start address - jmpl $BOOTSEG, $start2 - -start2: - movw %cs, %ax - movw %ax, %ds - movw %ax, %es - movw %ax, %ss - movw $0x7c00, %sp - sti - cld - - movw $bugger_off_msg, %si - -msg_loop: - lodsb - andb %al, %al - jz die - movb $0xe, %ah - movw $7, %bx - int $0x10 - jmp msg_loop - -die: - # Allow the user to press a key, then reboot - xorw %ax, %ax - int $0x16 - int $0x19 - - # int 0x19 should never return. In case it does anyway, - # invoke the BIOS reset code... - ljmp $0xf000,$0xfff0 - - -bugger_off_msg: - .ascii "Direct booting from floppy is no longer supported.\r\n" - .ascii "Please use a boot loader program instead.\r\n" - .ascii "\n" - .ascii "Remove disk and press any key to reboot . . .\r\n" - .byte 0 - - - # Kernel attributes; used by setup - - .org 497 -setup_sects: .byte SETUPSECTS -root_flags: .word ROOT_RDONLY -syssize: .word SYSSIZE -swap_dev: .word SWAP_DEV -ram_size: .word RAMDISK -vid_mode: .word SVGA_MODE -root_dev: .word ROOT_DEV -boot_flag: .word 0xAA55 diff --git a/arch/i386/boot/cmdline.c b/arch/i386/boot/cmdline.c new file mode 100644 index 00000000000..34bb778c435 --- /dev/null +++ b/arch/i386/boot/cmdline.c @@ -0,0 +1,97 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright 2007 rPath, Inc. - All Rights Reserved + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + * + * ----------------------------------------------------------------------- */ + +/* + * arch/i386/boot/cmdline.c + * + * Simple command-line parser for early boot. + */ + +#include "boot.h" + +static inline int myisspace(u8 c) +{ + return c <= ' '; /* Close enough approximation */ +} + +/* + * Find a non-boolean option, that is, "option=argument". In accordance + * with standard Linux practice, if this option is repeated, this returns + * the last instance on the command line. + * + * Returns the length of the argument (regardless of if it was + * truncated to fit in the buffer), or -1 on not found. + */ +int cmdline_find_option(const char *option, char *buffer, int bufsize) +{ + u32 cmdline_ptr = boot_params.hdr.cmd_line_ptr; + addr_t cptr; + char c; + int len = -1; + const char *opptr = NULL; + char *bufptr = buffer; + enum { + st_wordstart, /* Start of word/after whitespace */ + st_wordcmp, /* Comparing this word */ + st_wordskip, /* Miscompare, skip */ + st_bufcpy /* Copying this to buffer */ + } state = st_wordstart; + + if (!cmdline_ptr || cmdline_ptr >= 0x100000) + return -1; /* No command line, or inaccessible */ + + cptr = cmdline_ptr & 0xf; + set_fs(cmdline_ptr >> 4); + + while (cptr < 0x10000 && (c = rdfs8(cptr++))) { + switch (state) { + case st_wordstart: + if (myisspace(c)) + break; + + /* else */ + state = st_wordcmp; + opptr = option; + /* fall through */ + + case st_wordcmp: + if (c == '=' && !*opptr) { + len = 0; + bufptr = buffer; + state = st_bufcpy; + } else if (myisspace(c)) { + state = st_wordstart; + } else if (c != *opptr++) { + state = st_wordskip; + } + break; + + case st_wordskip: + if (myisspace(c)) + state = st_wordstart; + break; + + case st_bufcpy: + if (myisspace(c)) { + state = st_wordstart; + } else { + if (len < bufsize-1) + *bufptr++ = c; + len++; + } + break; + } + } + + if (bufsize) + *bufptr = '\0'; + + return len; +} diff --git a/arch/i386/boot/code16gcc.h b/arch/i386/boot/code16gcc.h new file mode 100644 index 00000000000..3bd848093b9 --- /dev/null +++ b/arch/i386/boot/code16gcc.h @@ -0,0 +1,15 @@ +/* + * code16gcc.h + * + * This file is -include'd when compiling 16-bit C code. + * Note: this asm() needs to be emitted before gcc omits any code. + * Depending on gcc version, this requires -fno-unit-at-a-time or + * -fno-toplevel-reorder. + * + * Hopefully gcc will eventually have a real -m16 option so we can + * drop this hack long term. + */ + +#ifndef __ASSEMBLY__ +asm(".code16gcc"); +#endif diff --git a/arch/i386/boot/compressed/Makefile b/arch/i386/boot/compressed/Makefile index a661217f33e..189fa1dbefc 100644 --- a/arch/i386/boot/compressed/Makefile +++ b/arch/i386/boot/compressed/Makefile @@ -9,9 +9,14 @@ targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o \ EXTRA_AFLAGS := -traditional LDFLAGS_vmlinux := -T -CFLAGS_misc.o += -fPIC hostprogs-y := relocs +CFLAGS := -m32 -D__KERNEL__ $(LINUX_INCLUDE) -O2 \ + -fno-strict-aliasing -fPIC \ + $(call cc-option,-ffreestanding) \ + $(call cc-option,-fno-stack-protector) +LDFLAGS := -m elf_i386 + $(obj)/vmlinux: $(src)/vmlinux.lds $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE $(call if_changed,ld) @: diff --git a/arch/i386/boot/compressed/head.S b/arch/i386/boot/compressed/head.S index 3517a32aaf4..f35ea223752 100644 --- a/arch/i386/boot/compressed/head.S +++ b/arch/i386/boot/compressed/head.S @@ -45,10 +45,10 @@ startup_32: * at and where we were actually loaded at. This can only be done * with a short local call on x86. Nothing else will tell us what * address we are running at. The reserved chunk of the real-mode - * data at 0x34-0x3f are used as the stack for this calculation. - * Only 4 bytes are needed. + * data at 0x1e4 (defined as a scratch field) are used as the stack + * for this calculation. Only 4 bytes are needed. */ - leal 0x40(%esi), %esp + leal (0x1e4+4)(%esi), %esp call 1f 1: popl %ebp subl $1b, %ebp diff --git a/arch/i386/boot/compressed/relocs.c b/arch/i386/boot/compressed/relocs.c index ce4fda261aa..2d77ee728f9 100644 --- a/arch/i386/boot/compressed/relocs.c +++ b/arch/i386/boot/compressed/relocs.c @@ -31,6 +31,9 @@ static const char* safe_abs_relocs[] = { "__kernel_rt_sigreturn", "__kernel_sigreturn", "SYSENTER_RETURN", + "VDSO_NOTE_MASK", + "xen_irq_disable_direct_reloc", + "xen_save_fl_direct_reloc", }; static int is_safe_abs_reloc(const char* sym_name) diff --git a/arch/i386/boot/copy.S b/arch/i386/boot/copy.S new file mode 100644 index 00000000000..ef127e56a3c --- /dev/null +++ b/arch/i386/boot/copy.S @@ -0,0 +1,101 @@ +/* ----------------------------------------------------------------------- * + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright 2007 rPath, Inc. - All Rights Reserved + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + * + * ----------------------------------------------------------------------- */ + +/* + * arch/i386/boot/copy.S + * + * Memory copy routines + */ + + .code16gcc + .text + + .globl memcpy + .type memcpy, @function +memcpy: + pushw %si + pushw %di + movw %ax, %di + movw %dx, %si + pushw %cx + shrw $2, %cx + rep; movsl + popw %cx + andw $3, %cx + rep; movsb + popw %di + popw %si + ret + .size memcpy, .-memcpy + + .globl memset + .type memset, @function +memset: + pushw %di + movw %ax, %di + movzbl %dl, %eax + imull $0x01010101,%eax + pushw %cx + shrw $2, %cx + rep; stosl + popw %cx + andw $3, %cx + rep; stosb + popw %di + ret + .size memset, .-memset + + .globl copy_from_fs + .type copy_from_fs, @function +copy_from_fs: + pushw %ds + pushw %fs + popw %ds + call memcpy + popw %ds + ret + .size copy_from_fs, .-copy_from_fs + + .globl copy_to_fs + .type copy_to_fs, @function +copy_to_fs: + pushw %es + pushw %fs + popw %es + call memcpy + popw %es + ret + .size copy_to_fs, .-copy_to_fs + +#if 0 /* Not currently used, but can be enabled as needed */ + + .globl copy_from_gs + .type copy_from_gs, @function +copy_from_gs: + pushw %ds + pushw %gs + popw %ds + call memcpy + popw %ds + ret + .size copy_from_gs, .-copy_from_gs + .globl copy_to_gs + + .type copy_to_gs, @function +copy_to_gs: + pushw %es + pushw %gs + popw %es + call memcpy + popw %es + ret + .size copy_to_gs, .-copy_to_gs + +#endif diff --git a/arch/i386/boot/cpu.c b/arch/i386/boot/cpu.c new file mode 100644 index 00000000000..2a5c32da585 --- /dev/null +++ b/arch/i386/boot/cpu.c @@ -0,0 +1,69 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright 2007 rPath, Inc. - All Rights Reserved + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + * + * ----------------------------------------------------------------------- */ + +/* + * arch/i386/boot/cpu.c + * + * Check for obligatory CPU features and abort if the features are not + * present. + */ + +#include "boot.h" +#include "bitops.h" +#include <asm/cpufeature.h> + +static char *cpu_name(int level) +{ + static char buf[6]; + + if (level == 64) { + return "x86-64"; + } else { + sprintf(buf, "i%d86", level); + return buf; + } +} + +int validate_cpu(void) +{ + u32 *err_flags; + int cpu_level, req_level; + + check_cpu(&cpu_level, &req_level, &err_flags); + + if (cpu_level < req_level) { + printf("This kernel requires an %s CPU, ", + cpu_name(req_level)); + printf("but only detected an %s CPU.\n", + cpu_name(cpu_level)); + return -1; + } + + if (err_flags) { + int i, j; + puts("This kernel requires the following features " + "not present on the CPU:\n"); + + for (i = 0; i < NCAPINTS; i++) { + u32 e = err_flags[i]; + + for (j = 0; j < 32; j++) { + if (e & 1) + printf("%d:%d ", i, j); + + e >>= 1; + } + } + putchar('\n'); + return -1; + } else { + return 0; + } +} diff --git a/arch/i386/boot/cpucheck.c b/arch/i386/boot/cpucheck.c new file mode 100644 index 00000000000..991e8ceae1d --- /dev/null +++ b/arch/i386/boot/cpucheck.c @@ -0,0 +1,267 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright 2007 rPath, Inc. - All Rights Reserved + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + * + * ----------------------------------------------------------------------- */ + +/* + * arch/i386/boot/cpucheck.c + * + * Check for obligatory CPU features and abort if the features are not + * present. This code should be compilable as 16-, 32- or 64-bit + * code, so be very careful with types and inline assembly. + * + * This code should not contain any messages; that requires an + * additional wrapper. + * + * As written, this code is not safe for inclusion into the kernel + * proper (after FPU initialization, in particular). + */ + +#ifdef _SETUP +# include "boot.h" +# include "bitops.h" +#endif +#include <linux/types.h> +#include <asm/cpufeature.h> +#include <asm/processor-flags.h> +#include <asm/required-features.h> +#include <asm/msr-index.h> + +struct cpu_features { + int level; /* Family, or 64 for x86-64 */ + int model; + u32 flags[NCAPINTS]; +}; + +static struct cpu_features cpu; +static u32 cpu_vendor[3]; +static u32 err_flags[NCAPINTS]; + +#ifdef CONFIG_X86_64 +static const int req_level = 64; +#elif defined(CONFIG_X86_MINIMUM_CPU_FAMILY) +static const int req_level = CONFIG_X86_MINIMUM_CPU_FAMILY; +#else +static const int req_level = 3; +#endif + +static const u32 req_flags[NCAPINTS] = +{ + REQUIRED_MASK0, + REQUIRED_MASK1, + REQUIRED_MASK2, + REQUIRED_MASK3, + REQUIRED_MASK4, + REQUIRED_MASK5, + REQUIRED_MASK6, + REQUIRED_MASK7, +}; + +#define A32(a,b,c,d) (((d) << 24)+((c) << 16)+((b) << 8)+(a)) + +static int is_amd(void) +{ + return cpu_vendor[0] == A32('A','u','t','h') && + cpu_vendor[1] == A32('e','n','t','i') && + cpu_vendor[2] == A32('c','A','M','D'); +} + +static int is_centaur(void) +{ + return cpu_vendor[0] == A32('C','e','n','t') && + cpu_vendor[1] == A32('a','u','r','H') && + cpu_vendor[2] == A32('a','u','l','s'); +} + +static int is_transmeta(void) +{ + return cpu_vendor[0] == A32('G','e','n','u') && + cpu_vendor[1] == A32('i','n','e','T') && + cpu_vendor[2] == A32('M','x','8','6'); +} + +static int has_fpu(void) +{ + u16 fcw = -1, fsw = -1; + u32 cr0; + + asm("movl %%cr0,%0" : "=r" (cr0)); + if (cr0 & (X86_CR0_EM|X86_CR0_TS)) { + cr0 &= ~(X86_CR0_EM|X86_CR0_TS); + asm volatile("movl %0,%%cr0" : : "r" (cr0)); + } + + asm("fninit ; fnstsw %0 ; fnstcw %1" : "+m" (fsw), "+m" (fcw)); + + return fsw == 0 && (fcw & 0x103f) == 0x003f; +} + +static int has_eflag(u32 mask) +{ + u32 f0, f1; + + asm("pushfl ; " + "pushfl ; " + "popl %0 ; " + "movl %0,%1 ; " + "xorl %2,%1 ; " + "pushl %1 ; " + "popfl ; " + "pushfl ; " + "popl %1 ; " + "popfl" + : "=&r" (f0), "=&r" (f1) + : "ri" (mask)); + + return !!((f0^f1) & mask); +} + +static void get_flags(void) +{ + u32 max_intel_level, max_amd_level; + u32 tfms; + + if (has_fpu()) + set_bit(X86_FEATURE_FPU, cpu.flags); + + if (has_eflag(X86_EFLAGS_ID)) { + asm("cpuid" + : "=a" (max_intel_level), + "=b" (cpu_vendor[0]), + "=d" (cpu_vendor[1]), + "=c" (cpu_vendor[2]) + : "a" (0)); + + if (max_intel_level >= 0x00000001 && + max_intel_level <= 0x0000ffff) { + asm("cpuid" + : "=a" (tfms), + "=c" (cpu.flags[4]), + "=d" (cpu.flags[0]) + : "a" (0x00000001) + : "ebx"); + cpu.level = (tfms >> 8) & 15; + cpu.model = (tfms >> 4) & 15; + if (cpu.level >= 6) + cpu.model += ((tfms >> 16) & 0xf) << 4; + } + + asm("cpuid" + : "=a" (max_amd_level) + : "a" (0x80000000) + : "ebx", "ecx", "edx"); + + if (max_amd_level >= 0x80000001 && + max_amd_level <= 0x8000ffff) { + u32 eax = 0x80000001; + asm("cpuid" + : "+a" (eax), + "=c" (cpu.flags[6]), + "=d" (cpu.flags[1]) + : : "ebx"); + } + } +} + +/* Returns a bitmask of which words we have error bits in */ +static int check_flags(void) +{ + u32 err; + int i; + + err = 0; + for (i = 0; i < NCAPINTS; i++) { + err_flags[i] = req_flags[i] & ~cpu.flags[i]; + if (err_flags[i]) + err |= 1 << i; + } + + return err; +} + +/* + * Returns -1 on error. + * + * *cpu_level is set to the current CPU level; *req_level to the required + * level. x86-64 is considered level 64 for this purpose. + * + * *err_flags_ptr is set to the flags error array if there are flags missing. + */ +int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr) +{ + int err; + + memset(&cpu.flags, 0, sizeof cpu.flags); + cpu.level = 3; + + if (has_eflag(X86_EFLAGS_AC)) + cpu.level = 4; + + get_flags(); + err = check_flags(); + + if (test_bit(X86_FEATURE_LM, cpu.flags)) + cpu.level = 64; + + if (err == 0x01 && + !(err_flags[0] & + ~((1 << X86_FEATURE_XMM)|(1 << X86_FEATURE_XMM2))) && + is_amd()) { + /* If this is an AMD and we're only missing SSE+SSE2, try to + turn them on */ + + u32 ecx = MSR_K7_HWCR; + u32 eax, edx; + + asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx)); + eax &= ~(1 << 15); + asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx)); + + get_flags(); /* Make sure it really did something */ + err = check_flags(); + } else if (err == 0x01 && + !(err_flags[0] & ~(1 << X86_FEATURE_CX8)) && + is_centaur() && cpu.model >= 6) { + /* If this is a VIA C3, we might have to enable CX8 + explicitly */ + + u32 ecx = MSR_VIA_FCR; + u32 eax, edx; + + asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx)); + eax |= (1<<1)|(1<<7); + asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx)); + + set_bit(X86_FEATURE_CX8, cpu.flags); + err = check_flags(); + } else if (err == 0x01 && is_transmeta()) { + /* Transmeta might have masked feature bits in word 0 */ + + u32 ecx = 0x80860004; + u32 eax, edx; + u32 level = 1; + + asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx)); + asm("wrmsr" : : "a" (~0), "d" (edx), "c" (ecx)); + asm("cpuid" + : "+a" (level), "=d" (cpu.flags[0]) + : : "ecx", "ebx"); + asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx)); + + err = check_flags(); + } + + if (err_flags_ptr) + *err_flags_ptr = err ? err_flags : NULL; + if (cpu_level_ptr) + *cpu_level_ptr = cpu.level; + if (req_level_ptr) + *req_level_ptr = req_level; + + return (cpu.level < req_level || err) ? -1 : 0; +} diff --git a/arch/i386/boot/edd.S b/arch/i386/boot/edd.S deleted file mode 100644 index 34321368011..00000000000 --- a/arch/i386/boot/edd.S +++ /dev/null @@ -1,231 +0,0 @@ -/* - * BIOS Enhanced Disk Drive support - * Copyright (C) 2002, 2003, 2004 Dell, Inc. - * by Matt Domsch <Matt_Domsch@dell.com> October 2002 - * conformant to T13 Committee www.t13.org - * projects 1572D, 1484D, 1386D, 1226DT - * disk signature read by Matt Domsch <Matt_Domsch@dell.com> - * and Andrew Wilks <Andrew_Wilks@dell.com> September 2003, June 2004 - * legacy CHS retrieval by Patrick J. LoPresti <patl@users.sourceforge.net> - * March 2004 - * Command line option parsing, Matt Domsch, November 2004 - */ - -#include <linux/edd.h> -#include <asm/setup.h> - -#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) - -# It is assumed that %ds == INITSEG here - - movb $0, (EDD_MBR_SIG_NR_BUF) - movb $0, (EDDNR) - -# Check the command line for options: -# edd=of disables EDD completely (edd=off) -# edd=sk skips the MBR test (edd=skipmbr) -# edd=on re-enables EDD (edd=on) - - pushl %esi - movw $edd_mbr_sig_start, %di # Default to edd=on - - movl %cs:(cmd_line_ptr), %esi - andl %esi, %esi - jz old_cl # Old boot protocol? - -# Convert to a real-mode pointer in fs:si - movl %esi, %eax - shrl $4, %eax - movw %ax, %fs - andw $0xf, %si - jmp have_cl_pointer - -# Old-style boot protocol? -old_cl: - push %ds # aka INITSEG - pop %fs - - cmpw $0xa33f, (0x20) - jne done_cl # No command line at all? - movw (0x22), %si # Pointer relative to INITSEG - -# fs:si has the pointer to the command line now -have_cl_pointer: - -# Loop through kernel command line one byte at a time. Just in -# case the loader is buggy and failed to null-terminate the command line -# terminate if we get close enough to the end of the segment that we -# cannot fit "edd=XX"... -cl_atspace: - cmpw $-5, %si # Watch for segment wraparound - jae done_cl - movl %fs:(%si), %eax - andb %al, %al # End of line? - jz done_cl - cmpl $EDD_CL_EQUALS, %eax - jz found_edd_equals - cmpb $0x20, %al # <= space consider whitespace - ja cl_skipword - incw %si - jmp cl_atspace - -cl_skipword: - cmpw $-5, %si # Watch for segment wraparound - jae done_cl - movb %fs:(%si), %al # End of string? - andb %al, %al - jz done_cl - cmpb $0x20, %al - jbe cl_atspace - incw %si - jmp cl_skipword - -found_edd_equals: -# only looking at first two characters after equals -# late overrides early on the command line, so keep going after finding something - movw %fs:4(%si), %ax - cmpw $EDD_CL_OFF, %ax # edd=of - je do_edd_off - cmpw $EDD_CL_SKIP, %ax # edd=sk - je do_edd_skipmbr - cmpw $EDD_CL_ON, %ax # edd=on - je do_edd_on - jmp cl_skipword -do_edd_skipmbr: - movw $edd_start, %di - jmp cl_skipword -do_edd_off: - movw $edd_done, %di - jmp cl_skipword -do_edd_on: - movw $edd_mbr_sig_start, %di - jmp cl_skipword - -done_cl: - popl %esi - jmpw *%di - -# Read the first sector of each BIOS disk device and store the 4-byte signature -edd_mbr_sig_start: - movb $0x80, %dl # from device 80 - movw $EDD_MBR_SIG_BUF, %bx # store buffer ptr in bx -edd_mbr_sig_read: - movl $0xFFFFFFFF, %eax - movl %eax, (%bx) # assume failure - pushw %bx - movb $READ_SECTORS, %ah - movb $1, %al # read 1 sector - movb $0, %dh # at head 0 - movw $1, %cx # cylinder 0, sector 0 - pushw %es - pushw %ds - popw %es - movw $EDDBUF, %bx # disk's data goes into EDDBUF - pushw %dx # work around buggy BIOSes - stc # work around buggy BIOSes - int $0x13 - sti # work around buggy BIOSes - popw %dx - popw %es - popw %bx - jc edd_mbr_sig_done # on failure, we're done. - cmpb $0, %ah # some BIOSes do not set CF - jne edd_mbr_sig_done # on failure, we're done. - movl (EDDBUF+EDD_MBR_SIG_OFFSET), %eax # read sig out of the MBR - movl %eax, (%bx) # store success - incb (EDD_MBR_SIG_NR_BUF) # note that we stored something - incb %dl # increment to next device - addw $4, %bx # increment sig buffer ptr - cmpb $EDD_MBR_SIG_MAX, (EDD_MBR_SIG_NR_BUF) # Out of space? - jb edd_mbr_sig_read # keep looping -edd_mbr_sig_done: - -# Do the BIOS Enhanced Disk Drive calls -# This consists of two calls: -# int 13h ah=41h "Check Extensions Present" -# int 13h ah=48h "Get Device Parameters" -# int 13h ah=08h "Legacy Get Device Parameters" -# -# A buffer of size EDDMAXNR*(EDDEXTSIZE+EDDPARMSIZE) is reserved for our use -# in the boot_params at EDDBUF. The first four bytes of which are -# used to store the device number, interface support map and version -# results from fn41. The next four bytes are used to store the legacy -# cylinders, heads, and sectors from fn08. The following 74 bytes are used to -# store the results from fn48. Starting from device 80h, fn41, then fn48 -# are called and their results stored in EDDBUF+n*(EDDEXTSIZE+EDDPARMIZE). -# Then the pointer is incremented to store the data for the next call. -# This repeats until either a device doesn't exist, or until EDDMAXNR -# devices have been stored. -# The one tricky part is that ds:si always points EDDEXTSIZE bytes into -# the structure, and the fn41 and fn08 results are stored at offsets -# from there. This removes the need to increment the pointer for -# every store, and leaves it ready for the fn48 call. -# A second one-byte buffer, EDDNR, in the boot_params stores -# the number of BIOS devices which exist, up to EDDMAXNR. -# In setup.c, copy_edd() stores both boot_params buffers away -# for later use, as they would get overwritten otherwise. -# This code is sensitive to the size of the structs in edd.h -edd_start: - # %ds points to the bootsector - # result buffer for fn48 - movw $EDDBUF+EDDEXTSIZE, %si # in ds:si, fn41 results - # kept just before that - movb $0x80, %dl # BIOS device 0x80 - -edd_check_ext: - movb $CHECKEXTENSIONSPRESENT, %ah # Function 41 - movw $EDDMAGIC1, %bx # magic - int $0x13 # make the call - jc edd_done # no more BIOS devices - - cmpw $EDDMAGIC2, %bx # is magic right? - jne edd_next # nope, next... - - movb %dl, %ds:-8(%si) # store device number - movb %ah, %ds:-7(%si) # store version - movw %cx, %ds:-6(%si) # store extensions - incb (EDDNR) # note that we stored something - -edd_get_device_params: - movw $EDDPARMSIZE, %ds:(%si) # put size - movw $0x0, %ds:2(%si) # work around buggy BIOSes - movb $GETDEVICEPARAMETERS, %ah # Function 48 - int $0x13 # make the call - # Don't check for fail return - # it doesn't matter. -edd_get_legacy_chs: - xorw %ax, %ax - movw %ax, %ds:-4(%si) - movw %ax, %ds:-2(%si) - # Ralf Brown's Interrupt List says to set ES:DI to - # 0000h:0000h "to guard against BIOS bugs" - pushw %es - movw %ax, %es - movw %ax, %di - pushw %dx # legacy call clobbers %dl - movb $LEGACYGETDEVICEPARAMETERS, %ah # Function 08 - int $0x13 # make the call - jc edd_legacy_done # failed - movb %cl, %al # Low 6 bits are max - andb $0x3F, %al # sector number - movb %al, %ds:-1(%si) # Record max sect - movb %dh, %ds:-2(%si) # Record max head number - movb %ch, %al # Low 8 bits of max cyl - shr $6, %cl - movb %cl, %ah # High 2 bits of max cyl - movw %ax, %ds:-4(%si) - -edd_legacy_done: - popw %dx - popw %es - movw %si, %ax # increment si - addw $EDDPARMSIZE+EDDEXTSIZE, %ax - movw %ax, %si - -edd_next: - incb %dl # increment to next device - cmpb $EDDMAXNR, (EDDNR) # Out of space? - jb edd_check_ext # keep looping - -edd_done: -#endif diff --git a/arch/i386/boot/edd.c b/arch/i386/boot/edd.c new file mode 100644 index 00000000000..25a282494f4 --- /dev/null +++ b/arch/i386/boot/edd.c @@ -0,0 +1,196 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright 2007 rPath, Inc. - All Rights Reserved + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + * + * ----------------------------------------------------------------------- */ + +/* + * arch/i386/boot/edd.c + * + * Get EDD BIOS disk information + */ + +#include "boot.h" +#include <linux/edd.h> + +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) + +struct edd_dapa { + u8 pkt_size; + u8 rsvd; + u16 sector_cnt; + u16 buf_off, buf_seg; + u64 lba; + u64 buf_lin_addr; +}; + +/* + * Read the MBR (first sector) from a specific device. + */ +static int read_mbr(u8 devno, void *buf) +{ + struct edd_dapa dapa; + u16 ax, bx, cx, dx, si; + + memset(&dapa, 0, sizeof dapa); + dapa.pkt_size = sizeof(dapa); + dapa.sector_cnt = 1; + dapa.buf_off = (size_t)buf; + dapa.buf_seg = ds(); + /* dapa.lba = 0; */ + + ax = 0x4200; /* Extended Read */ + si = (size_t)&dapa; + dx = devno; + asm("pushfl; stc; int $0x13; setc %%al; popfl" + : "+a" (ax), "+S" (si), "+d" (dx) + : "m" (dapa) + : "ebx", "ecx", "edi", "memory"); + + if (!(u8)ax) + return 0; /* OK */ + + ax = 0x0201; /* Legacy Read, one sector */ + cx = 0x0001; /* Sector 0-0-1 */ + dx = devno; + bx = (size_t)buf; + asm("pushfl; stc; int $0x13; setc %%al; popfl" + : "+a" (ax), "+c" (cx), "+d" (dx), "+b" (bx) + : : "esi", "edi", "memory"); + + return -(u8)ax; /* 0 or -1 */ +} + +static u32 read_mbr_sig(u8 devno, struct edd_info *ei) +{ + int sector_size; + char *mbrbuf_ptr, *mbrbuf_end; + u32 mbrsig; + u32 buf_base, mbr_base; + extern char _end[]; + static char mbr_buf[1024]; + + sector_size = ei->params.bytes_per_sector; + if (!sector_size) + sector_size = 512; /* Best available guess */ + + buf_base = (ds() << 4) + (u32)&_end; + mbr_base = (buf_base+sector_size-1) & ~(sector_size-1); + mbrbuf_ptr = mbr_buf + (mbr_base-buf_base); + mbrbuf_end = mbrbuf_ptr + sector_size; + + if (!(boot_params.hdr.loadflags & CAN_USE_HEAP)) + return 0; + if (mbrbuf_end > (char *)(size_t)boot_params.hdr.heap_end_ptr) + return 0; + + if (read_mbr(devno, mbrbuf_ptr)) + return 0; + + mbrsig = *(u32 *)&mbrbuf_ptr[EDD_MBR_SIG_OFFSET]; + return mbrsig; +} + +static int get_edd_info(u8 devno, struct edd_info *ei) +{ + u16 ax, bx, cx, dx, di; + + memset(ei, 0, sizeof *ei); + + /* Check Extensions Present */ + + ax = 0x4100; + bx = EDDMAGIC1; + dx = devno; + asm("pushfl; stc; int $0x13; setc %%al; popfl" + : "+a" (ax), "+b" (bx), "=c" (cx), "+d" (dx) + : : "esi", "edi"); + + if ((u8)ax) + return -1; /* No extended information */ + + if (bx != EDDMAGIC2) + return -1; + + ei->device = devno; + ei->version = ax >> 8; /* EDD version number */ + ei->interface_support = cx; /* EDD functionality subsets */ + + /* Extended Get Device Parameters */ + + ei->params.length = sizeof(ei->params); + ax = 0x4800; + dx = devno; + asm("pushfl; int $0x13; popfl" + : "+a" (ax), "+d" (dx) + : "S" (&ei->params) + : "ebx", "ecx", "edi"); + + /* Get legacy CHS parameters */ + + /* Ralf Brown recommends setting ES:DI to 0:0 */ + ax = 0x0800; + dx = devno; + di = 0; + asm("pushw %%es; " + "movw %%di,%%es; " + "pushfl; stc; int $0x13; setc %%al; popfl; " + "popw %%es" + : "+a" (ax), "=b" (bx), "=c" (cx), "+d" (dx), "+D" (di) + : : "esi"); + + if ((u8)ax == 0) { + ei->legacy_max_cylinder = (cx >> 8) + ((cx & 0xc0) << 2); + ei->legacy_max_head = dx >> 8; + ei->legacy_sectors_per_track = cx & 0x3f; + } + + return 0; +} + +void query_edd(void) +{ + char eddarg[8]; + int do_mbr = 1; + int do_edd = 1; + int devno; + struct edd_info ei, *edp; + + if (cmdline_find_option("edd", eddarg, sizeof eddarg) > 0) { + if (!strcmp(eddarg, "skipmbr") || !strcmp(eddarg, "skip")) + do_mbr = 0; + else if (!strcmp(eddarg, "off")) + do_edd = 0; + } + + edp = (struct edd_info *)boot_params.eddbuf; + + if (!do_edd) + return; + + for (devno = 0x80; devno < 0x80+EDD_MBR_SIG_MAX; devno++) { + /* + * Scan the BIOS-supported hard disks and query EDD + * information... + */ + get_edd_info(devno, &ei); + + if (boot_params.eddbuf_entries < EDDMAXNR) { + memcpy(edp, &ei, sizeof ei); + edp++; + boot_params.eddbuf_entries++; + } + + if (do_mbr) { + u32 mbr_sig; + mbr_sig = read_mbr_sig(devno, &ei); + boot_params.edd_mbr_sig_buffer[devno-0x80] = mbr_sig; + } + } +} + +#endif diff --git a/arch/i386/boot/header.S b/arch/i386/boot/header.S new file mode 100644 index 00000000000..6b9923fb6ea --- /dev/null +++ b/arch/i386/boot/header.S @@ -0,0 +1,283 @@ +/* + * header.S + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Based on bootsect.S and setup.S + * modified by more people than can be counted + * + * Rewritten as a common file by H. Peter Anvin (Apr 2007) + * + * BIG FAT NOTE: We're in real mode using 64k segments. Therefore segment + * addresses must be multiplied by 16 to obtain their respective linear + * addresses. To avoid confusion, linear addresses are written using leading + * hex while segment addresses are written as segment:offset. + * + */ + +#include <asm/segment.h> +#include <linux/utsrelease.h> +#include <asm/boot.h> +#include <asm/e820.h> +#include <asm/page.h> +#include <asm/setup.h> +#include "boot.h" + +SETUPSECTS = 4 /* default nr of setup-sectors */ +BOOTSEG = 0x07C0 /* original address of boot-sector */ +SYSSEG = DEF_SYSSEG /* system loaded at 0x10000 (65536) */ +SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */ + /* to be loaded */ +ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */ +SWAP_DEV = 0 /* SWAP_DEV is now written by "build" */ + +#ifndef SVGA_MODE +#define SVGA_MODE ASK_VGA +#endif + +#ifndef RAMDISK +#define RAMDISK 0 +#endif + +#ifndef ROOT_RDONLY +#define ROOT_RDONLY 1 +#endif + + .code16 + .section ".bstext", "ax" + + .global bootsect_start +bootsect_start: + + # Normalize the start address + ljmp $BOOTSEG, $start2 + +start2: + movw %cs, %ax + movw %ax, %ds + movw %ax, %es + movw %ax, %ss + xorw %sp, %sp + sti + cld + + movw $bugger_off_msg, %si + +msg_loop: + lodsb + andb %al, %al + jz bs_die + movb $0xe, %ah + movw $7, %bx + int $0x10 + jmp msg_loop + +bs_die: + # Allow the user to press a key, then reboot + xorw %ax, %ax + int $0x16 + int $0x19 + + # int 0x19 should never return. In case it does anyway, + # invoke the BIOS reset code... + ljmp $0xf000,$0xfff0 + + .section ".bsdata", "a" +bugger_off_msg: + .ascii "Direct booting from floppy is no longer supported.\r\n" + .ascii "Please use a boot loader program instead.\r\n" + .ascii "\n" + .ascii "Remove disk and press any key to reboot . . .\r\n" + .byte 0 + + + # Kernel attributes; used by setup. This is part 1 of the + # header, from the old boot sector. + + .section ".header", "a" + .globl hdr +hdr: +setup_sects: .byte SETUPSECTS +root_flags: .word ROOT_RDONLY +syssize: .long SYSSIZE +ram_size: .word RAMDISK +vid_mode: .word SVGA_MODE +root_dev: .word ROOT_DEV +boot_flag: .word 0xAA55 + + # offset 512, entry point + + .globl _start +_start: + # Explicitly enter this as bytes, or the assembler + # tries to generate a 3-byte jump here, which causes + # everything else to push off to the wrong offset. + .byte 0xeb # short (2-byte) jump + .byte start_of_setup-1f +1: + + # Part 2 of the header, from the old setup.S + + .ascii "HdrS" # header signature + .word 0x0206 # header version number (>= 0x0105) + # or else old loadlin-1.5 will fail) + .globl realmode_swtch +realmode_swtch: .word 0, 0 # default_switch, SETUPSEG +start_sys_seg: .word SYSSEG + .word kernel_version-512 # pointing to kernel version string + # above section of header is compatible + # with loadlin-1.5 (header v1.5). Don't + # change it. + +type_of_loader: .byte 0 # = 0, old one (LILO, Loadlin, + # Bootlin, SYSLX, bootsect...) + # See Documentation/i386/boot.txt for + # assigned ids + +# flags, unused bits must be zero (RFU) bit within loadflags +loadflags: +LOADED_HIGH = 1 # If set, the kernel is loaded high +CAN_USE_HEAP = 0x80 # If set, the loader also has set + # heap_end_ptr to tell how much + # space behind setup.S can be used for + # heap purposes. + # Only the loader knows what is free +#ifndef __BIG_KERNEL__ + .byte 0 +#else + .byte LOADED_HIGH +#endif + +setup_move_size: .word 0x8000 # size to move, when setup is not + # loaded at 0x90000. We will move setup + # to 0x90000 then just before jumping + # into the kernel. However, only the + # loader knows how much data behind + # us also needs to be loaded. + +code32_start: # here loaders can put a different + # start address for 32-bit code. +#ifndef __BIG_KERNEL__ + .long 0x1000 # 0x1000 = default for zImage +#else + .long 0x100000 # 0x100000 = default for big kernel +#endif + +ramdisk_image: .long 0 # address of loaded ramdisk image + # Here the loader puts the 32-bit + # address where it loaded the image. + # This only will be read by the kernel. + +ramdisk_size: .long 0 # its size in bytes + +bootsect_kludge: + .long 0 # obsolete + +heap_end_ptr: .word _end+1024 # (Header version 0x0201 or later) + # space from here (exclusive) down to + # end of setup code can be used by setup + # for local heap purposes. + +pad1: .word 0 +cmd_line_ptr: .long 0 # (Header version 0x0202 or later) + # If nonzero, a 32-bit pointer + # to the kernel command line. + # The command line should be + # located between the start of + # setup and the end of low + # memory (0xa0000), or it may + # get overwritten before it + # gets read. If this field is + # used, there is no longer + # anything magical about the + # 0x90000 segment; the setup + # can be located anywhere in + # low memory 0x10000 or higher. + +ramdisk_max: .long (-__PAGE_OFFSET-(512 << 20)-1) & 0x7fffffff + # (Header version 0x0203 or later) + # The highest safe address for + # the contents of an initrd + +kernel_alignment: .long CONFIG_PHYSICAL_ALIGN #physical addr alignment + #required for protected mode + #kernel +#ifdef CONFIG_RELOCATABLE +relocatable_kernel: .byte 1 +#else +relocatable_kernel: .byte 0 +#endif +pad2: .byte 0 +pad3: .word 0 + +cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line, + #added with boot protocol + #version 2.06 + +# End of setup header ##################################################### + + .section ".inittext", "ax" +start_of_setup: +#ifdef SAFE_RESET_DISK_CONTROLLER +# Reset the disk controller. + movw $0x0000, %ax # Reset disk controller + movb $0x80, %dl # All disks + int $0x13 +#endif + +# We will have entired with %cs = %ds+0x20, normalize %cs so +# it is on par with the other segments. + pushw %ds + pushw $setup2 + lretw + +setup2: +# Force %es = %ds + movw %ds, %ax + movw %ax, %es + cld + +# Stack paranoia: align the stack and make sure it is good +# for both 16- and 32-bit references. In particular, if we +# were meant to have been using the full 16-bit segment, the +# caller might have set %sp to zero, which breaks %esp-based +# references. + andw $~3, %sp # dword align (might as well...) + jnz 1f + movw $0xfffc, %sp # Make sure we're not zero +1: movzwl %sp, %esp # Clear upper half of %esp + sti + +# Check signature at end of setup + cmpl $0x5a5aaa55, setup_sig + jne setup_bad + +# Zero the bss + movw $__bss_start, %di + movw $_end+3, %cx + xorl %eax, %eax + subw %di, %cx + shrw $2, %cx + rep; stosl + +# Jump to C code (should not return) + calll main + +# Setup corrupt somehow... +setup_bad: + movl $setup_corrupt, %eax + calll puts + # Fall through... + + .globl die + .type die, @function +die: + hlt + jmp die + + .size die, .-due + + .section ".initdata", "a" +setup_corrupt: + .byte 7 + .string "No setup signature found..." diff --git a/arch/i386/boot/main.c b/arch/i386/boot/main.c new file mode 100644 index 00000000000..7f01f96c4fb --- /dev/null +++ b/arch/i386/boot/main.c @@ -0,0 +1,161 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright 2007 rPath, Inc. - All Rights Reserved + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + * + * ----------------------------------------------------------------------- */ + +/* + * arch/i386/boot/main.c + * + * Main module for the real-mode kernel code + */ + +#include "boot.h" + +struct boot_params boot_params __attribute__((aligned(16))); + +char *HEAP = _end; +char *heap_end = _end; /* Default end of heap = no heap */ + +/* + * Copy the header into the boot parameter block. Since this + * screws up the old-style command line protocol, adjust by + * filling in the new-style command line pointer instead. + */ +#define OLD_CL_MAGIC 0xA33F +#define OLD_CL_ADDRESS 0x20 + +static void copy_boot_params(void) +{ + struct old_cmdline { + u16 cl_magic; + u16 cl_offset; + }; + const struct old_cmdline * const oldcmd = + (const struct old_cmdline *)OLD_CL_ADDRESS; + + BUILD_BUG_ON(sizeof boot_params != 4096); + memcpy(&boot_params.hdr, &hdr, sizeof hdr); + + if (!boot_params.hdr.cmd_line_ptr && + oldcmd->cl_magic == OLD_CL_MAGIC) { + /* Old-style command line protocol. */ + u16 cmdline_seg; + + /* Figure out if the command line falls in the region + of memory that an old kernel would have copied up + to 0x90000... */ + if (oldcmd->cl_offset < boot_params.hdr.setup_move_size) + cmdline_seg = ds(); + else + cmdline_seg = 0x9000; + + boot_params.hdr.cmd_line_ptr = + (cmdline_seg << 4) + oldcmd->cl_offset; + } +} + +/* + * Set the keyboard repeat rate to maximum. Unclear why this + * is done here; this might be possible to kill off as stale code. + */ +static void keyboard_set_repeat(void) +{ + u16 ax = 0x0305; + u16 bx = 0; + asm volatile("int $0x16" + : "+a" (ax), "+b" (bx) + : : "ecx", "edx", "esi", "edi"); +} + +/* + * Get Intel SpeedStep IST information. + */ +static void query_speedstep_ist(void) +{ + asm("int $0x15" + : "=a" (boot_params.speedstep_info[0]), + "=b" (boot_params.speedstep_info[1]), + "=c" (boot_params.speedstep_info[2]), + "=d" (boot_params.speedstep_info[3]) + : "a" (0x0000e980), /* IST Support */ + "d" (0x47534943)); /* Request value */ +} + +/* + * Tell the BIOS what CPU mode we intend to run in. + */ +static void set_bios_mode(void) +{ +#ifdef CONFIG_X86_64 + u32 eax, ebx; + + eax = 0xec00; + ebx = 2; + asm volatile("int $0x15" + : "+a" (eax), "+b" (ebx) + : : "ecx", "edx", "esi", "edi"); +#endif +} + +void main(void) +{ + /* First, copy the boot header into the "zeropage" */ + copy_boot_params(); + + /* End of heap check */ + if (boot_params.hdr.loadflags & CAN_USE_HEAP) { + heap_end = (char *)(boot_params.hdr.heap_end_ptr + +0x200-STACK_SIZE); + } else { + /* Boot protocol 2.00 only, no heap available */ + puts("WARNING: Ancient bootloader, some functionality " + "may be limited!\n"); + } + + /* Make sure we have all the proper CPU support */ + if (validate_cpu()) { + puts("Unable to boot - please use a kernel appropriate " + "for your CPU.\n"); + die(); + } + + /* Tell the BIOS what CPU mode we intend to run in. */ + set_bios_mode(); + + /* Detect memory layout */ + detect_memory(); + + /* Set keyboard repeat rate (why?) */ + keyboard_set_repeat(); + + /* Set the video mode */ + set_video(); + + /* Query MCA information */ + query_mca(); + + /* Voyager */ +#ifdef CONFIG_X86_VOYAGER + query_voyager(); +#endif + + /* Query SpeedStep IST information */ + query_speedstep_ist(); + + /* Query APM information */ +#if defined(CONFIG_APM) || defined(CONFIG_APM_MODULE) + query_apm_bios(); +#endif + + /* Query EDD information */ +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) + query_edd(); +#endif + /* Do the last things and invoke protected mode */ + go_to_protected_mode(); +} diff --git a/arch/i386/boot/mca.c b/arch/i386/boot/mca.c new file mode 100644 index 00000000000..68222f2d4b6 --- /dev/null +++ b/arch/i386/boot/mca.c @@ -0,0 +1,43 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright 2007 rPath, Inc. - All Rights Reserved + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + * + * ----------------------------------------------------------------------- */ + +/* + * arch/i386/boot/mca.c + * + * Get the MCA system description table + */ + +#include "boot.h" + +int query_mca(void) +{ + u8 err; + u16 es, bx, len; + + asm("pushw %%es ; " + "int $0x15 ; " + "setc %0 ; " + "movw %%es, %1 ; " + "popw %%es" + : "=acd" (err), "=acdSD" (es), "=b" (bx) + : "a" (0xc000)); + + if (err) + return -1; /* No MCA present */ + + set_fs(es); + len = rdfs16(bx); + + if (len > sizeof(boot_params.sys_desc_table)) + len = sizeof(boot_params.sys_desc_table); + + copy_from_fs(&boot_params.sys_desc_table, bx, len); + return 0; +} diff --git a/arch/i386/boot/memory.c b/arch/i386/boot/memory.c new file mode 100644 index 00000000000..1a2e62db8be --- /dev/null +++ b/arch/i386/boot/memory.c @@ -0,0 +1,99 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright 2007 rPath, Inc. - All Rights Reserved + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + * + * ----------------------------------------------------------------------- */ + +/* + * arch/i386/boot/memory.c + * + * Memory detection code + */ + +#include "boot.h" + +#define SMAP 0x534d4150 /* ASCII "SMAP" */ + +static int detect_memory_e820(void) +{ + u32 next = 0; + u32 size, id; + u8 err; + struct e820entry *desc = boot_params.e820_map; + + do { + size = sizeof(struct e820entry); + id = SMAP; + asm("int $0x15; setc %0" + : "=am" (err), "+b" (next), "+d" (id), "+c" (size), + "=m" (*desc) + : "D" (desc), "a" (0xe820)); + + if (err || id != SMAP) + break; + + boot_params.e820_entries++; + desc++; + } while (next && boot_params.e820_entries < E820MAX); + + return boot_params.e820_entries; +} + +static int detect_memory_e801(void) +{ + u16 ax, bx, cx, dx; + u8 err; + + bx = cx = dx = 0; + ax = 0xe801; + asm("stc; int $0x15; setc %0" + : "=m" (err), "+a" (ax), "+b" (bx), "+c" (cx), "+d" (dx)); + + if (err) + return -1; + + /* Do we really need to do this? */ + if (cx || dx) { + ax = cx; + bx = dx; + } + + if (ax > 15*1024) + return -1; /* Bogus! */ + + /* This ignores memory above 16MB if we have a memory hole + there. If someone actually finds a machine with a memory + hole at 16MB and no support for 0E820h they should probably + generate a fake e820 map. */ + boot_params.alt_mem_k = (ax == 15*1024) ? (dx << 6)+ax : ax; + + return 0; +} + +static int detect_memory_88(void) +{ + u16 ax; + u8 err; + + ax = 0x8800; + asm("stc; int $0x15; setc %0" : "=bcdm" (err), "+a" (ax)); + + boot_params.screen_info.ext_mem_k = ax; + + return -err; +} + +int detect_memory(void) +{ + if (detect_memory_e820() > 0) + return 0; + + if (!detect_memory_e801()) + return 0; + + return detect_memory_88(); +} diff --git a/arch/i386/boot/pm.c b/arch/i386/boot/pm.c new file mode 100644 index 00000000000..1df025c7326 --- /dev/null +++ b/arch/i386/boot/pm.c @@ -0,0 +1,170 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright 2007 rPath, Inc. - All Rights Reserved + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + * + * ----------------------------------------------------------------------- */ + +/* + * arch/i386/boot/pm.c + * + * Prepare the machine for transition to protected mode. + */ + +#include "boot.h" +#include <asm/segment.h> + +/* + * Invoke the realmode switch hook if present; otherwise + * disable all interrupts. + */ +static void realmode_switch_hook(void) +{ + if (boot_params.hdr.realmode_swtch) { + asm volatile("lcallw *%0" + : : "m" (boot_params.hdr.realmode_swtch) + : "eax", "ebx", "ecx", "edx"); + } else { + asm volatile("cli"); + outb(0x80, 0x70); /* Disable NMI */ + io_delay(); + } +} + +/* + * A zImage kernel is loaded at 0x10000 but wants to run at 0x1000. + * A bzImage kernel is loaded and runs at 0x100000. + */ +static void move_kernel_around(void) +{ + /* Note: rely on the compile-time option here rather than + the LOADED_HIGH flag. The Qemu kernel loader unconditionally + sets the loadflags to zero. */ +#ifndef __BIG_KERNEL__ + u16 dst_seg, src_seg; + u32 syssize; + + dst_seg = 0x1000 >> 4; + src_seg = 0x10000 >> 4; + syssize = boot_params.hdr.syssize; /* Size in 16-byte paragraphs */ + + while (syssize) { + int paras = (syssize >= 0x1000) ? 0x1000 : syssize; + int dwords = paras << 2; + + asm volatile("pushw %%es ; " + "pushw %%ds ; " + "movw %1,%%es ; " + "movw %2,%%ds ; " + "xorw %%di,%%di ; " + "xorw %%si,%%si ; " + "rep;movsl ; " + "popw %%ds ; " + "popw %%es" + : "+c" (dwords) + : "r" (dst_seg), "r" (src_seg) + : "esi", "edi"); + + syssize -= paras; + dst_seg += paras; + src_seg += paras; + } +#endif +} + +/* + * Disable all interrupts at the legacy PIC. + */ +static void mask_all_interrupts(void) +{ + outb(0xff, 0xa1); /* Mask all interrupts on the seconday PIC */ + io_delay(); + outb(0xfb, 0x21); /* Mask all but cascade on the primary PIC */ + io_delay(); +} + +/* + * Reset IGNNE# if asserted in the FPU. + */ +static void reset_coprocessor(void) +{ + outb(0, 0xf0); + io_delay(); + outb(0, 0xf1); + io_delay(); +} + +/* + * Set up the GDT + */ +#define GDT_ENTRY(flags,base,limit) \ + (((u64)(base & 0xff000000) << 32) | \ + ((u64)flags << 40) | \ + ((u64)(limit & 0x00ff0000) << 32) | \ + ((u64)(base & 0x00ffff00) << 16) | \ + ((u64)(limit & 0x0000ffff))) + +struct gdt_ptr { + u16 len; + u32 ptr; +} __attribute__((packed)); + +static void setup_gdt(void) +{ + /* There are machines which are known to not boot with the GDT + being 8-byte unaligned. Intel recommends 16 byte alignment. */ + static const u64 boot_gdt[] __attribute__((aligned(16))) = { + /* CS: code, read/execute, 4 GB, base 0 */ + [GDT_ENTRY_BOOT_CS] = GDT_ENTRY(0xc09b, 0, 0xfffff), + /* DS: data, read/write, 4 GB, base 0 */ + [GDT_ENTRY_BOOT_DS] = GDT_ENTRY(0xc093, 0, 0xfffff), + }; + struct gdt_ptr gdt; + + gdt.len = sizeof(boot_gdt)-1; + gdt.ptr = (u32)&boot_gdt + (ds() << 4); + + asm volatile("lgdtl %0" : : "m" (gdt)); +} + +/* + * Set up the IDT + */ +static void setup_idt(void) +{ + static const struct gdt_ptr null_idt = {0, 0}; + asm volatile("lidtl %0" : : "m" (null_idt)); +} + +/* + * Actual invocation sequence + */ +void go_to_protected_mode(void) +{ + /* Hook before leaving real mode, also disables interrupts */ + realmode_switch_hook(); + + /* Move the kernel/setup to their final resting places */ + move_kernel_around(); + + /* Enable the A20 gate */ + if (enable_a20()) { + puts("A20 gate not responding, unable to boot...\n"); + die(); + } + + /* Reset coprocessor (IGNNE#) */ + reset_coprocessor(); + + /* Mask all interrupts in the PIC */ + mask_all_interrupts(); + + /* Actual transition to protected mode... */ + setup_idt(); + setup_gdt(); + protected_mode_jump(boot_params.hdr.code32_start, + (u32)&boot_params + (ds() << 4)); +} diff --git a/arch/i386/boot/pmjump.S b/arch/i386/boot/pmjump.S new file mode 100644 index 00000000000..2e559233725 --- /dev/null +++ b/arch/i386/boot/pmjump.S @@ -0,0 +1,54 @@ +/* ----------------------------------------------------------------------- * + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright 2007 rPath, Inc. - All Rights Reserved + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + * + * ----------------------------------------------------------------------- */ + +/* + * arch/i386/boot/pmjump.S + * + * The actual transition into protected mode + */ + +#include <asm/boot.h> +#include <asm/segment.h> + + .text + + .globl protected_mode_jump + .type protected_mode_jump, @function + + .code16 + +/* + * void protected_mode_jump(u32 entrypoint, u32 bootparams); + */ +protected_mode_jump: + xorl %ebx, %ebx # Flag to indicate this is a boot + movl %edx, %esi # Pointer to boot_params table + movl %eax, 2f # Patch ljmpl instruction + jmp 1f # Short jump to flush instruction q. + +1: + movw $__BOOT_DS, %cx + + movl %cr0, %edx + orb $1, %dl # Protected mode (PE) bit + movl %edx, %cr0 + + movw %cx, %ds + movw %cx, %es + movw %cx, %fs + movw %cx, %gs + movw %cx, %ss + + # Jump to the 32-bit entrypoint + .byte 0x66, 0xea # ljmpl opcode +2: .long 0 # offset + .word __BOOT_CS # segment + + .size protected_mode_jump, .-protected_mode_jump diff --git a/arch/i386/boot/printf.c b/arch/i386/boot/printf.c new file mode 100644 index 00000000000..1a09f9309d3 --- /dev/null +++ b/arch/i386/boot/printf.c @@ -0,0 +1,307 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright 2007 rPath, Inc. - All Rights Reserved + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + * + * ----------------------------------------------------------------------- */ + +/* + * arch/i386/boot/printf.c + * + * Oh, it's a waste of space, but oh-so-yummy for debugging. This + * version of printf() does not include 64-bit support. "Live with + * it." + * + */ + +#include "boot.h" + +static int skip_atoi(const char **s) +{ + int i = 0; + + while (isdigit(**s)) + i = i * 10 + *((*s)++) - '0'; + return i; +} + +#define ZEROPAD 1 /* pad with zero */ +#define SIGN 2 /* unsigned/signed long */ +#define PLUS 4 /* show plus */ +#define SPACE 8 /* space if plus */ +#define LEFT 16 /* left justified */ +#define SPECIAL 32 /* 0x */ +#define LARGE 64 /* use 'ABCDEF' instead of 'abcdef' */ + +#define do_div(n,base) ({ \ +int __res; \ +__res = ((unsigned long) n) % (unsigned) base; \ +n = ((unsigned long) n) / (unsigned) base; \ +__res; }) + +static char *number(char *str, long num, int base, int size, int precision, + int type) +{ + char c, sign, tmp[66]; + const char *digits = "0123456789abcdefghijklmnopqrstuvwxyz"; + int i; + + if (type & LARGE) + digits = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + if (type & LEFT) + type &= ~ZEROPAD; + if (base < 2 || base > 36) + return 0; + c = (type & ZEROPAD) ? '0' : ' '; + sign = 0; + if (type & SIGN) { + if (num < 0) { + sign = '-'; + num = -num; + size--; + } else if (type & PLUS) { + sign = '+'; + size--; + } else if (type & SPACE) { + sign = ' '; + size--; + } + } + if (type & SPECIAL) { + if (base == 16) + size -= 2; + else if (base == 8) + size--; + } + i = 0; + if (num == 0) + tmp[i++] = '0'; + else + while (num != 0) + tmp[i++] = digits[do_div(num, base)]; + if (i > precision) + precision = i; + size -= precision; + if (!(type & (ZEROPAD + LEFT))) + while (size-- > 0) + *str++ = ' '; + if (sign) + *str++ = sign; + if (type & SPECIAL) { + if (base == 8) + *str++ = '0'; + else if (base == 16) { + *str++ = '0'; + *str++ = digits[33]; + } + } + if (!(type & LEFT)) + while (size-- > 0) + *str++ = c; + while (i < precision--) + *str++ = '0'; + while (i-- > 0) + *str++ = tmp[i]; + while (size-- > 0) + *str++ = ' '; + return str; +} + +int vsprintf(char *buf, const char *fmt, va_list args) +{ + int len; + unsigned long num; + int i, base; + char *str; + const char *s; + + int flags; /* flags to number() */ + + int field_width; /* width of output field */ + int precision; /* min. # of digits for integers; max + number of chars for from string */ + int qualifier; /* 'h', 'l', or 'L' for integer fields */ + + for (str = buf; *fmt; ++fmt) { + if (*fmt != '%') { + *str++ = *fmt; + continue; + } + + /* process flags */ + flags = 0; + repeat: + ++fmt; /* this also skips first '%' */ + switch (*fmt) { + case '-': + flags |= LEFT; + goto repeat; + case '+': + flags |= PLUS; + goto repeat; + case ' ': + flags |= SPACE; + goto repeat; + case '#': + flags |= SPECIAL; + goto repeat; + case '0': + flags |= ZEROPAD; + goto repeat; + } + + /* get field width */ + field_width = -1; + if (isdigit(*fmt)) + field_width = skip_atoi(&fmt); + else if (*fmt == '*') { + ++fmt; + /* it's the next argument */ + field_width = va_arg(args, int); + if (field_width < 0) { + field_width = -field_width; + flags |= LEFT; + } + } + + /* get the precision */ + precision = -1; + if (*fmt == '.') { + ++fmt; + if (isdigit(*fmt)) + precision = skip_atoi(&fmt); + else if (*fmt == '*') { + ++fmt; + /* it's the next argument */ + precision = va_arg(args, int); + } + if (precision < 0) + precision = 0; + } + + /* get the conversion qualifier */ + qualifier = -1; + if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L') { + qualifier = *fmt; + ++fmt; + } + + /* default base */ + base = 10; + + switch (*fmt) { + case 'c': + if (!(flags & LEFT)) + while (--field_width > 0) + *str++ = ' '; + *str++ = (unsigned char)va_arg(args, int); + while (--field_width > 0) + *str++ = ' '; + continue; + + case 's': + s = va_arg(args, char *); + len = strnlen(s, precision); + + if (!(flags & LEFT)) + while (len < field_width--) + *str++ = ' '; + for (i = 0; i < len; ++i) + *str++ = *s++; + while (len < field_width--) + *str++ = ' '; + continue; + + case 'p': + if (field_width == -1) { + field_width = 2 * sizeof(void *); + flags |= ZEROPAD; + } + str = number(str, + (unsigned long)va_arg(args, void *), 16, + field_width, precision, flags); + continue; + + case 'n': + if (qualifier == 'l') { + long *ip = va_arg(args, long *); + *ip = (str - buf); + } else { + int *ip = va_arg(args, int *); + *ip = (str - buf); + } + continue; + + case '%': + *str++ = '%'; + continue; + + /* integer number formats - set up the flags and "break" */ + case 'o': + base = 8; + break; + + case 'X': + flags |= LARGE; + case 'x': + base = 16; + break; + + case 'd': + case 'i': + flags |= SIGN; + case 'u': + break; + + default: + *str++ = '%'; + if (*fmt) + *str++ = *fmt; + else + --fmt; + continue; + } + if (qualifier == 'l') + num = va_arg(args, unsigned long); + else if (qualifier == 'h') { + num = (unsigned short)va_arg(args, int); + if (flags & SIGN) + num = (short)num; + } else if (flags & SIGN) + num = va_arg(args, int); + else + num = va_arg(args, unsigned int); + str = number(str, num, base, field_width, precision, flags); + } + *str = '\0'; + return str - buf; +} + +int sprintf(char *buf, const char *fmt, ...) +{ + va_list args; + int i; + + va_start(args, fmt); + i = vsprintf(buf, fmt, args); + va_end(args); + return i; +} + +int printf(const char *fmt, ...) +{ + char printf_buf[1024]; + va_list args; + int printed; + + va_start(args, fmt); + printed = vsprintf(printf_buf, fmt, args); + va_end(args); + + puts(printf_buf); + + return printed; +} diff --git a/arch/i386/boot/setup.S b/arch/i386/boot/setup.S deleted file mode 100644 index 6dbcc95b212..00000000000 --- a/arch/i386/boot/setup.S +++ /dev/null @@ -1,1075 +0,0 @@ -/* - * setup.S Copyright (C) 1991, 1992 Linus Torvalds - * - * setup.s is responsible for getting the system data from the BIOS, - * and putting them into the appropriate places in system memory. - * both setup.s and system has been loaded by the bootblock. - * - * This code asks the bios for memory/disk/other parameters, and - * puts them in a "safe" place: 0x90000-0x901FF, ie where the - * boot-block used to be. It is then up to the protected mode - * system to read them from there before the area is overwritten - * for buffer-blocks. - * - * Move PS/2 aux init code to psaux.c - * (troyer@saifr00.cfsat.Honeywell.COM) 03Oct92 - * - * some changes and additional features by Christoph Niemann, - * March 1993/June 1994 (Christoph.Niemann@linux.org) - * - * add APM BIOS checking by Stephen Rothwell, May 1994 - * (sfr@canb.auug.org.au) - * - * High load stuff, initrd support and position independency - * by Hans Lermen & Werner Almesberger, February 1996 - * <lermen@elserv.ffm.fgan.de>, <almesber@lrc.epfl.ch> - * - * Video handling moved to video.S by Martin Mares, March 1996 - * <mj@k332.feld.cvut.cz> - * - * Extended memory detection scheme retwiddled by orc@pell.chi.il.us (david - * parsons) to avoid loadlin confusion, July 1997 - * - * Transcribed from Intel (as86) -> AT&T (gas) by Chris Noe, May 1999. - * <stiker@northlink.com> - * - * Fix to work around buggy BIOSes which don't use carry bit correctly - * and/or report extended memory in CX/DX for e801h memory size detection - * call. As a result the kernel got wrong figures. The int15/e801h docs - * from Ralf Brown interrupt list seem to indicate AX/BX should be used - * anyway. So to avoid breaking many machines (presumably there was a reason - * to orginally use CX/DX instead of AX/BX), we do a kludge to see - * if CX/DX have been changed in the e801 call and if so use AX/BX . - * Michael Miller, April 2001 <michaelm@mjmm.org> - * - * New A20 code ported from SYSLINUX by H. Peter Anvin. AMD Elan bugfixes - * by Robert Schwebel, December 2001 <robert@schwebel.de> - */ - -#include <asm/segment.h> -#include <linux/utsrelease.h> -#include <linux/compile.h> -#include <asm/boot.h> -#include <asm/e820.h> -#include <asm/page.h> -#include <asm/setup.h> - -/* Signature words to ensure LILO loaded us right */ -#define SIG1 0xAA55 -#define SIG2 0x5A5A - -INITSEG = DEF_INITSEG # 0x9000, we move boot here, out of the way -SYSSEG = DEF_SYSSEG # 0x1000, system loaded at 0x10000 (65536). -SETUPSEG = DEF_SETUPSEG # 0x9020, this is the current segment - # ... and the former contents of CS - -DELTA_INITSEG = SETUPSEG - INITSEG # 0x0020 - -.code16 -.globl begtext, begdata, begbss, endtext, enddata, endbss - -.text -begtext: -.data -begdata: -.bss -begbss: -.text - -start: - jmp trampoline - -# This is the setup header, and it must start at %cs:2 (old 0x9020:2) - - .ascii "HdrS" # header signature - .word 0x0206 # header version number (>= 0x0105) - # or else old loadlin-1.5 will fail) -realmode_swtch: .word 0, 0 # default_switch, SETUPSEG -start_sys_seg: .word SYSSEG - .word kernel_version # pointing to kernel version string - # above section of header is compatible - # with loadlin-1.5 (header v1.5). Don't - # change it. - -type_of_loader: .byte 0 # = 0, old one (LILO, Loadlin, - # Bootlin, SYSLX, bootsect...) - # See Documentation/i386/boot.txt for - # assigned ids - -# flags, unused bits must be zero (RFU) bit within loadflags -loadflags: -LOADED_HIGH = 1 # If set, the kernel is loaded high -CAN_USE_HEAP = 0x80 # If set, the loader also has set - # heap_end_ptr to tell how much - # space behind setup.S can be used for - # heap purposes. - # Only the loader knows what is free -#ifndef __BIG_KERNEL__ - .byte 0 -#else - .byte LOADED_HIGH -#endif - -setup_move_size: .word 0x8000 # size to move, when setup is not - # loaded at 0x90000. We will move setup - # to 0x90000 then just before jumping - # into the kernel. However, only the - # loader knows how much data behind - # us also needs to be loaded. - -code32_start: # here loaders can put a different - # start address for 32-bit code. -#ifndef __BIG_KERNEL__ - .long 0x1000 # 0x1000 = default for zImage -#else - .long 0x100000 # 0x100000 = default for big kernel -#endif - -ramdisk_image: .long 0 # address of loaded ramdisk image - # Here the loader puts the 32-bit - # address where it loaded the image. - # This only will be read by the kernel. - -ramdisk_size: .long 0 # its size in bytes - -bootsect_kludge: - .long 0 # obsolete - -heap_end_ptr: .word modelist+1024 # (Header version 0x0201 or later) - # space from here (exclusive) down to - # end of setup code can be used by setup - # for local heap purposes. - -pad1: .word 0 -cmd_line_ptr: .long 0 # (Header version 0x0202 or later) - # If nonzero, a 32-bit pointer - # to the kernel command line. - # The command line should be - # located between the start of - # setup and the end of low - # memory (0xa0000), or it may - # get overwritten before it - # gets read. If this field is - # used, there is no longer - # anything magical about the - # 0x90000 segment; the setup - # can be located anywhere in - # low memory 0x10000 or higher. - -ramdisk_max: .long (-__PAGE_OFFSET-(512 << 20)-1) & 0x7fffffff - # (Header version 0x0203 or later) - # The highest safe address for - # the contents of an initrd - -kernel_alignment: .long CONFIG_PHYSICAL_ALIGN #physical addr alignment - #required for protected mode - #kernel -#ifdef CONFIG_RELOCATABLE -relocatable_kernel: .byte 1 -#else -relocatable_kernel: .byte 0 -#endif -pad2: .byte 0 -pad3: .word 0 - -cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line, - #added with boot protocol - #version 2.06 - -trampoline: call start_of_setup - .align 16 - # The offset at this point is 0x240 - .space (0xeff-0x240+1) # E820 & EDD space (ending at 0xeff) -# End of setup header ##################################################### - -start_of_setup: -# Bootlin depends on this being done early - movw $0x01500, %ax - movb $0x81, %dl - int $0x13 - -#ifdef SAFE_RESET_DISK_CONTROLLER -# Reset the disk controller. - movw $0x0000, %ax - movb $0x80, %dl - int $0x13 -#endif - -# Set %ds = %cs, we know that SETUPSEG = %cs at this point - movw %cs, %ax # aka SETUPSEG - movw %ax, %ds -# Check signature at end of setup - cmpw $SIG1, setup_sig1 - jne bad_sig - - cmpw $SIG2, setup_sig2 - jne bad_sig - - jmp good_sig1 - -# Routine to print asciiz string at ds:si -prtstr: - lodsb - andb %al, %al - jz fin - - call prtchr - jmp prtstr - -fin: ret - -# Space printing -prtsp2: call prtspc # Print double space -prtspc: movb $0x20, %al # Print single space (note: fall-thru) - -# Part of above routine, this one just prints ascii al -prtchr: pushw %ax - pushw %cx - movw $7,%bx - movw $0x01, %cx - movb $0x0e, %ah - int $0x10 - popw %cx - popw %ax - ret - -beep: movb $0x07, %al - jmp prtchr - -no_sig_mess: .string "No setup signature found ..." - -good_sig1: - jmp good_sig - -# We now have to find the rest of the setup code/data -bad_sig: - movw %cs, %ax # SETUPSEG - subw $DELTA_INITSEG, %ax # INITSEG - movw %ax, %ds - xorb %bh, %bh - movb (497), %bl # get setup sect from bootsect - subw $4, %bx # LILO loads 4 sectors of setup - shlw $8, %bx # convert to words (1sect=2^8 words) - movw %bx, %cx - shrw $3, %bx # convert to segment - addw $SYSSEG, %bx - movw %bx, %cs:start_sys_seg -# Move rest of setup code/data to here - movw $2048, %di # four sectors loaded by LILO - subw %si, %si - pushw %cs - popw %es - movw $SYSSEG, %ax - movw %ax, %ds - rep - movsw - movw %cs, %ax # aka SETUPSEG - movw %ax, %ds - cmpw $SIG1, setup_sig1 - jne no_sig - - cmpw $SIG2, setup_sig2 - jne no_sig - - jmp good_sig - -no_sig: - lea no_sig_mess, %si - call prtstr - -no_sig_loop: - hlt - jmp no_sig_loop - -good_sig: - movw %cs, %ax # aka SETUPSEG - subw $DELTA_INITSEG, %ax # aka INITSEG - movw %ax, %ds -# Check if an old loader tries to load a big-kernel - testb $LOADED_HIGH, %cs:loadflags # Do we have a big kernel? - jz loader_ok # No, no danger for old loaders. - - cmpb $0, %cs:type_of_loader # Do we have a loader that - # can deal with us? - jnz loader_ok # Yes, continue. - - pushw %cs # No, we have an old loader, - popw %ds # die. - lea loader_panic_mess, %si - call prtstr - - jmp no_sig_loop - -loader_panic_mess: .string "Wrong loader, giving up..." - -# check minimum cpuid -# we do this here because it is the last place we can actually -# show a user visible error message. Later the video modus -# might be already messed up. -loader_ok: - call verify_cpu - testl %eax,%eax - jz cpu_ok - movw %cs,%ax # aka SETUPSEG - movw %ax,%ds - lea cpu_panic_mess,%si - call prtstr -1: jmp 1b - -cpu_panic_mess: - .asciz "PANIC: CPU too old for this kernel." - -#include "../kernel/verify_cpu.S" - -cpu_ok: -# Get memory size (extended mem, kB) - - xorl %eax, %eax - movl %eax, (0x1e0) -#ifndef STANDARD_MEMORY_BIOS_CALL - movb %al, (E820NR) -# Try three different memory detection schemes. First, try -# e820h, which lets us assemble a memory map, then try e801h, -# which returns a 32-bit memory size, and finally 88h, which -# returns 0-64m - -# method E820H: -# the memory map from hell. e820h returns memory classified into -# a whole bunch of different types, and allows memory holes and -# everything. We scan through this memory map and build a list -# of the first 32 memory areas, which we return at [E820MAP]. -# This is documented at http://www.acpi.info/, in the ACPI 2.0 specification. - -#define SMAP 0x534d4150 - -meme820: - xorl %ebx, %ebx # continuation counter - movw $E820MAP, %di # point into the whitelist - # so we can have the bios - # directly write into it. - -jmpe820: - movl $0x0000e820, %eax # e820, upper word zeroed - movl $SMAP, %edx # ascii 'SMAP' - movl $20, %ecx # size of the e820rec - pushw %ds # data record. - popw %es - int $0x15 # make the call - jc bail820 # fall to e801 if it fails - - cmpl $SMAP, %eax # check the return is `SMAP' - jne bail820 # fall to e801 if it fails - -# cmpl $1, 16(%di) # is this usable memory? -# jne again820 - - # If this is usable memory, we save it by simply advancing %di by - # sizeof(e820rec). - # -good820: - movb (E820NR), %al # up to 128 entries - cmpb $E820MAX, %al - jae bail820 - - incb (E820NR) - movw %di, %ax - addw $20, %ax - movw %ax, %di -again820: - cmpl $0, %ebx # check to see if - jne jmpe820 # %ebx is set to EOF -bail820: - - -# method E801H: -# memory size is in 1k chunksizes, to avoid confusing loadlin. -# we store the 0xe801 memory size in a completely different place, -# because it will most likely be longer than 16 bits. -# (use 1e0 because that's what Larry Augustine uses in his -# alternative new memory detection scheme, and it's sensible -# to write everything into the same place.) - -meme801: - stc # fix to work around buggy - xorw %cx,%cx # BIOSes which don't clear/set - xorw %dx,%dx # carry on pass/error of - # e801h memory size call - # or merely pass cx,dx though - # without changing them. - movw $0xe801, %ax - int $0x15 - jc mem88 - - cmpw $0x0, %cx # Kludge to handle BIOSes - jne e801usecxdx # which report their extended - cmpw $0x0, %dx # memory in AX/BX rather than - jne e801usecxdx # CX/DX. The spec I have read - movw %ax, %cx # seems to indicate AX/BX - movw %bx, %dx # are more reasonable anyway... - -e801usecxdx: - andl $0xffff, %edx # clear sign extend - shll $6, %edx # and go from 64k to 1k chunks - movl %edx, (0x1e0) # store extended memory size - andl $0xffff, %ecx # clear sign extend - addl %ecx, (0x1e0) # and add lower memory into - # total size. - -# Ye Olde Traditional Methode. Returns the memory size (up to 16mb or -# 64mb, depending on the bios) in ax. -mem88: - -#endif - movb $0x88, %ah - int $0x15 - movw %ax, (2) - -# Set the keyboard repeat rate to the max - movw $0x0305, %ax - xorw %bx, %bx - int $0x16 - -# Check for video adapter and its parameters and allow the -# user to browse video modes. - call video # NOTE: we need %ds pointing - # to bootsector - -# Get hd0 data... - xorw %ax, %ax - movw %ax, %ds - ldsw (4 * 0x41), %si - movw %cs, %ax # aka SETUPSEG - subw $DELTA_INITSEG, %ax # aka INITSEG - pushw %ax - movw %ax, %es - movw $0x0080, %di - movw $0x10, %cx - pushw %cx - cld - rep - movsb -# Get hd1 data... - xorw %ax, %ax - movw %ax, %ds - ldsw (4 * 0x46), %si - popw %cx - popw %es - movw $0x0090, %di - rep - movsb -# Check that there IS a hd1 :-) - movw $0x01500, %ax - movb $0x81, %dl - int $0x13 - jc no_disk1 - - cmpb $3, %ah - je is_disk1 - -no_disk1: - movw %cs, %ax # aka SETUPSEG - subw $DELTA_INITSEG, %ax # aka INITSEG - movw %ax, %es - movw $0x0090, %di - movw $0x10, %cx - xorw %ax, %ax - cld - rep - stosb -is_disk1: -# check for Micro Channel (MCA) bus - movw %cs, %ax # aka SETUPSEG - subw $DELTA_INITSEG, %ax # aka INITSEG - movw %ax, %ds - xorw %ax, %ax - movw %ax, (0xa0) # set table length to 0 - movb $0xc0, %ah - stc - int $0x15 # moves feature table to es:bx - jc no_mca - - pushw %ds - movw %es, %ax - movw %ax, %ds - movw %cs, %ax # aka SETUPSEG - subw $DELTA_INITSEG, %ax # aka INITSEG - movw %ax, %es - movw %bx, %si - movw $0xa0, %di - movw (%si), %cx - addw $2, %cx # table length is a short - cmpw $0x10, %cx - jc sysdesc_ok - - movw $0x10, %cx # we keep only first 16 bytes -sysdesc_ok: - rep - movsb - popw %ds -no_mca: -#ifdef CONFIG_X86_VOYAGER - movb $0xff, 0x40 # flag on config found - movb $0xc0, %al - mov $0xff, %ah - int $0x15 # put voyager config info at es:di - jc no_voyager - movw $0x40, %si # place voyager info in apm table - cld - movw $7, %cx -voyager_rep: - movb %es:(%di), %al - movb %al,(%si) - incw %di - incw %si - decw %cx - jnz voyager_rep -no_voyager: -#endif -# Check for PS/2 pointing device - movw %cs, %ax # aka SETUPSEG - subw $DELTA_INITSEG, %ax # aka INITSEG - movw %ax, %ds - movb $0, (0x1ff) # default is no pointing device - int $0x11 # int 0x11: equipment list - testb $0x04, %al # check if mouse installed - jz no_psmouse - - movb $0xAA, (0x1ff) # device present -no_psmouse: - -#if defined(CONFIG_X86_SPEEDSTEP_SMI) || defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE) - movl $0x0000E980, %eax # IST Support - movl $0x47534943, %edx # Request value - int $0x15 - - movl %eax, (96) - movl %ebx, (100) - movl %ecx, (104) - movl %edx, (108) -#endif - -#if defined(CONFIG_APM) || defined(CONFIG_APM_MODULE) -# Then check for an APM BIOS... - # %ds points to the bootsector - movw $0, 0x40 # version = 0 means no APM BIOS - movw $0x05300, %ax # APM BIOS installation check - xorw %bx, %bx - int $0x15 - jc done_apm_bios # Nope, no APM BIOS - - cmpw $0x0504d, %bx # Check for "PM" signature - jne done_apm_bios # No signature, no APM BIOS - - andw $0x02, %cx # Is 32 bit supported? - je done_apm_bios # No 32-bit, no (good) APM BIOS - - movw $0x05304, %ax # Disconnect first just in case - xorw %bx, %bx - int $0x15 # ignore return code - movw $0x05303, %ax # 32 bit connect - xorl %ebx, %ebx - xorw %cx, %cx # paranoia :-) - xorw %dx, %dx # ... - xorl %esi, %esi # ... - xorw %di, %di # ... - int $0x15 - jc no_32_apm_bios # Ack, error. - - movw %ax, (66) # BIOS code segment - movl %ebx, (68) # BIOS entry point offset - movw %cx, (72) # BIOS 16 bit code segment - movw %dx, (74) # BIOS data segment - movl %esi, (78) # BIOS code segment lengths - movw %di, (82) # BIOS data segment length -# Redo the installation check as the 32 bit connect -# modifies the flags returned on some BIOSs - movw $0x05300, %ax # APM BIOS installation check - xorw %bx, %bx - xorw %cx, %cx # paranoia - int $0x15 - jc apm_disconnect # error -> shouldn't happen - - cmpw $0x0504d, %bx # check for "PM" signature - jne apm_disconnect # no sig -> shouldn't happen - - movw %ax, (64) # record the APM BIOS version - movw %cx, (76) # and flags - jmp done_apm_bios - -apm_disconnect: # Tidy up - movw $0x05304, %ax # Disconnect - xorw %bx, %bx - int $0x15 # ignore return code - - jmp done_apm_bios - -no_32_apm_bios: - andw $0xfffd, (76) # remove 32 bit support bit -done_apm_bios: -#endif - -#include "edd.S" - -# Now we want to move to protected mode ... - cmpw $0, %cs:realmode_swtch - jz rmodeswtch_normal - - lcall *%cs:realmode_swtch - - jmp rmodeswtch_end - -rmodeswtch_normal: - pushw %cs - call default_switch - -rmodeswtch_end: -# Now we move the system to its rightful place ... but we check if we have a -# big-kernel. In that case we *must* not move it ... - testb $LOADED_HIGH, %cs:loadflags - jz do_move0 # .. then we have a normal low - # loaded zImage - # .. or else we have a high - # loaded bzImage - jmp end_move # ... and we skip moving - -do_move0: - movw $0x100, %ax # start of destination segment - movw %cs, %bp # aka SETUPSEG - subw $DELTA_INITSEG, %bp # aka INITSEG - movw %cs:start_sys_seg, %bx # start of source segment - cld -do_move: - movw %ax, %es # destination segment - incb %ah # instead of add ax,#0x100 - movw %bx, %ds # source segment - addw $0x100, %bx - subw %di, %di - subw %si, %si - movw $0x800, %cx - rep - movsw - cmpw %bp, %bx # assume start_sys_seg > 0x200, - # so we will perhaps read one - # page more than needed, but - # never overwrite INITSEG - # because destination is a - # minimum one page below source - jb do_move - -end_move: -# then we load the segment descriptors - movw %cs, %ax # aka SETUPSEG - movw %ax, %ds - -# Check whether we need to be downward compatible with version <=201 - cmpl $0, cmd_line_ptr - jne end_move_self # loader uses version >=202 features - cmpb $0x20, type_of_loader - je end_move_self # bootsect loader, we know of it - -# Boot loader doesnt support boot protocol version 2.02. -# If we have our code not at 0x90000, we need to move it there now. -# We also then need to move the params behind it (commandline) -# Because we would overwrite the code on the current IP, we move -# it in two steps, jumping high after the first one. - movw %cs, %ax - cmpw $SETUPSEG, %ax - je end_move_self - - cli # make sure we really have - # interrupts disabled ! - # because after this the stack - # should not be used - subw $DELTA_INITSEG, %ax # aka INITSEG - movw %ss, %dx - cmpw %ax, %dx - jb move_self_1 - - addw $INITSEG, %dx - subw %ax, %dx # this will go into %ss after - # the move -move_self_1: - movw %ax, %ds - movw $INITSEG, %ax # real INITSEG - movw %ax, %es - movw %cs:setup_move_size, %cx - std # we have to move up, so we use - # direction down because the - # areas may overlap - movw %cx, %di - decw %di - movw %di, %si - subw $move_self_here+0x200, %cx - rep - movsb - ljmp $SETUPSEG, $move_self_here - -move_self_here: - movw $move_self_here+0x200, %cx - rep - movsb - movw $SETUPSEG, %ax - movw %ax, %ds - movw %dx, %ss -end_move_self: # now we are at the right place - -# -# Enable A20. This is at the very best an annoying procedure. -# A20 code ported from SYSLINUX 1.52-1.63 by H. Peter Anvin. -# AMD Elan bug fix by Robert Schwebel. -# - -#if defined(CONFIG_X86_ELAN) - movb $0x02, %al # alternate A20 gate - outb %al, $0x92 # this works on SC410/SC520 -a20_elan_wait: - call a20_test - jz a20_elan_wait - jmp a20_done -#endif - - -A20_TEST_LOOPS = 32 # Iterations per wait -A20_ENABLE_LOOPS = 255 # Total loops to try - - -#ifndef CONFIG_X86_VOYAGER -a20_try_loop: - - # First, see if we are on a system with no A20 gate. -a20_none: - call a20_test - jnz a20_done - - # Next, try the BIOS (INT 0x15, AX=0x2401) -a20_bios: - movw $0x2401, %ax - pushfl # Be paranoid about flags - int $0x15 - popfl - - call a20_test - jnz a20_done - - # Try enabling A20 through the keyboard controller -#endif /* CONFIG_X86_VOYAGER */ -a20_kbc: - call empty_8042 - -#ifndef CONFIG_X86_VOYAGER - call a20_test # Just in case the BIOS worked - jnz a20_done # but had a delayed reaction. -#endif - - movb $0xD1, %al # command write - outb %al, $0x64 - call empty_8042 - - movb $0xDF, %al # A20 on - outb %al, $0x60 - call empty_8042 - -#ifndef CONFIG_X86_VOYAGER - # Wait until a20 really *is* enabled; it can take a fair amount of - # time on certain systems; Toshiba Tecras are known to have this - # problem. -a20_kbc_wait: - xorw %cx, %cx -a20_kbc_wait_loop: - call a20_test - jnz a20_done - loop a20_kbc_wait_loop - - # Final attempt: use "configuration port A" -a20_fast: - inb $0x92, %al # Configuration Port A - orb $0x02, %al # "fast A20" version - andb $0xFE, %al # don't accidentally reset - outb %al, $0x92 - - # Wait for configuration port A to take effect -a20_fast_wait: - xorw %cx, %cx -a20_fast_wait_loop: - call a20_test - jnz a20_done - loop a20_fast_wait_loop - - # A20 is still not responding. Try frobbing it again. - # - decb (a20_tries) - jnz a20_try_loop - - movw $a20_err_msg, %si - call prtstr - -a20_die: - hlt - jmp a20_die - -a20_tries: - .byte A20_ENABLE_LOOPS - -a20_err_msg: - .ascii "linux: fatal error: A20 gate not responding!" - .byte 13, 10, 0 - - # If we get here, all is good -a20_done: - -#endif /* CONFIG_X86_VOYAGER */ -# set up gdt and idt and 32bit start address - lidt idt_48 # load idt with 0,0 - xorl %eax, %eax # Compute gdt_base - movw %ds, %ax # (Convert %ds:gdt to a linear ptr) - shll $4, %eax - addl %eax, code32 - addl $gdt, %eax - movl %eax, (gdt_48+2) - lgdt gdt_48 # load gdt with whatever is - # appropriate - -# make sure any possible coprocessor is properly reset.. - xorw %ax, %ax - outb %al, $0xf0 - call delay - - outb %al, $0xf1 - call delay - -# well, that went ok, I hope. Now we mask all interrupts - the rest -# is done in init_IRQ(). - movb $0xFF, %al # mask all interrupts for now - outb %al, $0xA1 - call delay - - movb $0xFB, %al # mask all irq's but irq2 which - outb %al, $0x21 # is cascaded - -# Well, that certainly wasn't fun :-(. Hopefully it works, and we don't -# need no steenking BIOS anyway (except for the initial loading :-). -# The BIOS-routine wants lots of unnecessary data, and it's less -# "interesting" anyway. This is how REAL programmers do it. -# -# Well, now's the time to actually move into protected mode. To make -# things as simple as possible, we do no register set-up or anything, -# we let the gnu-compiled 32-bit programs do that. We just jump to -# absolute address 0x1000 (or the loader supplied one), -# in 32-bit protected mode. -# -# Note that the short jump isn't strictly needed, although there are -# reasons why it might be a good idea. It won't hurt in any case. - movw $1, %ax # protected mode (PE) bit - lmsw %ax # This is it! - jmp flush_instr - -flush_instr: - xorw %bx, %bx # Flag to indicate a boot - xorl %esi, %esi # Pointer to real-mode code - movw %cs, %si - subw $DELTA_INITSEG, %si - shll $4, %esi # Convert to 32-bit pointer - -# jump to startup_32 in arch/i386/boot/compressed/head.S -# -# NOTE: For high loaded big kernels we need a -# jmpi 0x100000,__BOOT_CS -# -# but we yet haven't reloaded the CS register, so the default size -# of the target offset still is 16 bit. -# However, using an operand prefix (0x66), the CPU will properly -# take our 48 bit far pointer. (INTeL 80386 Programmer's Reference -# Manual, Mixing 16-bit and 32-bit code, page 16-6) - - .byte 0x66, 0xea # prefix + jmpi-opcode -code32: .long startup_32 # will be set to %cs+startup_32 - .word __BOOT_CS -.code32 -startup_32: - movl $(__BOOT_DS), %eax - movl %eax, %ds - movl %eax, %es - movl %eax, %fs - movl %eax, %gs - movl %eax, %ss - - xorl %eax, %eax -1: incl %eax # check that A20 really IS enabled - movl %eax, 0x00000000 # loop forever if it isn't - cmpl %eax, 0x00100000 - je 1b - - # Jump to the 32bit entry point - jmpl *(code32_start - start + (DELTA_INITSEG << 4))(%esi) -.code16 - -# Here's a bunch of information about your current kernel.. -kernel_version: .ascii UTS_RELEASE - .ascii " (" - .ascii LINUX_COMPILE_BY - .ascii "@" - .ascii LINUX_COMPILE_HOST - .ascii ") " - .ascii UTS_VERSION - .byte 0 - -# This is the default real mode switch routine. -# to be called just before protected mode transition -default_switch: - cli # no interrupts allowed ! - movb $0x80, %al # disable NMI for bootup - # sequence - outb %al, $0x70 - lret - - -#ifndef CONFIG_X86_VOYAGER -# This routine tests whether or not A20 is enabled. If so, it -# exits with zf = 0. -# -# The memory address used, 0x200, is the int $0x80 vector, which -# should be safe. - -A20_TEST_ADDR = 4*0x80 - -a20_test: - pushw %cx - pushw %ax - xorw %cx, %cx - movw %cx, %fs # Low memory - decw %cx - movw %cx, %gs # High memory area - movw $A20_TEST_LOOPS, %cx - movw %fs:(A20_TEST_ADDR), %ax - pushw %ax -a20_test_wait: - incw %ax - movw %ax, %fs:(A20_TEST_ADDR) - call delay # Serialize and make delay constant - cmpw %gs:(A20_TEST_ADDR+0x10), %ax - loope a20_test_wait - - popw %fs:(A20_TEST_ADDR) - popw %ax - popw %cx - ret - -#endif /* CONFIG_X86_VOYAGER */ - -# This routine checks that the keyboard command queue is empty -# (after emptying the output buffers) -# -# Some machines have delusions that the keyboard buffer is always full -# with no keyboard attached... -# -# If there is no keyboard controller, we will usually get 0xff -# to all the reads. With each IO taking a microsecond and -# a timeout of 100,000 iterations, this can take about half a -# second ("delay" == outb to port 0x80). That should be ok, -# and should also be plenty of time for a real keyboard controller -# to empty. -# - -empty_8042: - pushl %ecx - movl $100000, %ecx - -empty_8042_loop: - decl %ecx - jz empty_8042_end_loop - - call delay - - inb $0x64, %al # 8042 status port - testb $1, %al # output buffer? - jz no_output - - call delay - inb $0x60, %al # read it - jmp empty_8042_loop - -no_output: - testb $2, %al # is input buffer full? - jnz empty_8042_loop # yes - loop -empty_8042_end_loop: - popl %ecx - ret - -# Read the cmos clock. Return the seconds in al -gettime: - pushw %cx - movb $0x02, %ah - int $0x1a - movb %dh, %al # %dh contains the seconds - andb $0x0f, %al - movb %dh, %ah - movb $0x04, %cl - shrb %cl, %ah - aad - popw %cx - ret - -# Delay is needed after doing I/O -delay: - outb %al,$0x80 - ret - -# Descriptor tables -# -# NOTE: The intel manual says gdt should be sixteen bytes aligned for -# efficiency reasons. However, there are machines which are known not -# to boot with misaligned GDTs, so alter this at your peril! If you alter -# GDT_ENTRY_BOOT_CS (in asm/segment.h) remember to leave at least two -# empty GDT entries (one for NULL and one reserved). -# -# NOTE: On some CPUs, the GDT must be 8 byte aligned. This is -# true for the Voyager Quad CPU card which will not boot without -# This directive. 16 byte aligment is recommended by intel. -# - .align 16 -gdt: - .fill GDT_ENTRY_BOOT_CS,8,0 - - .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb) - .word 0 # base address = 0 - .word 0x9A00 # code read/exec - .word 0x00CF # granularity = 4096, 386 - # (+5th nibble of limit) - - .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb) - .word 0 # base address = 0 - .word 0x9200 # data read/write - .word 0x00CF # granularity = 4096, 386 - # (+5th nibble of limit) -gdt_end: - .align 4 - - .word 0 # alignment byte -idt_48: - .word 0 # idt limit = 0 - .word 0, 0 # idt base = 0L - - .word 0 # alignment byte -gdt_48: - .word gdt_end - gdt - 1 # gdt limit - .word 0, 0 # gdt base (filled in later) - -# Include video setup & detection code - -#include "video.S" - -# Setup signature -- must be last -setup_sig1: .word SIG1 -setup_sig2: .word SIG2 - -# After this point, there is some free space which is used by the video mode -# handling code to store the temporary mode table (not used by the kernel). - -modelist: - -.text -endtext: -.data -enddata: -.bss -endbss: diff --git a/arch/i386/boot/setup.ld b/arch/i386/boot/setup.ld new file mode 100644 index 00000000000..df9234b3a5e --- /dev/null +++ b/arch/i386/boot/setup.ld @@ -0,0 +1,54 @@ +/* + * setup.ld + * + * Linker script for the i386 setup code + */ +OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") +OUTPUT_ARCH(i386) +ENTRY(_start) + +SECTIONS +{ + . = 0; + .bstext : { *(.bstext) } + .bsdata : { *(.bsdata) } + + . = 497; + .header : { *(.header) } + .inittext : { *(.inittext) } + .initdata : { *(.initdata) } + .text : { *(.text*) } + + . = ALIGN(16); + .rodata : { *(.rodata*) } + + .videocards : { + video_cards = .; + *(.videocards) + video_cards_end = .; + } + + . = ALIGN(16); + .data : { *(.data*) } + + .signature : { + setup_sig = .; + LONG(0x5a5aaa55) + } + + + . = ALIGN(16); + .bss : + { + __bss_start = .; + *(.bss) + __bss_end = .; + } + . = ALIGN(16); + _end = .; + + /DISCARD/ : { *(.note*) } + + . = ASSERT(_end <= 0x8000, "Setup too big!"); + . = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!"); +} diff --git a/arch/i386/boot/string.c b/arch/i386/boot/string.c new file mode 100644 index 00000000000..481a2209778 --- /dev/null +++ b/arch/i386/boot/string.c @@ -0,0 +1,52 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright 2007 rPath, Inc. - All Rights Reserved + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + * + * ----------------------------------------------------------------------- */ + +/* + * arch/i386/boot/string.c + * + * Very basic string functions + */ + +#include "boot.h" + +int strcmp(const char *str1, const char *str2) +{ + const unsigned char *s1 = (const unsigned char *)str1; + const unsigned char *s2 = (const unsigned char *)str2; + int delta = 0; + + while (*s1 || *s2) { + delta = *s2 - *s1; + if (delta) + return delta; + s1++; + s2++; + } + return 0; +} + +size_t strnlen(const char *s, size_t maxlen) +{ + const char *es = s; + while (*es && maxlen) { + es++; + maxlen--; + } + + return (es - s); +} + +unsigned int atou(const char *s) +{ + unsigned int i = 0; + while (isdigit(*s)) + i = i * 10 + (*s++ - '0'); + return i; +} diff --git a/arch/i386/boot/tools/build.c b/arch/i386/boot/tools/build.c index 05798419a6a..b4248740ff0 100644 --- a/arch/i386/boot/tools/build.c +++ b/arch/i386/boot/tools/build.c @@ -1,13 +1,12 @@ /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 1997 Martin Mares + * Copyright (C) 2007 H. Peter Anvin */ /* - * This file builds a disk-image from three different files: + * This file builds a disk-image from two different files: * - * - bootsect: compatibility mbr which prints an error message if - * someone tries to boot the kernel directly. * - setup: 8086 machine code, sets up system parm * - system: 80386 code for actual system * @@ -21,6 +20,7 @@ * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 * Cross compiling fixes by Gertjan van Wingerde, July 1996 * Rewritten by Martin Mares, April 1997 + * Substantially overhauled by H. Peter Anvin, April 2007 */ #include <stdio.h> @@ -32,23 +32,25 @@ #include <sys/sysmacros.h> #include <unistd.h> #include <fcntl.h> +#include <sys/mman.h> #include <asm/boot.h> -typedef unsigned char byte; -typedef unsigned short word; -typedef unsigned long u32; +typedef unsigned char u8; +typedef unsigned short u16; +typedef unsigned long u32; #define DEFAULT_MAJOR_ROOT 0 #define DEFAULT_MINOR_ROOT 0 -/* Minimal number of setup sectors (see also bootsect.S) */ -#define SETUP_SECTS 4 +/* Minimal number of setup sectors */ +#define SETUP_SECT_MIN 5 +#define SETUP_SECT_MAX 64 -byte buf[1024]; -int fd; +/* This must be large enough to hold the entire setup */ +u8 buf[SETUP_SECT_MAX*512]; int is_big_kernel; -void die(const char * str, ...) +static void die(const char * str, ...) { va_list args; va_start(args, str); @@ -57,15 +59,9 @@ void die(const char * str, ...) exit(1); } -void file_open(const char *name) +static void usage(void) { - if ((fd = open(name, O_RDONLY, 0)) < 0) - die("Unable to open `%s': %m", name); -} - -void usage(void) -{ - die("Usage: build [-b] bootsect setup system [rootdev] [> image]"); + die("Usage: build [-b] setup system [rootdev] [> image]"); } int main(int argc, char ** argv) @@ -73,27 +69,30 @@ int main(int argc, char ** argv) unsigned int i, sz, setup_sectors; int c; u32 sys_size; - byte major_root, minor_root; + u8 major_root, minor_root; struct stat sb; + FILE *file; + int fd; + void *kernel; if (argc > 2 && !strcmp(argv[1], "-b")) { is_big_kernel = 1; argc--, argv++; } - if ((argc < 4) || (argc > 5)) + if ((argc < 3) || (argc > 4)) usage(); - if (argc > 4) { - if (!strcmp(argv[4], "CURRENT")) { + if (argc > 3) { + if (!strcmp(argv[3], "CURRENT")) { if (stat("/", &sb)) { perror("/"); die("Couldn't stat /"); } major_root = major(sb.st_dev); minor_root = minor(sb.st_dev); - } else if (strcmp(argv[4], "FLOPPY")) { - if (stat(argv[4], &sb)) { - perror(argv[4]); + } else if (strcmp(argv[3], "FLOPPY")) { + if (stat(argv[3], &sb)) { + perror(argv[3]); die("Couldn't stat root device."); } major_root = major(sb.st_rdev); @@ -108,79 +107,62 @@ int main(int argc, char ** argv) } fprintf(stderr, "Root device is (%d, %d)\n", major_root, minor_root); - file_open(argv[1]); - i = read(fd, buf, sizeof(buf)); - fprintf(stderr,"Boot sector %d bytes.\n",i); - if (i != 512) - die("Boot block must be exactly 512 bytes"); + /* Copy the setup code */ + file = fopen(argv[1], "r"); + if (!file) + die("Unable to open `%s': %m", argv[1]); + c = fread(buf, 1, sizeof(buf), file); + if (ferror(file)) + die("read-error on `setup'"); + if (c < 1024) + die("The setup must be at least 1024 bytes"); if (buf[510] != 0x55 || buf[511] != 0xaa) die("Boot block hasn't got boot flag (0xAA55)"); + fclose(file); + + /* Pad unused space with zeros */ + setup_sectors = (c + 511) / 512; + if (setup_sectors < SETUP_SECT_MIN) + setup_sectors = SETUP_SECT_MIN; + i = setup_sectors*512; + memset(buf+c, 0, i-c); + + /* Set the default root device */ buf[508] = minor_root; buf[509] = major_root; - if (write(1, buf, 512) != 512) - die("Write call failed"); - close (fd); - - file_open(argv[2]); /* Copy the setup code */ - for (i=0 ; (c=read(fd, buf, sizeof(buf)))>0 ; i+=c ) - if (write(1, buf, c) != c) - die("Write call failed"); - if (c != 0) - die("read-error on `setup'"); - close (fd); - - setup_sectors = (i + 511) / 512; /* Pad unused space with zeros */ - /* for compatibility with ancient versions of LILO. */ - if (setup_sectors < SETUP_SECTS) - setup_sectors = SETUP_SECTS; - fprintf(stderr, "Setup is %d bytes.\n", i); - memset(buf, 0, sizeof(buf)); - while (i < setup_sectors * 512) { - c = setup_sectors * 512 - i; - if (c > sizeof(buf)) - c = sizeof(buf); - if (write(1, buf, c) != c) - die("Write call failed"); - i += c; - } - file_open(argv[3]); - if (fstat (fd, &sb)) - die("Unable to stat `%s': %m", argv[3]); + fprintf(stderr, "Setup is %d bytes (padded to %d bytes).\n", c, i); + + /* Open and stat the kernel file */ + fd = open(argv[2], O_RDONLY); + if (fd < 0) + die("Unable to open `%s': %m", argv[2]); + if (fstat(fd, &sb)) + die("Unable to stat `%s': %m", argv[2]); sz = sb.st_size; - fprintf (stderr, "System is %d kB\n", sz/1024); + fprintf (stderr, "System is %d kB\n", (sz+1023)/1024); + kernel = mmap(NULL, sz, PROT_READ, MAP_SHARED, fd, 0); + if (kernel == MAP_FAILED) + die("Unable to mmap '%s': %m", argv[2]); sys_size = (sz + 15) / 16; if (!is_big_kernel && sys_size > DEF_SYSSIZE) die("System is too big. Try using bzImage or modules."); - while (sz > 0) { - int l, n; - - l = (sz > sizeof(buf)) ? sizeof(buf) : sz; - if ((n=read(fd, buf, l)) != l) { - if (n < 0) - die("Error reading %s: %m", argv[3]); - else - die("%s: Unexpected EOF", argv[3]); - } - if (write(1, buf, l) != l) - die("Write failed"); - sz -= l; - } + + /* Patch the setup code with the appropriate size parameters */ + buf[0x1f1] = setup_sectors-1; + buf[0x1f4] = sys_size; + buf[0x1f5] = sys_size >> 8; + buf[0x1f6] = sys_size >> 16; + buf[0x1f7] = sys_size >> 24; + + if (fwrite(buf, 1, i, stdout) != i) + die("Writing setup failed"); + + /* Copy the kernel code */ + if (fwrite(kernel, 1, sz, stdout) != sz) + die("Writing kernel failed"); close(fd); - if (lseek(1, 497, SEEK_SET) != 497) /* Write sizes to the bootsector */ - die("Output: seek failed"); - buf[0] = setup_sectors; - if (write(1, buf, 1) != 1) - die("Write of setup sector count failed"); - if (lseek(1, 500, SEEK_SET) != 500) - die("Output: seek failed"); - buf[0] = (sys_size & 0xff); - buf[1] = ((sys_size >> 8) & 0xff); - buf[2] = ((sys_size >> 16) & 0xff); - buf[3] = ((sys_size >> 24) & 0xff); - if (write(1, buf, 4) != 4) - die("Write of image length failed"); - - return 0; /* Everything is OK */ + /* Everything is OK */ + return 0; } diff --git a/arch/i386/boot/tty.c b/arch/i386/boot/tty.c new file mode 100644 index 00000000000..9c668aad351 --- /dev/null +++ b/arch/i386/boot/tty.c @@ -0,0 +1,112 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright 2007 rPath, Inc. - All Rights Reserved + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + * + * ----------------------------------------------------------------------- */ + +/* + * arch/i386/boot/tty.c + * + * Very simple screen I/O + * XXX: Probably should add very simple serial I/O? + */ + +#include "boot.h" + +/* + * These functions are in .inittext so they can be used to signal + * error during initialization. + */ + +void __attribute__((section(".inittext"))) putchar(int ch) +{ + unsigned char c = ch; + + if (c == '\n') + putchar('\r'); /* \n -> \r\n */ + + /* int $0x10 is known to have bugs involving touching registers + it shouldn't. Be extra conservative... */ + asm volatile("pushal; pushw %%ds; int $0x10; popw %%ds; popal" + : : "b" (0x0007), "c" (0x0001), "a" (0x0e00|ch)); +} + +void __attribute__((section(".inittext"))) puts(const char *str) +{ + int n = 0; + while (*str) { + putchar(*str++); + n++; + } +} + +/* + * Read the CMOS clock through the BIOS, and return the + * seconds in BCD. + */ + +static u8 gettime(void) +{ + u16 ax = 0x0200; + u16 cx, dx; + + asm("int $0x1a" + : "+a" (ax), "=c" (cx), "=d" (dx) + : : "ebx", "esi", "edi"); + + return dx >> 8; +} + +/* + * Read from the keyboard + */ +int getchar(void) +{ + u16 ax = 0; + asm("int $0x16" : "+a" (ax)); + + return ax & 0xff; +} + +static int kbd_pending(void) +{ + u8 pending; + asm("int $0x16; setnz %0" + : "=rm" (pending) + : "a" (0x0100)); + return pending; +} + +void kbd_flush(void) +{ + for (;;) { + if (!kbd_pending()) + break; + getchar(); + } +} + +int getchar_timeout(void) +{ + int cnt = 30; + int t0, t1; + + t0 = gettime(); + + while (cnt) { + if (kbd_pending()) + return getchar(); + + t1 = gettime(); + if (t0 != t1) { + cnt--; + t0 = t1; + } + } + + return 0; /* Timeout! */ +} diff --git a/arch/i386/boot/version.c b/arch/i386/boot/version.c new file mode 100644 index 00000000000..c61462f7d9a --- /dev/null +++ b/arch/i386/boot/version.c @@ -0,0 +1,23 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright 2007 rPath, Inc. - All Rights Reserved + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + * + * ----------------------------------------------------------------------- */ + +/* + * arch/i386/boot/version.c + * + * Kernel version string + */ + +#include "boot.h" +#include <linux/utsrelease.h> +#include <linux/compile.h> + +const char kernel_version[] = + UTS_RELEASE " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") " + UTS_VERSION; diff --git a/arch/i386/boot/vesa.h b/arch/i386/boot/vesa.h new file mode 100644 index 00000000000..ff5b73cd406 --- /dev/null +++ b/arch/i386/boot/vesa.h @@ -0,0 +1,79 @@ +/* ----------------------------------------------------------------------- * + * + * Copyright 1999-2007 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +#ifndef BOOT_VESA_H +#define BOOT_VESA_H + +typedef struct { + u16 off, seg; +} far_ptr; + +/* VESA General Information table */ +struct vesa_general_info { + u32 signature; /* 0 Magic number = "VESA" */ + u16 version; /* 4 */ + far_ptr vendor_string; /* 6 */ + u32 capabilities; /* 10 */ + far_ptr video_mode_ptr; /* 14 */ + u16 total_memory; /* 18 */ + + u16 oem_software_rev; /* 20 */ + far_ptr oem_vendor_name_ptr; /* 22 */ + far_ptr oem_product_name_ptr; /* 26 */ + far_ptr oem_product_rev_ptr; /* 30 */ + + u8 reserved[222]; /* 34 */ + u8 oem_data[256]; /* 256 */ +} __attribute__ ((packed)); + +#define VESA_MAGIC ('V' + ('E' << 8) + ('S' << 16) + ('A' << 24)) +#define VBE2_MAGIC ('V' + ('B' << 8) + ('E' << 16) + ('2' << 24)) + +struct vesa_mode_info { + u16 mode_attr; /* 0 */ + u8 win_attr[2]; /* 2 */ + u16 win_grain; /* 4 */ + u16 win_size; /* 6 */ + u16 win_seg[2]; /* 8 */ + far_ptr win_scheme; /* 12 */ + u16 logical_scan; /* 16 */ + + u16 h_res; /* 18 */ + u16 v_res; /* 20 */ + u8 char_width; /* 22 */ + u8 char_height; /* 23 */ + u8 memory_planes; /* 24 */ + u8 bpp; /* 25 */ + u8 banks; /* 26 */ + u8 memory_layout; /* 27 */ + u8 bank_size; /* 28 */ + u8 image_planes; /* 29 */ + u8 page_function; /* 30 */ + + u8 rmask; /* 31 */ + u8 rpos; /* 32 */ + u8 gmask; /* 33 */ + u8 gpos; /* 34 */ + u8 bmask; /* 35 */ + u8 bpos; /* 36 */ + u8 resv_mask; /* 37 */ + u8 resv_pos; /* 38 */ + u8 dcm_info; /* 39 */ + + u32 lfb_ptr; /* 40 Linear frame buffer address */ + u32 offscreen_ptr; /* 44 Offscreen memory address */ + u16 offscreen_size; /* 48 */ + + u8 reserved[206]; /* 50 */ +} __attribute__ ((packed)); + +#endif /* LIB_SYS_VESA_H */ diff --git a/arch/i386/boot/video-bios.c b/arch/i386/boot/video-bios.c new file mode 100644 index 00000000000..afea46c500c --- /dev/null +++ b/arch/i386/boot/video-bios.c @@ -0,0 +1,125 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright 2007 rPath, Inc. - All Rights Reserved + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + * + * ----------------------------------------------------------------------- */ + +/* + * arch/i386/boot/video-bios.c + * + * Standard video BIOS modes + * + * We have two options for this; silent and scanned. + */ + +#include "boot.h" +#include "video.h" + +__videocard video_bios; + +/* Set a conventional BIOS mode */ +static int set_bios_mode(u8 mode); + +static int bios_set_mode(struct mode_info *mi) +{ + return set_bios_mode(mi->mode - VIDEO_FIRST_BIOS); +} + +static int set_bios_mode(u8 mode) +{ + u16 ax; + u8 new_mode; + + ax = mode; /* AH=0x00 Set Video Mode */ + asm volatile(INT10 + : "+a" (ax) + : : "ebx", "ecx", "edx", "esi", "edi"); + + ax = 0x0f00; /* Get Current Video Mode */ + asm volatile(INT10 + : "+a" (ax) + : : "ebx", "ecx", "edx", "esi", "edi"); + + do_restore = 1; /* Assume video contents was lost */ + new_mode = ax & 0x7f; /* Not all BIOSes are clean with the top bit */ + + if (new_mode == mode) + return 0; /* Mode change OK */ + + if (new_mode != boot_params.screen_info.orig_video_mode) { + /* Mode setting failed, but we didn't end up where we + started. That's bad. Try to revert to the original + video mode. */ + ax = boot_params.screen_info.orig_video_mode; + asm volatile(INT10 + : "+a" (ax) + : : "ebx", "ecx", "edx", "esi", "edi"); + } + return -1; +} + +static int bios_probe(void) +{ + u8 mode; + u8 saved_mode = boot_params.screen_info.orig_video_mode; + u16 crtc; + struct mode_info *mi; + int nmodes = 0; + + if (adapter != ADAPTER_EGA && adapter != ADAPTER_VGA) + return 0; + + set_fs(0); + crtc = vga_crtc(); + + video_bios.modes = GET_HEAP(struct mode_info, 0); + + for (mode = 0x14; mode <= 0x7f; mode++) { + if (heap_free() < sizeof(struct mode_info)) + break; + + if (mode_defined(VIDEO_FIRST_BIOS+mode)) + continue; + + if (set_bios_mode(mode)) + continue; + + /* Try to verify that it's a text mode. */ + + /* Attribute Controller: make graphics controller disabled */ + if (in_idx(0x3c0, 0x10) & 0x01) + continue; + + /* Graphics Controller: verify Alpha addressing enabled */ + if (in_idx(0x3ce, 0x06) & 0x01) + continue; + + /* CRTC cursor location low should be zero(?) */ + if (in_idx(crtc, 0x0f)) + continue; + + mi = GET_HEAP(struct mode_info, 1); + mi->mode = VIDEO_FIRST_BIOS+mode; + mi->x = rdfs16(0x44a); + mi->y = rdfs8(0x484)+1; + nmodes++; + } + + set_bios_mode(saved_mode); + + return nmodes; +} + +__videocard video_bios = +{ + .card_name = "BIOS (scanned)", + .probe = bios_probe, + .set_mode = bios_set_mode, + .unsafe = 1, + .xmode_first = VIDEO_FIRST_BIOS, + .xmode_n = 0x80, +}; diff --git a/arch/i386/boot/video-vesa.c b/arch/i386/boot/video-vesa.c new file mode 100644 index 00000000000..e6aa9eb8d93 --- /dev/null +++ b/arch/i386/boot/video-vesa.c @@ -0,0 +1,284 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright 2007 rPath, Inc. - All Rights Reserved + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + * + * ----------------------------------------------------------------------- */ + +/* + * arch/i386/boot/video-vesa.c + * + * VESA text modes + */ + +#include "boot.h" +#include "video.h" +#include "vesa.h" + +/* VESA information */ +static struct vesa_general_info vginfo; +static struct vesa_mode_info vminfo; + +__videocard video_vesa; + +static void vesa_store_mode_params_graphics(void); + +static int vesa_probe(void) +{ +#if defined(CONFIG_VIDEO_VESA) || defined(CONFIG_FIRMWARE_EDID) + u16 ax; + u16 mode; + addr_t mode_ptr; + struct mode_info *mi; + int nmodes = 0; + + video_vesa.modes = GET_HEAP(struct mode_info, 0); + + vginfo.signature = VBE2_MAGIC; + + /* Optimistically assume a VESA BIOS is register-clean... */ + ax = 0x4f00; + asm("int $0x10" : "+a" (ax), "=m" (vginfo) : "D" (&vginfo)); + + if (ax != 0x004f || + vginfo.signature != VESA_MAGIC || + vginfo.version < 0x0102) + return 0; /* Not present */ +#endif /* CONFIG_VIDEO_VESA || CONFIG_FIRMWARE_EDID */ +#ifdef CONFIG_VIDEO_VESA + set_fs(vginfo.video_mode_ptr.seg); + mode_ptr = vginfo.video_mode_ptr.off; + + while ((mode = rdfs16(mode_ptr)) != 0xffff) { + mode_ptr += 2; + + if (heap_free() < sizeof(struct mode_info)) + break; /* Heap full, can't save mode info */ + + if (mode & ~0x1ff) + continue; + + memset(&vminfo, 0, sizeof vminfo); /* Just in case... */ + + ax = 0x4f01; + asm("int $0x10" + : "+a" (ax), "=m" (vminfo) + : "c" (mode), "D" (&vminfo)); + + if (ax != 0x004f) + continue; + + if ((vminfo.mode_attr & 0x15) == 0x05) { + /* Text Mode, TTY BIOS supported, + supported by hardware */ + mi = GET_HEAP(struct mode_info, 1); + mi->mode = mode + VIDEO_FIRST_VESA; + mi->x = vminfo.h_res; + mi->y = vminfo.v_res; + nmodes++; + } else if ((vminfo.mode_attr & 0x99) == 0x99) { +#ifdef CONFIG_FB + /* Graphics mode, color, linear frame buffer + supported -- register the mode but hide from + the menu. Only do this if framebuffer is + configured, however, otherwise the user will + be left without a screen. */ + mi = GET_HEAP(struct mode_info, 1); + mi->mode = mode + VIDEO_FIRST_VESA; + mi->x = mi->y = 0; + nmodes++; +#endif + } + } + + return nmodes; +#else + return 0; +#endif /* CONFIG_VIDEO_VESA */ +} + +static int vesa_set_mode(struct mode_info *mode) +{ + u16 ax; + int is_graphic; + u16 vesa_mode = mode->mode - VIDEO_FIRST_VESA; + + memset(&vminfo, 0, sizeof vminfo); /* Just in case... */ + + ax = 0x4f01; + asm("int $0x10" + : "+a" (ax), "=m" (vminfo) + : "c" (vesa_mode), "D" (&vminfo)); + + if (ax != 0x004f) + return -1; + + if ((vminfo.mode_attr & 0x15) == 0x05) { + /* It's a supported text mode */ + is_graphic = 0; + } else if ((vminfo.mode_attr & 0x99) == 0x99) { + /* It's a graphics mode with linear frame buffer */ + is_graphic = 1; + vesa_mode |= 0x4000; /* Request linear frame buffer */ + } else { + return -1; /* Invalid mode */ + } + + + ax = 0x4f02; + asm volatile("int $0x10" + : "+a" (ax) + : "b" (vesa_mode), "D" (0)); + + if (ax != 0x004f) + return -1; + + graphic_mode = is_graphic; + if (!is_graphic) { + /* Text mode */ + force_x = mode->x; + force_y = mode->y; + do_restore = 1; + } else { + /* Graphics mode */ + vesa_store_mode_params_graphics(); + } + + return 0; +} + + +/* Switch DAC to 8-bit mode */ +static void vesa_dac_set_8bits(void) +{ + u8 dac_size = 6; + + /* If possible, switch the DAC to 8-bit mode */ + if (vginfo.capabilities & 1) { + u16 ax, bx; + + ax = 0x4f08; + bx = 0x0800; + asm volatile(INT10 + : "+a" (ax), "+b" (bx) + : : "ecx", "edx", "esi", "edi"); + + if (ax == 0x004f) + dac_size = bx >> 8; + } + + /* Set the color sizes to the DAC size, and offsets to 0 */ + boot_params.screen_info.red_size = dac_size; + boot_params.screen_info.green_size = dac_size; + boot_params.screen_info.blue_size = dac_size; + boot_params.screen_info.rsvd_size = dac_size; + + boot_params.screen_info.red_pos = 0; + boot_params.screen_info.green_pos = 0; + boot_params.screen_info.blue_pos = 0; + boot_params.screen_info.rsvd_pos = 0; +} + +/* Save the VESA protected mode info */ +static void vesa_store_pm_info(void) +{ + u16 ax, bx, di, es; + + ax = 0x4f0a; + bx = di = 0; + asm("pushw %%es; "INT10"; movw %%es,%0; popw %%es" + : "=d" (es), "+a" (ax), "+b" (bx), "+D" (di) + : : "ecx", "esi"); + + if (ax != 0x004f) + return; + + boot_params.screen_info.vesapm_seg = es; + boot_params.screen_info.vesapm_off = di; +} + +/* + * Save video mode parameters for graphics mode + */ +static void vesa_store_mode_params_graphics(void) +{ + /* Tell the kernel we're in VESA graphics mode */ + boot_params.screen_info.orig_video_isVGA = 0x23; + + /* Mode parameters */ + boot_params.screen_info.vesa_attributes = vminfo.mode_attr; + boot_params.screen_info.lfb_linelength = vminfo.logical_scan; + boot_params.screen_info.lfb_width = vminfo.h_res; + boot_params.screen_info.lfb_height = vminfo.v_res; + boot_params.screen_info.lfb_depth = vminfo.bpp; + boot_params.screen_info.pages = vminfo.image_planes; + boot_params.screen_info.lfb_base = vminfo.lfb_ptr; + memcpy(&boot_params.screen_info.red_size, + &vminfo.rmask, 8); + + /* General parameters */ + boot_params.screen_info.lfb_size = vginfo.total_memory; + + if (vminfo.bpp <= 8) + vesa_dac_set_8bits(); + + vesa_store_pm_info(); +} + +/* + * Save EDID information for the kernel; this is invoked, separately, + * after mode-setting. + */ +void vesa_store_edid(void) +{ +#ifdef CONFIG_FIRMWARE_EDID + u16 ax, bx, cx, dx, di; + + /* Apparently used as a nonsense token... */ + memset(&boot_params.edid_info, 0x13, sizeof boot_params.edid_info); + + if (vginfo.version < 0x0200) + return; /* EDID requires VBE 2.0+ */ + + ax = 0x4f15; /* VBE DDC */ + bx = 0x0000; /* Report DDC capabilities */ + cx = 0; /* Controller 0 */ + di = 0; /* ES:DI must be 0 by spec */ + + /* Note: The VBE DDC spec is different from the main VESA spec; + we genuinely have to assume all registers are destroyed here. */ + + asm("pushw %%es; movw %2,%%es; "INT10"; popw %%es" + : "+a" (ax), "+b" (bx) + : "c" (cx), "D" (di) + : "esi"); + + if (ax != 0x004f) + return; /* No EDID */ + + /* BH = time in seconds to transfer EDD information */ + /* BL = DDC level supported */ + + ax = 0x4f15; /* VBE DDC */ + bx = 0x0001; /* Read EDID */ + cx = 0; /* Controller 0 */ + dx = 0; /* EDID block number */ + di =(size_t) &boot_params.edid_info; /* (ES:)Pointer to block */ + asm(INT10 + : "+a" (ax), "+b" (bx), "+d" (dx) + : "c" (cx), "D" (di) + : "esi"); +#endif /* CONFIG_FIRMWARE_EDID */ +} + +__videocard video_vesa = +{ + .card_name = "VESA", + .probe = vesa_probe, + .set_mode = vesa_set_mode, + .xmode_first = VIDEO_FIRST_VESA, + .xmode_n = 0x200, +}; diff --git a/arch/i386/boot/video-vga.c b/arch/i386/boot/video-vga.c new file mode 100644 index 00000000000..700d09a9c9b --- /dev/null +++ b/arch/i386/boot/video-vga.c @@ -0,0 +1,260 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright 2007 rPath, Inc. - All Rights Reserved + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + * + * ----------------------------------------------------------------------- */ + +/* + * arch/i386/boot/video-vga.c + * + * Common all-VGA modes + */ + +#include "boot.h" +#include "video.h" + +static struct mode_info vga_modes[] = { + { VIDEO_80x25, 80, 25 }, + { VIDEO_8POINT, 80, 50 }, + { VIDEO_80x43, 80, 43 }, + { VIDEO_80x28, 80, 28 }, + { VIDEO_80x30, 80, 30 }, + { VIDEO_80x34, 80, 34 }, + { VIDEO_80x60, 80, 60 }, +}; + +static struct mode_info ega_modes[] = { + { VIDEO_80x25, 80, 25 }, + { VIDEO_8POINT, 80, 43 }, +}; + +static struct mode_info cga_modes[] = { + { VIDEO_80x25, 80, 25 }, +}; + +__videocard video_vga; + +/* Set basic 80x25 mode */ +static u8 vga_set_basic_mode(void) +{ + u16 ax; + u8 rows; + u8 mode; + +#ifdef CONFIG_VIDEO_400_HACK + if (adapter >= ADAPTER_VGA) { + asm(INT10 + : : "a" (0x1202), "b" (0x0030) + : "ecx", "edx", "esi", "edi"); + } +#endif + + ax = 0x0f00; + asm(INT10 + : "+a" (ax) + : : "ebx", "ecx", "edx", "esi", "edi"); + + mode = (u8)ax; + + set_fs(0); + rows = rdfs8(0x484); /* rows minus one */ + +#ifndef CONFIG_VIDEO_400_HACK + if ((ax == 0x5003 || ax == 0x5007) && + (rows == 0 || rows == 24)) + return mode; +#endif + + if (mode != 3 && mode != 7) + mode = 3; + + /* Set the mode */ + asm volatile(INT10 + : : "a" (mode) + : "ebx", "ecx", "edx", "esi", "edi"); + do_restore = 1; + return mode; +} + +static void vga_set_8font(void) +{ + /* Set 8x8 font - 80x43 on EGA, 80x50 on VGA */ + + /* Set 8x8 font */ + asm volatile(INT10 : : "a" (0x1112), "b" (0)); + + /* Use alternate print screen */ + asm volatile(INT10 : : "a" (0x1200), "b" (0x20)); + + /* Turn off cursor emulation */ + asm volatile(INT10 : : "a" (0x1201), "b" (0x34)); + + /* Cursor is scan lines 6-7 */ + asm volatile(INT10 : : "a" (0x0100), "c" (0x0607)); +} + +static void vga_set_14font(void) +{ + /* Set 9x14 font - 80x28 on VGA */ + + /* Set 9x14 font */ + asm volatile(INT10 : : "a" (0x1111), "b" (0)); + + /* Turn off cursor emulation */ + asm volatile(INT10 : : "a" (0x1201), "b" (0x34)); + + /* Cursor is scan lines 11-12 */ + asm volatile(INT10 : : "a" (0x0100), "c" (0x0b0c)); +} + +static void vga_set_80x43(void) +{ + /* Set 80x43 mode on VGA (not EGA) */ + + /* Set 350 scans */ + asm volatile(INT10 : : "a" (0x1201), "b" (0x30)); + + /* Reset video mode */ + asm volatile(INT10 : : "a" (0x0003)); + + vga_set_8font(); +} + +/* I/O address of the VGA CRTC */ +u16 vga_crtc(void) +{ + return (inb(0x3cc) & 1) ? 0x3d4 : 0x3b4; +} + +static void vga_set_480_scanlines(int end) +{ + u16 crtc; + u8 csel; + + crtc = vga_crtc(); + + out_idx(0x0c, crtc, 0x11); /* Vertical sync end, unlock CR0-7 */ + out_idx(0x0b, crtc, 0x06); /* Vertical total */ + out_idx(0x3e, crtc, 0x07); /* Vertical overflow */ + out_idx(0xea, crtc, 0x10); /* Vertical sync start */ + out_idx(end, crtc, 0x12); /* Vertical display end */ + out_idx(0xe7, crtc, 0x15); /* Vertical blank start */ + out_idx(0x04, crtc, 0x16); /* Vertical blank end */ + csel = inb(0x3cc); + csel &= 0x0d; + csel |= 0xe2; + outb(csel, 0x3cc); +} + +static void vga_set_80x30(void) +{ + vga_set_480_scanlines(0xdf); +} + +static void vga_set_80x34(void) +{ + vga_set_14font(); + vga_set_480_scanlines(0xdb); +} + +static void vga_set_80x60(void) +{ + vga_set_8font(); + vga_set_480_scanlines(0xdf); +} + +static int vga_set_mode(struct mode_info *mode) +{ + /* Set the basic mode */ + vga_set_basic_mode(); + + /* Override a possibly broken BIOS */ + force_x = mode->x; + force_y = mode->y; + + switch (mode->mode) { + case VIDEO_80x25: + break; + case VIDEO_8POINT: + vga_set_8font(); + break; + case VIDEO_80x43: + vga_set_80x43(); + break; + case VIDEO_80x28: + vga_set_14font(); + break; + case VIDEO_80x30: + vga_set_80x30(); + break; + case VIDEO_80x34: + vga_set_80x34(); + break; + case VIDEO_80x60: + vga_set_80x60(); + break; + } + + return 0; +} + +/* + * Note: this probe includes basic information required by all + * systems. It should be executed first, by making sure + * video-vga.c is listed first in the Makefile. + */ +static int vga_probe(void) +{ + static const char *card_name[] = { + "CGA/MDA/HGC", "EGA", "VGA" + }; + static struct mode_info *mode_lists[] = { + cga_modes, + ega_modes, + vga_modes, + }; + static int mode_count[] = { + sizeof(cga_modes)/sizeof(struct mode_info), + sizeof(ega_modes)/sizeof(struct mode_info), + sizeof(vga_modes)/sizeof(struct mode_info), + }; + u8 vga_flag; + + asm(INT10 + : "=b" (boot_params.screen_info.orig_video_ega_bx) + : "a" (0x1200), "b" (0x10) /* Check EGA/VGA */ + : "ecx", "edx", "esi", "edi"); + + /* If we have MDA/CGA/HGC then BL will be unchanged at 0x10 */ + if ((u8)boot_params.screen_info.orig_video_ega_bx != 0x10) { + /* EGA/VGA */ + asm(INT10 + : "=a" (vga_flag) + : "a" (0x1a00) + : "ebx", "ecx", "edx", "esi", "edi"); + + if (vga_flag == 0x1a) { + adapter = ADAPTER_VGA; + boot_params.screen_info.orig_video_isVGA = 1; + } else { + adapter = ADAPTER_EGA; + } + } else { + adapter = ADAPTER_CGA; + } + + video_vga.modes = mode_lists[adapter]; + video_vga.card_name = card_name[adapter]; + return mode_count[adapter]; +} + +__videocard video_vga = +{ + .card_name = "VGA", + .probe = vga_probe, + .set_mode = vga_set_mode, +}; diff --git a/arch/i386/boot/video.S b/arch/i386/boot/video.S deleted file mode 100644 index 8143c9516cb..00000000000 --- a/arch/i386/boot/video.S +++ /dev/null @@ -1,2043 +0,0 @@ -/* video.S - * - * Display adapter & video mode setup, version 2.13 (14-May-99) - * - * Copyright (C) 1995 -- 1998 Martin Mares <mj@ucw.cz> - * Based on the original setup.S code (C) Linus Torvalds and Mats Anderson - * - * Rewritten to use GNU 'as' by Chris Noe <stiker@northlink.com> May 1999 - * - * For further information, look at Documentation/svga.txt. - * - */ - -/* Enable autodetection of SVGA adapters and modes. */ -#undef CONFIG_VIDEO_SVGA - -/* Enable autodetection of VESA modes */ -#define CONFIG_VIDEO_VESA - -/* Enable compacting of mode table */ -#define CONFIG_VIDEO_COMPACT - -/* Retain screen contents when switching modes */ -#define CONFIG_VIDEO_RETAIN - -/* Enable local mode list */ -#undef CONFIG_VIDEO_LOCAL - -/* Force 400 scan lines for standard modes (hack to fix bad BIOS behaviour */ -#undef CONFIG_VIDEO_400_HACK - -/* Hack that lets you force specific BIOS mode ID and specific dimensions */ -#undef CONFIG_VIDEO_GFX_HACK -#define VIDEO_GFX_BIOS_AX 0x4f02 /* 800x600 on ThinkPad */ -#define VIDEO_GFX_BIOS_BX 0x0102 -#define VIDEO_GFX_DUMMY_RESOLUTION 0x6425 /* 100x37 */ - -/* This code uses an extended set of video mode numbers. These include: - * Aliases for standard modes - * NORMAL_VGA (-1) - * EXTENDED_VGA (-2) - * ASK_VGA (-3) - * Video modes numbered by menu position -- NOT RECOMMENDED because of lack - * of compatibility when extending the table. These are between 0x00 and 0xff. - */ -#define VIDEO_FIRST_MENU 0x0000 - -/* Standard BIOS video modes (BIOS number + 0x0100) */ -#define VIDEO_FIRST_BIOS 0x0100 - -/* VESA BIOS video modes (VESA number + 0x0200) */ -#define VIDEO_FIRST_VESA 0x0200 - -/* Video7 special modes (BIOS number + 0x0900) */ -#define VIDEO_FIRST_V7 0x0900 - -/* Special video modes */ -#define VIDEO_FIRST_SPECIAL 0x0f00 -#define VIDEO_80x25 0x0f00 -#define VIDEO_8POINT 0x0f01 -#define VIDEO_80x43 0x0f02 -#define VIDEO_80x28 0x0f03 -#define VIDEO_CURRENT_MODE 0x0f04 -#define VIDEO_80x30 0x0f05 -#define VIDEO_80x34 0x0f06 -#define VIDEO_80x60 0x0f07 -#define VIDEO_GFX_HACK 0x0f08 -#define VIDEO_LAST_SPECIAL 0x0f09 - -/* Video modes given by resolution */ -#define VIDEO_FIRST_RESOLUTION 0x1000 - -/* The "recalculate timings" flag */ -#define VIDEO_RECALC 0x8000 - -/* Positions of various video parameters passed to the kernel */ -/* (see also include/linux/tty.h) */ -#define PARAM_CURSOR_POS 0x00 -#define PARAM_VIDEO_PAGE 0x04 -#define PARAM_VIDEO_MODE 0x06 -#define PARAM_VIDEO_COLS 0x07 -#define PARAM_VIDEO_EGA_BX 0x0a -#define PARAM_VIDEO_LINES 0x0e -#define PARAM_HAVE_VGA 0x0f -#define PARAM_FONT_POINTS 0x10 - -#define PARAM_LFB_WIDTH 0x12 -#define PARAM_LFB_HEIGHT 0x14 -#define PARAM_LFB_DEPTH 0x16 -#define PARAM_LFB_BASE 0x18 -#define PARAM_LFB_SIZE 0x1c -#define PARAM_LFB_LINELENGTH 0x24 -#define PARAM_LFB_COLORS 0x26 -#define PARAM_VESAPM_SEG 0x2e -#define PARAM_VESAPM_OFF 0x30 -#define PARAM_LFB_PAGES 0x32 -#define PARAM_VESA_ATTRIB 0x34 -#define PARAM_CAPABILITIES 0x36 - -/* Define DO_STORE according to CONFIG_VIDEO_RETAIN */ -#ifdef CONFIG_VIDEO_RETAIN -#define DO_STORE call store_screen -#else -#define DO_STORE -#endif /* CONFIG_VIDEO_RETAIN */ - -# This is the main entry point called by setup.S -# %ds *must* be pointing to the bootsector -video: pushw %ds # We use different segments - pushw %ds # FS contains original DS - popw %fs - pushw %cs # DS is equal to CS - popw %ds - pushw %cs # ES is equal to CS - popw %es - xorw %ax, %ax - movw %ax, %gs # GS is zero - cld - call basic_detect # Basic adapter type testing (EGA/VGA/MDA/CGA) -#ifdef CONFIG_VIDEO_SELECT - movw %fs:(0x01fa), %ax # User selected video mode - cmpw $ASK_VGA, %ax # Bring up the menu - jz vid2 - - call mode_set # Set the mode - jc vid1 - - leaw badmdt, %si # Invalid mode ID - call prtstr -vid2: call mode_menu -vid1: -#ifdef CONFIG_VIDEO_RETAIN - call restore_screen # Restore screen contents -#endif /* CONFIG_VIDEO_RETAIN */ - call store_edid -#endif /* CONFIG_VIDEO_SELECT */ - call mode_params # Store mode parameters - popw %ds # Restore original DS - ret - -# Detect if we have CGA, MDA, EGA or VGA and pass it to the kernel. -basic_detect: - movb $0, %fs:(PARAM_HAVE_VGA) - movb $0x12, %ah # Check EGA/VGA - movb $0x10, %bl - int $0x10 - movw %bx, %fs:(PARAM_VIDEO_EGA_BX) # Identifies EGA to the kernel - cmpb $0x10, %bl # No, it's a CGA/MDA/HGA card. - je basret - - incb adapter - movw $0x1a00, %ax # Check EGA or VGA? - int $0x10 - cmpb $0x1a, %al # 1a means VGA... - jne basret # anything else is EGA. - - incb %fs:(PARAM_HAVE_VGA) # We've detected a VGA - incb adapter -basret: ret - -# Store the video mode parameters for later usage by the kernel. -# This is done by asking the BIOS except for the rows/columns -# parameters in the default 80x25 mode -- these are set directly, -# because some very obscure BIOSes supply insane values. -mode_params: -#ifdef CONFIG_VIDEO_SELECT - cmpb $0, graphic_mode - jnz mopar_gr -#endif - movb $0x03, %ah # Read cursor position - xorb %bh, %bh - int $0x10 - movw %dx, %fs:(PARAM_CURSOR_POS) - movb $0x0f, %ah # Read page/mode/width - int $0x10 - movw %bx, %fs:(PARAM_VIDEO_PAGE) - movw %ax, %fs:(PARAM_VIDEO_MODE) # Video mode and screen width - cmpb $0x7, %al # MDA/HGA => segment differs - jnz mopar0 - - movw $0xb000, video_segment -mopar0: movw %gs:(0x485), %ax # Font size - movw %ax, %fs:(PARAM_FONT_POINTS) # (valid only on EGA/VGA) - movw force_size, %ax # Forced size? - orw %ax, %ax - jz mopar1 - - movb %ah, %fs:(PARAM_VIDEO_COLS) - movb %al, %fs:(PARAM_VIDEO_LINES) - ret - -mopar1: movb $25, %al - cmpb $0, adapter # If we are on CGA/MDA/HGA, the - jz mopar2 # screen must have 25 lines. - - movb %gs:(0x484), %al # On EGA/VGA, use the EGA+ BIOS - incb %al # location of max lines. -mopar2: movb %al, %fs:(PARAM_VIDEO_LINES) - ret - -#ifdef CONFIG_VIDEO_SELECT -# Fetching of VESA frame buffer parameters -mopar_gr: - leaw modelist+1024, %di - movb $0x23, %fs:(PARAM_HAVE_VGA) - movw 16(%di), %ax - movw %ax, %fs:(PARAM_LFB_LINELENGTH) - movw 18(%di), %ax - movw %ax, %fs:(PARAM_LFB_WIDTH) - movw 20(%di), %ax - movw %ax, %fs:(PARAM_LFB_HEIGHT) - movb 25(%di), %al - movb $0, %ah - movw %ax, %fs:(PARAM_LFB_DEPTH) - movb 29(%di), %al - movb $0, %ah - movw %ax, %fs:(PARAM_LFB_PAGES) - movl 40(%di), %eax - movl %eax, %fs:(PARAM_LFB_BASE) - movl 31(%di), %eax - movl %eax, %fs:(PARAM_LFB_COLORS) - movl 35(%di), %eax - movl %eax, %fs:(PARAM_LFB_COLORS+4) - movw 0(%di), %ax - movw %ax, %fs:(PARAM_VESA_ATTRIB) - -# get video mem size - leaw modelist+1024, %di - movw $0x4f00, %ax - int $0x10 - xorl %eax, %eax - movw 18(%di), %ax - movl %eax, %fs:(PARAM_LFB_SIZE) - -# store mode capabilities - movl 10(%di), %eax - movl %eax, %fs:(PARAM_CAPABILITIES) - -# switching the DAC to 8-bit is for <= 8 bpp only - movw %fs:(PARAM_LFB_DEPTH), %ax - cmpw $8, %ax - jg dac_done - -# get DAC switching capability - xorl %eax, %eax - movb 10(%di), %al - testb $1, %al - jz dac_set - -# attempt to switch DAC to 8-bit - movw $0x4f08, %ax - movw $0x0800, %bx - int $0x10 - cmpw $0x004f, %ax - jne dac_set - movb %bh, dac_size # store actual DAC size - -dac_set: -# set color size to DAC size - movb dac_size, %al - movb %al, %fs:(PARAM_LFB_COLORS+0) - movb %al, %fs:(PARAM_LFB_COLORS+2) - movb %al, %fs:(PARAM_LFB_COLORS+4) - movb %al, %fs:(PARAM_LFB_COLORS+6) - -# set color offsets to 0 - movb $0, %fs:(PARAM_LFB_COLORS+1) - movb $0, %fs:(PARAM_LFB_COLORS+3) - movb $0, %fs:(PARAM_LFB_COLORS+5) - movb $0, %fs:(PARAM_LFB_COLORS+7) - -dac_done: -# get protected mode interface informations - movw $0x4f0a, %ax - xorw %bx, %bx - xorw %di, %di - int $0x10 - cmp $0x004f, %ax - jnz no_pm - - movw %es, %fs:(PARAM_VESAPM_SEG) - movw %di, %fs:(PARAM_VESAPM_OFF) -no_pm: ret - -# The video mode menu -mode_menu: - leaw keymsg, %si # "Return/Space/Timeout" message - call prtstr - call flush -nokey: call getkt - - cmpb $0x0d, %al # ENTER ? - je listm # yes - manual mode selection - - cmpb $0x20, %al # SPACE ? - je defmd1 # no - repeat - - call beep - jmp nokey - -defmd1: ret # No mode chosen? Default 80x25 - -listm: call mode_table # List mode table -listm0: leaw name_bann, %si # Print adapter name - call prtstr - movw card_name, %si - orw %si, %si - jnz an2 - - movb adapter, %al - leaw old_name, %si - orb %al, %al - jz an1 - - leaw ega_name, %si - decb %al - jz an1 - - leaw vga_name, %si - jmp an1 - -an2: call prtstr - leaw svga_name, %si -an1: call prtstr - leaw listhdr, %si # Table header - call prtstr - movb $0x30, %dl # DL holds mode number - leaw modelist, %si -lm1: cmpw $ASK_VGA, (%si) # End? - jz lm2 - - movb %dl, %al # Menu selection number - call prtchr - call prtsp2 - lodsw - call prthw # Mode ID - call prtsp2 - movb 0x1(%si), %al - call prtdec # Rows - movb $0x78, %al # the letter 'x' - call prtchr - lodsw - call prtdec # Columns - movb $0x0d, %al # New line - call prtchr - movb $0x0a, %al - call prtchr - incb %dl # Next character - cmpb $0x3a, %dl - jnz lm1 - - movb $0x61, %dl - jmp lm1 - -lm2: leaw prompt, %si # Mode prompt - call prtstr - leaw edit_buf, %di # Editor buffer -lm3: call getkey - cmpb $0x0d, %al # Enter? - jz lment - - cmpb $0x08, %al # Backspace? - jz lmbs - - cmpb $0x20, %al # Printable? - jc lm3 - - cmpw $edit_buf+4, %di # Enough space? - jz lm3 - - stosb - call prtchr - jmp lm3 - -lmbs: cmpw $edit_buf, %di # Backspace - jz lm3 - - decw %di - movb $0x08, %al - call prtchr - call prtspc - movb $0x08, %al - call prtchr - jmp lm3 - -lment: movb $0, (%di) - leaw crlft, %si - call prtstr - leaw edit_buf, %si - cmpb $0, (%si) # Empty string = default mode - jz lmdef - - cmpb $0, 1(%si) # One character = menu selection - jz mnusel - - cmpw $0x6373, (%si) # "scan" => mode scanning - jnz lmhx - - cmpw $0x6e61, 2(%si) - jz lmscan - -lmhx: xorw %bx, %bx # Else => mode ID in hex -lmhex: lodsb - orb %al, %al - jz lmuse1 - - subb $0x30, %al - jc lmbad - - cmpb $10, %al - jc lmhx1 - - subb $7, %al - andb $0xdf, %al - cmpb $10, %al - jc lmbad - - cmpb $16, %al - jnc lmbad - -lmhx1: shlw $4, %bx - orb %al, %bl - jmp lmhex - -lmuse1: movw %bx, %ax - jmp lmuse - -mnusel: lodsb # Menu selection - xorb %ah, %ah - subb $0x30, %al - jc lmbad - - cmpb $10, %al - jc lmuse - - cmpb $0x61-0x30, %al - jc lmbad - - subb $0x61-0x30-10, %al - cmpb $36, %al - jnc lmbad - -lmuse: call mode_set - jc lmdef - -lmbad: leaw unknt, %si - call prtstr - jmp lm2 -lmscan: cmpb $0, adapter # Scanning only on EGA/VGA - jz lmbad - - movw $0, mt_end # Scanning of modes is - movb $1, scanning # done as new autodetection. - call mode_table - jmp listm0 -lmdef: ret - -# Additional parts of mode_set... (relative jumps, you know) -setv7: # Video7 extended modes - DO_STORE - subb $VIDEO_FIRST_V7>>8, %bh - movw $0x6f05, %ax - int $0x10 - stc - ret - -_setrec: jmp setrec # Ugly... -_set_80x25: jmp set_80x25 - -# Aliases for backward compatibility. -setalias: - movw $VIDEO_80x25, %ax - incw %bx - jz mode_set - - movb $VIDEO_8POINT-VIDEO_FIRST_SPECIAL, %al - incw %bx - jnz setbad # Fall-through! - -# Setting of user mode (AX=mode ID) => CF=success -mode_set: - movw %ax, %fs:(0x01fa) # Store mode for use in acpi_wakeup.S - movw %ax, %bx - cmpb $0xff, %ah - jz setalias - - testb $VIDEO_RECALC>>8, %ah - jnz _setrec - - cmpb $VIDEO_FIRST_RESOLUTION>>8, %ah - jnc setres - - cmpb $VIDEO_FIRST_SPECIAL>>8, %ah - jz setspc - - cmpb $VIDEO_FIRST_V7>>8, %ah - jz setv7 - - cmpb $VIDEO_FIRST_VESA>>8, %ah - jnc check_vesa - - orb %ah, %ah - jz setmenu - - decb %ah - jz setbios - -setbad: clc - movb $0, do_restore # The screen needn't be restored - ret - -setvesa: - DO_STORE - subb $VIDEO_FIRST_VESA>>8, %bh - movw $0x4f02, %ax # VESA BIOS mode set call - int $0x10 - cmpw $0x004f, %ax # AL=4f if implemented - jnz setbad # AH=0 if OK - - stc - ret - -setbios: - DO_STORE - int $0x10 # Standard BIOS mode set call - pushw %bx - movb $0x0f, %ah # Check if really set - int $0x10 - popw %bx - cmpb %bl, %al - jnz setbad - - stc - ret - -setspc: xorb %bh, %bh # Set special mode - cmpb $VIDEO_LAST_SPECIAL-VIDEO_FIRST_SPECIAL, %bl - jnc setbad - - addw %bx, %bx - jmp *spec_inits(%bx) - -setmenu: - orb %al, %al # 80x25 is an exception - jz _set_80x25 - - pushw %bx # Set mode chosen from menu - call mode_table # Build the mode table - popw %ax - shlw $2, %ax - addw %ax, %si - cmpw %di, %si - jnc setbad - - movw (%si), %ax # Fetch mode ID -_m_s: jmp mode_set - -setres: pushw %bx # Set mode chosen by resolution - call mode_table - popw %bx - xchgb %bl, %bh -setr1: lodsw - cmpw $ASK_VGA, %ax # End of the list? - jz setbad - - lodsw - cmpw %bx, %ax - jnz setr1 - - movw -4(%si), %ax # Fetch mode ID - jmp _m_s - -check_vesa: -#ifdef CONFIG_FIRMWARE_EDID - leaw modelist+1024, %di - movw $0x4f00, %ax - int $0x10 - cmpw $0x004f, %ax - jnz setbad - - movw 4(%di), %ax - movw %ax, vbe_version -#endif - leaw modelist+1024, %di - subb $VIDEO_FIRST_VESA>>8, %bh - movw %bx, %cx # Get mode information structure - movw $0x4f01, %ax - int $0x10 - addb $VIDEO_FIRST_VESA>>8, %bh - cmpw $0x004f, %ax - jnz setbad - - movb (%di), %al # Check capabilities. - andb $0x19, %al - cmpb $0x09, %al - jz setvesa # This is a text mode - - movb (%di), %al # Check capabilities. - andb $0x99, %al - cmpb $0x99, %al - jnz _setbad # Doh! No linear frame buffer. - - subb $VIDEO_FIRST_VESA>>8, %bh - orw $0x4000, %bx # Use linear frame buffer - movw $0x4f02, %ax # VESA BIOS mode set call - int $0x10 - cmpw $0x004f, %ax # AL=4f if implemented - jnz _setbad # AH=0 if OK - - movb $1, graphic_mode # flag graphic mode - movb $0, do_restore # no screen restore - stc - ret - -_setbad: jmp setbad # Ugly... - -# Recalculate vertical display end registers -- this fixes various -# inconsistencies of extended modes on many adapters. Called when -# the VIDEO_RECALC flag is set in the mode ID. - -setrec: subb $VIDEO_RECALC>>8, %ah # Set the base mode - call mode_set - jnc rct3 - - movw %gs:(0x485), %ax # Font size in pixels - movb %gs:(0x484), %bl # Number of rows - incb %bl - mulb %bl # Number of visible - decw %ax # scan lines - 1 - movw $0x3d4, %dx - movw %ax, %bx - movb $0x12, %al # Lower 8 bits - movb %bl, %ah - outw %ax, %dx - movb $0x07, %al # Bits 8 and 9 in the overflow register - call inidx - xchgb %al, %ah - andb $0xbd, %ah - shrb %bh - jnc rct1 - orb $0x02, %ah -rct1: shrb %bh - jnc rct2 - orb $0x40, %ah -rct2: movb $0x07, %al - outw %ax, %dx - stc -rct3: ret - -# Table of routines for setting of the special modes. -spec_inits: - .word set_80x25 - .word set_8pixel - .word set_80x43 - .word set_80x28 - .word set_current - .word set_80x30 - .word set_80x34 - .word set_80x60 - .word set_gfx - -# Set the 80x25 mode. If already set, do nothing. -set_80x25: - movw $0x5019, force_size # Override possibly broken BIOS -use_80x25: -#ifdef CONFIG_VIDEO_400_HACK - movw $0x1202, %ax # Force 400 scan lines - movb $0x30, %bl - int $0x10 -#else - movb $0x0f, %ah # Get current mode ID - int $0x10 - cmpw $0x5007, %ax # Mode 7 (80x25 mono) is the only one available - jz st80 # on CGA/MDA/HGA and is also available on EGAM - - cmpw $0x5003, %ax # Unknown mode, force 80x25 color - jnz force3 - -st80: cmpb $0, adapter # CGA/MDA/HGA => mode 3/7 is always 80x25 - jz set80 - - movb %gs:(0x0484), %al # This is EGA+ -- beware of 80x50 etc. - orb %al, %al # Some buggy BIOS'es set 0 rows - jz set80 - - cmpb $24, %al # It's hopefully correct - jz set80 -#endif /* CONFIG_VIDEO_400_HACK */ -force3: DO_STORE - movw $0x0003, %ax # Forced set - int $0x10 -set80: stc - ret - -# Set the 80x50/80x43 8-pixel mode. Simple BIOS calls. -set_8pixel: - DO_STORE - call use_80x25 # The base is 80x25 -set_8pt: - movw $0x1112, %ax # Use 8x8 font - xorb %bl, %bl - int $0x10 - movw $0x1200, %ax # Use alternate print screen - movb $0x20, %bl - int $0x10 - movw $0x1201, %ax # Turn off cursor emulation - movb $0x34, %bl - int $0x10 - movb $0x01, %ah # Define cursor scan lines 6-7 - movw $0x0607, %cx - int $0x10 -set_current: - stc - ret - -# Set the 80x28 mode. This mode works on all VGA's, because it's a standard -# 80x25 mode with 14-point fonts instead of 16-point. -set_80x28: - DO_STORE - call use_80x25 # The base is 80x25 -set14: movw $0x1111, %ax # Use 9x14 font - xorb %bl, %bl - int $0x10 - movb $0x01, %ah # Define cursor scan lines 11-12 - movw $0x0b0c, %cx - int $0x10 - stc - ret - -# Set the 80x43 mode. This mode is works on all VGA's. -# It's a 350-scanline mode with 8-pixel font. -set_80x43: - DO_STORE - movw $0x1201, %ax # Set 350 scans - movb $0x30, %bl - int $0x10 - movw $0x0003, %ax # Reset video mode - int $0x10 - jmp set_8pt # Use 8-pixel font - -# Set the 80x30 mode (all VGA's). 480 scanlines, 16-pixel font. -set_80x30: - call use_80x25 # Start with real 80x25 - DO_STORE - movw $0x3cc, %dx # Get CRTC port - inb %dx, %al - movb $0xd4, %dl - rorb %al # Mono or color? - jc set48a - - movb $0xb4, %dl -set48a: movw $0x0c11, %ax # Vertical sync end (also unlocks CR0-7) - call outidx - movw $0x0b06, %ax # Vertical total - call outidx - movw $0x3e07, %ax # (Vertical) overflow - call outidx - movw $0xea10, %ax # Vertical sync start - call outidx - movw $0xdf12, %ax # Vertical display end - call outidx - movw $0xe715, %ax # Vertical blank start - call outidx - movw $0x0416, %ax # Vertical blank end - call outidx - pushw %dx - movb $0xcc, %dl # Misc output register (read) - inb %dx, %al - movb $0xc2, %dl # (write) - andb $0x0d, %al # Preserve clock select bits and color bit - orb $0xe2, %al # Set correct sync polarity - outb %al, %dx - popw %dx - movw $0x501e, force_size - stc # That's all. - ret - -# Set the 80x34 mode (all VGA's). 480 scans, 14-pixel font. -set_80x34: - call set_80x30 # Set 480 scans - call set14 # And 14-pt font - movw $0xdb12, %ax # VGA vertical display end - movw $0x5022, force_size -setvde: call outidx - stc - ret - -# Set the 80x60 mode (all VGA's). 480 scans, 8-pixel font. -set_80x60: - call set_80x30 # Set 480 scans - call set_8pt # And 8-pt font - movw $0xdf12, %ax # VGA vertical display end - movw $0x503c, force_size - jmp setvde - -# Special hack for ThinkPad graphics -set_gfx: -#ifdef CONFIG_VIDEO_GFX_HACK - movw $VIDEO_GFX_BIOS_AX, %ax - movw $VIDEO_GFX_BIOS_BX, %bx - int $0x10 - movw $VIDEO_GFX_DUMMY_RESOLUTION, force_size - stc -#endif - ret - -#ifdef CONFIG_VIDEO_RETAIN - -# Store screen contents to temporary buffer. -store_screen: - cmpb $0, do_restore # Already stored? - jnz stsr - - testb $CAN_USE_HEAP, loadflags # Have we space for storing? - jz stsr - - pushw %ax - pushw %bx - pushw force_size # Don't force specific size - movw $0, force_size - call mode_params # Obtain params of current mode - popw force_size - movb %fs:(PARAM_VIDEO_LINES), %ah - movb %fs:(PARAM_VIDEO_COLS), %al - movw %ax, %bx # BX=dimensions - mulb %ah - movw %ax, %cx # CX=number of characters - addw %ax, %ax # Calculate image size - addw $modelist+1024+4, %ax - cmpw heap_end_ptr, %ax - jnc sts1 # Unfortunately, out of memory - - movw %fs:(PARAM_CURSOR_POS), %ax # Store mode params - leaw modelist+1024, %di - stosw - movw %bx, %ax - stosw - pushw %ds # Store the screen - movw video_segment, %ds - xorw %si, %si - rep - movsw - popw %ds - incb do_restore # Screen will be restored later -sts1: popw %bx - popw %ax -stsr: ret - -# Restore screen contents from temporary buffer. -restore_screen: - cmpb $0, do_restore # Has the screen been stored? - jz res1 - - call mode_params # Get parameters of current mode - movb %fs:(PARAM_VIDEO_LINES), %cl - movb %fs:(PARAM_VIDEO_COLS), %ch - leaw modelist+1024, %si # Screen buffer - lodsw # Set cursor position - movw %ax, %dx - cmpb %cl, %dh - jc res2 - - movb %cl, %dh - decb %dh -res2: cmpb %ch, %dl - jc res3 - - movb %ch, %dl - decb %dl -res3: movb $0x02, %ah - movb $0x00, %bh - int $0x10 - lodsw # Display size - movb %ah, %dl # DL=number of lines - movb $0, %ah # BX=phys. length of orig. line - movw %ax, %bx - cmpb %cl, %dl # Too many? - jc res4 - - pushw %ax - movb %dl, %al - subb %cl, %al - mulb %bl - addw %ax, %si - addw %ax, %si - popw %ax - movb %cl, %dl -res4: cmpb %ch, %al # Too wide? - jc res5 - - movb %ch, %al # AX=width of src. line -res5: movb $0, %cl - xchgb %ch, %cl - movw %cx, %bp # BP=width of dest. line - pushw %es - movw video_segment, %es - xorw %di, %di # Move the data - addw %bx, %bx # Convert BX and BP to _bytes_ - addw %bp, %bp -res6: pushw %si - pushw %di - movw %ax, %cx - rep - movsw - popw %di - popw %si - addw %bp, %di - addw %bx, %si - decb %dl - jnz res6 - - popw %es # Done -res1: ret -#endif /* CONFIG_VIDEO_RETAIN */ - -# Write to indexed VGA register (AL=index, AH=data, DX=index reg. port) -outidx: outb %al, %dx - pushw %ax - movb %ah, %al - incw %dx - outb %al, %dx - decw %dx - popw %ax - ret - -# Build the table of video modes (stored after the setup.S code at the -# `modelist' label. Each video mode record looks like: -# .word MODE-ID (our special mode ID (see above)) -# .byte rows (number of rows) -# .byte columns (number of columns) -# Returns address of the end of the table in DI, the end is marked -# with a ASK_VGA ID. -mode_table: - movw mt_end, %di # Already filled? - orw %di, %di - jnz mtab1x - - leaw modelist, %di # Store standard modes: - movl $VIDEO_80x25 + 0x50190000, %eax # The 80x25 mode (ALL) - stosl - movb adapter, %al # CGA/MDA/HGA -- no more modes - orb %al, %al - jz mtabe - - decb %al - jnz mtabv - - movl $VIDEO_8POINT + 0x502b0000, %eax # The 80x43 EGA mode - stosl - jmp mtabe - -mtab1x: jmp mtab1 - -mtabv: leaw vga_modes, %si # All modes for std VGA - movw $vga_modes_end-vga_modes, %cx - rep # I'm unable to use movsw as I don't know how to store a half - movsb # of the expression above to cx without using explicit shr. - - cmpb $0, scanning # Mode scan requested? - jz mscan1 - - call mode_scan -mscan1: - -#ifdef CONFIG_VIDEO_LOCAL - call local_modes -#endif /* CONFIG_VIDEO_LOCAL */ - -#ifdef CONFIG_VIDEO_VESA - call vesa_modes # Detect VESA VGA modes -#endif /* CONFIG_VIDEO_VESA */ - -#ifdef CONFIG_VIDEO_SVGA - cmpb $0, scanning # Bypass when scanning - jnz mscan2 - - call svga_modes # Detect SVGA cards & modes -mscan2: -#endif /* CONFIG_VIDEO_SVGA */ - -mtabe: - -#ifdef CONFIG_VIDEO_COMPACT - leaw modelist, %si - movw %di, %dx - movw %si, %di -cmt1: cmpw %dx, %si # Scan all modes - jz cmt2 - - leaw modelist, %bx # Find in previous entries - movw 2(%si), %cx -cmt3: cmpw %bx, %si - jz cmt4 - - cmpw 2(%bx), %cx # Found => don't copy this entry - jz cmt5 - - addw $4, %bx - jmp cmt3 - -cmt4: movsl # Copy entry - jmp cmt1 - -cmt5: addw $4, %si # Skip entry - jmp cmt1 - -cmt2: -#endif /* CONFIG_VIDEO_COMPACT */ - - movw $ASK_VGA, (%di) # End marker - movw %di, mt_end -mtab1: leaw modelist, %si # SI=mode list, DI=list end -ret0: ret - -# Modes usable on all standard VGAs -vga_modes: - .word VIDEO_8POINT - .word 0x5032 # 80x50 - .word VIDEO_80x43 - .word 0x502b # 80x43 - .word VIDEO_80x28 - .word 0x501c # 80x28 - .word VIDEO_80x30 - .word 0x501e # 80x30 - .word VIDEO_80x34 - .word 0x5022 # 80x34 - .word VIDEO_80x60 - .word 0x503c # 80x60 -#ifdef CONFIG_VIDEO_GFX_HACK - .word VIDEO_GFX_HACK - .word VIDEO_GFX_DUMMY_RESOLUTION -#endif - -vga_modes_end: -# Detect VESA modes. - -#ifdef CONFIG_VIDEO_VESA -vesa_modes: - cmpb $2, adapter # VGA only - jnz ret0 - - movw %di, %bp # BP=original mode table end - addw $0x200, %di # Buffer space - movw $0x4f00, %ax # VESA Get card info call - int $0x10 - movw %bp, %di - cmpw $0x004f, %ax # Successful? - jnz ret0 - - cmpw $0x4556, 0x200(%di) - jnz ret0 - - cmpw $0x4153, 0x202(%di) - jnz ret0 - - movw $vesa_name, card_name # Set name to "VESA VGA" - pushw %gs - lgsw 0x20e(%di), %si # GS:SI=mode list - movw $128, %cx # Iteration limit -vesa1: -# gas version 2.9.1, using BFD version 2.9.1.0.23 buggers the next inst. -# XXX: lodsw %gs:(%si), %ax # Get next mode in the list - gs; lodsw - cmpw $0xffff, %ax # End of the table? - jz vesar - - cmpw $0x0080, %ax # Check validity of mode ID - jc vesa2 - - orb %ah, %ah # Valid IDs: 0x0000-0x007f/0x0100-0x07ff - jz vesan # Certain BIOSes report 0x80-0xff! - - cmpw $0x0800, %ax - jnc vesae - -vesa2: pushw %cx - movw %ax, %cx # Get mode information structure - movw $0x4f01, %ax - int $0x10 - movw %cx, %bx # BX=mode number - addb $VIDEO_FIRST_VESA>>8, %bh - popw %cx - cmpw $0x004f, %ax - jnz vesan # Don't report errors (buggy BIOSES) - - movb (%di), %al # Check capabilities. We require - andb $0x19, %al # a color text mode. - cmpb $0x09, %al - jnz vesan - - cmpw $0xb800, 8(%di) # Standard video memory address required - jnz vesan - - testb $2, (%di) # Mode characteristics supplied? - movw %bx, (%di) # Store mode number - jz vesa3 - - xorw %dx, %dx - movw 0x12(%di), %bx # Width - orb %bh, %bh - jnz vesan - - movb %bl, 0x3(%di) - movw 0x14(%di), %ax # Height - orb %ah, %ah - jnz vesan - - movb %al, 2(%di) - mulb %bl - cmpw $8193, %ax # Small enough for Linux console driver? - jnc vesan - - jmp vesaok - -vesa3: subw $0x8108, %bx # This mode has no detailed info specified, - jc vesan # so it must be a standard VESA mode. - - cmpw $5, %bx - jnc vesan - - movw vesa_text_mode_table(%bx), %ax - movw %ax, 2(%di) -vesaok: addw $4, %di # The mode is valid. Store it. -vesan: loop vesa1 # Next mode. Limit exceeded => error -vesae: leaw vesaer, %si - call prtstr - movw %bp, %di # Discard already found modes. -vesar: popw %gs - ret - -# Dimensions of standard VESA text modes -vesa_text_mode_table: - .byte 60, 80 # 0108 - .byte 25, 132 # 0109 - .byte 43, 132 # 010A - .byte 50, 132 # 010B - .byte 60, 132 # 010C -#endif /* CONFIG_VIDEO_VESA */ - -# Scan for video modes. A bit dirty, but should work. -mode_scan: - movw $0x0100, %cx # Start with mode 0 -scm1: movb $0, %ah # Test the mode - movb %cl, %al - int $0x10 - movb $0x0f, %ah - int $0x10 - cmpb %cl, %al - jnz scm2 # Mode not set - - movw $0x3c0, %dx # Test if it's a text mode - movb $0x10, %al # Mode bits - call inidx - andb $0x03, %al - jnz scm2 - - movb $0xce, %dl # Another set of mode bits - movb $0x06, %al - call inidx - shrb %al - jc scm2 - - movb $0xd4, %dl # Cursor location - movb $0x0f, %al - call inidx - orb %al, %al - jnz scm2 - - movw %cx, %ax # Ok, store the mode - stosw - movb %gs:(0x484), %al # Number of rows - incb %al - stosb - movw %gs:(0x44a), %ax # Number of columns - stosb -scm2: incb %cl - jns scm1 - - movw $0x0003, %ax # Return back to mode 3 - int $0x10 - ret - -tstidx: outw %ax, %dx # OUT DX,AX and inidx -inidx: outb %al, %dx # Read from indexed VGA register - incw %dx # AL=index, DX=index reg port -> AL=data - inb %dx, %al - decw %dx - ret - -# Try to detect type of SVGA card and supply (usually approximate) video -# mode table for it. - -#ifdef CONFIG_VIDEO_SVGA -svga_modes: - leaw svga_table, %si # Test all known SVGA adapters -dosvga: lodsw - movw %ax, %bp # Default mode table - orw %ax, %ax - jz didsv1 - - lodsw # Pointer to test routine - pushw %si - pushw %di - pushw %es - movw $0xc000, %bx - movw %bx, %es - call *%ax # Call test routine - popw %es - popw %di - popw %si - orw %bp, %bp - jz dosvga - - movw %bp, %si # Found, copy the modes - movb svga_prefix, %ah -cpsvga: lodsb - orb %al, %al - jz didsv - - stosw - movsw - jmp cpsvga - -didsv: movw %si, card_name # Store pointer to card name -didsv1: ret - -# Table of all known SVGA cards. For each card, we store a pointer to -# a table of video modes supported by the card and a pointer to a routine -# used for testing of presence of the card. The video mode table is always -# followed by the name of the card or the chipset. -svga_table: - .word ati_md, ati_test - .word oak_md, oak_test - .word paradise_md, paradise_test - .word realtek_md, realtek_test - .word s3_md, s3_test - .word chips_md, chips_test - .word video7_md, video7_test - .word cirrus5_md, cirrus5_test - .word cirrus6_md, cirrus6_test - .word cirrus1_md, cirrus1_test - .word ahead_md, ahead_test - .word everex_md, everex_test - .word genoa_md, genoa_test - .word trident_md, trident_test - .word tseng_md, tseng_test - .word 0 - -# Test routines and mode tables: - -# S3 - The test algorithm was taken from the SuperProbe package -# for XFree86 1.2.1. Report bugs to Christoph.Niemann@linux.org -s3_test: - movw $0x0f35, %cx # we store some constants in cl/ch - movw $0x03d4, %dx - movb $0x38, %al - call inidx - movb %al, %bh # store current CRT-register 0x38 - movw $0x0038, %ax - call outidx # disable writing to special regs - movb %cl, %al # check whether we can write special reg 0x35 - call inidx - movb %al, %bl # save the current value of CRT reg 0x35 - andb $0xf0, %al # clear bits 0-3 - movb %al, %ah - movb %cl, %al # and write it to CRT reg 0x35 - call outidx - call inidx # now read it back - andb %ch, %al # clear the upper 4 bits - jz s3_2 # the first test failed. But we have a - - movb %bl, %ah # second chance - movb %cl, %al - call outidx - jmp s3_1 # do the other tests - -s3_2: movw %cx, %ax # load ah with 0xf and al with 0x35 - orb %bl, %ah # set the upper 4 bits of ah with the orig value - call outidx # write ... - call inidx # ... and reread - andb %cl, %al # turn off the upper 4 bits - pushw %ax - movb %bl, %ah # restore old value in register 0x35 - movb %cl, %al - call outidx - popw %ax - cmpb %ch, %al # setting lower 4 bits was successful => bad - je no_s3 # writing is allowed => this is not an S3 - -s3_1: movw $0x4838, %ax # allow writing to special regs by putting - call outidx # magic number into CRT-register 0x38 - movb %cl, %al # check whether we can write special reg 0x35 - call inidx - movb %al, %bl - andb $0xf0, %al - movb %al, %ah - movb %cl, %al - call outidx - call inidx - andb %ch, %al - jnz no_s3 # no, we can't write => no S3 - - movw %cx, %ax - orb %bl, %ah - call outidx - call inidx - andb %ch, %al - pushw %ax - movb %bl, %ah # restore old value in register 0x35 - movb %cl, %al - call outidx - popw %ax - cmpb %ch, %al - jne no_s31 # writing not possible => no S3 - movb $0x30, %al - call inidx # now get the S3 id ... - leaw idS3, %di - movw $0x10, %cx - repne - scasb - je no_s31 - - movb %bh, %ah - movb $0x38, %al - jmp s3rest - -no_s3: movb $0x35, %al # restore CRT register 0x35 - movb %bl, %ah - call outidx -no_s31: xorw %bp, %bp # Detection failed -s3rest: movb %bh, %ah - movb $0x38, %al # restore old value of CRT register 0x38 - jmp outidx - -idS3: .byte 0x81, 0x82, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95 - .byte 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa8, 0xb0 - -s3_md: .byte 0x54, 0x2b, 0x84 - .byte 0x55, 0x19, 0x84 - .byte 0 - .ascii "S3" - .byte 0 - -# ATI cards. -ati_test: - leaw idati, %si - movw $0x31, %di - movw $0x09, %cx - repe - cmpsb - je atiok - - xorw %bp, %bp -atiok: ret - -idati: .ascii "761295520" - -ati_md: .byte 0x23, 0x19, 0x84 - .byte 0x33, 0x2c, 0x84 - .byte 0x22, 0x1e, 0x64 - .byte 0x21, 0x19, 0x64 - .byte 0x58, 0x21, 0x50 - .byte 0x5b, 0x1e, 0x50 - .byte 0 - .ascii "ATI" - .byte 0 - -# AHEAD -ahead_test: - movw $0x200f, %ax - movw $0x3ce, %dx - outw %ax, %dx - incw %dx - inb %dx, %al - cmpb $0x20, %al - je isahed - - cmpb $0x21, %al - je isahed - - xorw %bp, %bp -isahed: ret - -ahead_md: - .byte 0x22, 0x2c, 0x84 - .byte 0x23, 0x19, 0x84 - .byte 0x24, 0x1c, 0x84 - .byte 0x2f, 0x32, 0xa0 - .byte 0x32, 0x22, 0x50 - .byte 0x34, 0x42, 0x50 - .byte 0 - .ascii "Ahead" - .byte 0 - -# Chips & Tech. -chips_test: - movw $0x3c3, %dx - inb %dx, %al - orb $0x10, %al - outb %al, %dx - movw $0x104, %dx - inb %dx, %al - movb %al, %bl - movw $0x3c3, %dx - inb %dx, %al - andb $0xef, %al - outb %al, %dx - cmpb $0xa5, %bl - je cantok - - xorw %bp, %bp -cantok: ret - -chips_md: - .byte 0x60, 0x19, 0x84 - .byte 0x61, 0x32, 0x84 - .byte 0 - .ascii "Chips & Technologies" - .byte 0 - -# Cirrus Logic 5X0 -cirrus1_test: - movw $0x3d4, %dx - movb $0x0c, %al - outb %al, %dx - incw %dx - inb %dx, %al - movb %al, %bl - xorb %al, %al - outb %al, %dx - decw %dx - movb $0x1f, %al - outb %al, %dx - incw %dx - inb %dx, %al - movb %al, %bh - xorb %ah, %ah - shlb $4, %al - movw %ax, %cx - movb %bh, %al - shrb $4, %al - addw %ax, %cx - shlw $8, %cx - addw $6, %cx - movw %cx, %ax - movw $0x3c4, %dx - outw %ax, %dx - incw %dx - inb %dx, %al - andb %al, %al - jnz nocirr - - movb %bh, %al - outb %al, %dx - inb %dx, %al - cmpb $0x01, %al - je iscirr - -nocirr: xorw %bp, %bp -iscirr: movw $0x3d4, %dx - movb %bl, %al - xorb %ah, %ah - shlw $8, %ax - addw $0x0c, %ax - outw %ax, %dx - ret - -cirrus1_md: - .byte 0x1f, 0x19, 0x84 - .byte 0x20, 0x2c, 0x84 - .byte 0x22, 0x1e, 0x84 - .byte 0x31, 0x25, 0x64 - .byte 0 - .ascii "Cirrus Logic 5X0" - .byte 0 - -# Cirrus Logic 54XX -cirrus5_test: - movw $0x3c4, %dx - movb $6, %al - call inidx - movb %al, %bl # BL=backup - movw $6, %ax - call tstidx - cmpb $0x0f, %al - jne c5fail - - movw $0x1206, %ax - call tstidx - cmpb $0x12, %al - jne c5fail - - movb $0x1e, %al - call inidx - movb %al, %bh - movb %bh, %ah - andb $0xc0, %ah - movb $0x1e, %al - call tstidx - andb $0x3f, %al - jne c5xx - - movb $0x1e, %al - movb %bh, %ah - orb $0x3f, %ah - call tstidx - xorb $0x3f, %al - andb $0x3f, %al -c5xx: pushf - movb $0x1e, %al - movb %bh, %ah - outw %ax, %dx - popf - je c5done - -c5fail: xorw %bp, %bp -c5done: movb $6, %al - movb %bl, %ah - outw %ax, %dx - ret - -cirrus5_md: - .byte 0x14, 0x19, 0x84 - .byte 0x54, 0x2b, 0x84 - .byte 0 - .ascii "Cirrus Logic 54XX" - .byte 0 - -# Cirrus Logic 64XX -- no known extra modes, but must be identified, because -# it's misidentified by the Ahead test. -cirrus6_test: - movw $0x3ce, %dx - movb $0x0a, %al - call inidx - movb %al, %bl # BL=backup - movw $0xce0a, %ax - call tstidx - orb %al, %al - jne c2fail - - movw $0xec0a, %ax - call tstidx - cmpb $0x01, %al - jne c2fail - - movb $0xaa, %al - call inidx # 4X, 5X, 7X and 8X are valid 64XX chip ID's. - shrb $4, %al - subb $4, %al - jz c6done - - decb %al - jz c6done - - subb $2, %al - jz c6done - - decb %al - jz c6done - -c2fail: xorw %bp, %bp -c6done: movb $0x0a, %al - movb %bl, %ah - outw %ax, %dx - ret - -cirrus6_md: - .byte 0 - .ascii "Cirrus Logic 64XX" - .byte 0 - -# Everex / Trident -everex_test: - movw $0x7000, %ax - xorw %bx, %bx - int $0x10 - cmpb $0x70, %al - jne noevrx - - shrw $4, %dx - cmpw $0x678, %dx - je evtrid - - cmpw $0x236, %dx - jne evrxok - -evtrid: leaw trident_md, %bp -evrxok: ret - -noevrx: xorw %bp, %bp - ret - -everex_md: - .byte 0x03, 0x22, 0x50 - .byte 0x04, 0x3c, 0x50 - .byte 0x07, 0x2b, 0x64 - .byte 0x08, 0x4b, 0x64 - .byte 0x0a, 0x19, 0x84 - .byte 0x0b, 0x2c, 0x84 - .byte 0x16, 0x1e, 0x50 - .byte 0x18, 0x1b, 0x64 - .byte 0x21, 0x40, 0xa0 - .byte 0x40, 0x1e, 0x84 - .byte 0 - .ascii "Everex/Trident" - .byte 0 - -# Genoa. -genoa_test: - leaw idgenoa, %si # Check Genoa 'clues' - xorw %ax, %ax - movb %es:(0x37), %al - movw %ax, %di - movw $0x04, %cx - decw %si - decw %di -l1: incw %si - incw %di - movb (%si), %al - testb %al, %al - jz l2 - - cmpb %es:(%di), %al -l2: loope l1 - orw %cx, %cx - je isgen - - xorw %bp, %bp -isgen: ret - -idgenoa: .byte 0x77, 0x00, 0x99, 0x66 - -genoa_md: - .byte 0x58, 0x20, 0x50 - .byte 0x5a, 0x2a, 0x64 - .byte 0x60, 0x19, 0x84 - .byte 0x61, 0x1d, 0x84 - .byte 0x62, 0x20, 0x84 - .byte 0x63, 0x2c, 0x84 - .byte 0x64, 0x3c, 0x84 - .byte 0x6b, 0x4f, 0x64 - .byte 0x72, 0x3c, 0x50 - .byte 0x74, 0x42, 0x50 - .byte 0x78, 0x4b, 0x64 - .byte 0 - .ascii "Genoa" - .byte 0 - -# OAK -oak_test: - leaw idoakvga, %si - movw $0x08, %di - movw $0x08, %cx - repe - cmpsb - je isoak - - xorw %bp, %bp -isoak: ret - -idoakvga: .ascii "OAK VGA " - -oak_md: .byte 0x4e, 0x3c, 0x50 - .byte 0x4f, 0x3c, 0x84 - .byte 0x50, 0x19, 0x84 - .byte 0x51, 0x2b, 0x84 - .byte 0 - .ascii "OAK" - .byte 0 - -# WD Paradise. -paradise_test: - leaw idparadise, %si - movw $0x7d, %di - movw $0x04, %cx - repe - cmpsb - je ispara - - xorw %bp, %bp -ispara: ret - -idparadise: .ascii "VGA=" - -paradise_md: - .byte 0x41, 0x22, 0x50 - .byte 0x47, 0x1c, 0x84 - .byte 0x55, 0x19, 0x84 - .byte 0x54, 0x2c, 0x84 - .byte 0 - .ascii "Paradise" - .byte 0 - -# Trident. -trident_test: - movw $0x3c4, %dx - movb $0x0e, %al - outb %al, %dx - incw %dx - inb %dx, %al - xchgb %al, %ah - xorb %al, %al - outb %al, %dx - inb %dx, %al - xchgb %ah, %al - movb %al, %bl # Strange thing ... in the book this wasn't - andb $0x02, %bl # necessary but it worked on my card which - jz setb2 # is a trident. Without it the screen goes - # blurred ... - andb $0xfd, %al - jmp clrb2 - -setb2: orb $0x02, %al -clrb2: outb %al, %dx - andb $0x0f, %ah - cmpb $0x02, %ah - je istrid - - xorw %bp, %bp -istrid: ret - -trident_md: - .byte 0x50, 0x1e, 0x50 - .byte 0x51, 0x2b, 0x50 - .byte 0x52, 0x3c, 0x50 - .byte 0x57, 0x19, 0x84 - .byte 0x58, 0x1e, 0x84 - .byte 0x59, 0x2b, 0x84 - .byte 0x5a, 0x3c, 0x84 - .byte 0 - .ascii "Trident" - .byte 0 - -# Tseng. -tseng_test: - movw $0x3cd, %dx - inb %dx, %al # Could things be this simple ! :-) - movb %al, %bl - movb $0x55, %al - outb %al, %dx - inb %dx, %al - movb %al, %ah - movb %bl, %al - outb %al, %dx - cmpb $0x55, %ah - je istsen - -isnot: xorw %bp, %bp -istsen: ret - -tseng_md: - .byte 0x26, 0x3c, 0x50 - .byte 0x2a, 0x28, 0x64 - .byte 0x23, 0x19, 0x84 - .byte 0x24, 0x1c, 0x84 - .byte 0x22, 0x2c, 0x84 - .byte 0x21, 0x3c, 0x84 - .byte 0 - .ascii "Tseng" - .byte 0 - -# Video7. -video7_test: - movw $0x3cc, %dx - inb %dx, %al - movw $0x3b4, %dx - andb $0x01, %al - jz even7 - - movw $0x3d4, %dx -even7: movb $0x0c, %al - outb %al, %dx - incw %dx - inb %dx, %al - movb %al, %bl - movb $0x55, %al - outb %al, %dx - inb %dx, %al - decw %dx - movb $0x1f, %al - outb %al, %dx - incw %dx - inb %dx, %al - movb %al, %bh - decw %dx - movb $0x0c, %al - outb %al, %dx - incw %dx - movb %bl, %al - outb %al, %dx - movb $0x55, %al - xorb $0xea, %al - cmpb %bh, %al - jne isnot - - movb $VIDEO_FIRST_V7>>8, svga_prefix # Use special mode switching - ret - -video7_md: - .byte 0x40, 0x2b, 0x50 - .byte 0x43, 0x3c, 0x50 - .byte 0x44, 0x3c, 0x64 - .byte 0x41, 0x19, 0x84 - .byte 0x42, 0x2c, 0x84 - .byte 0x45, 0x1c, 0x84 - .byte 0 - .ascii "Video 7" - .byte 0 - -# Realtek VGA -realtek_test: - leaw idrtvga, %si - movw $0x45, %di - movw $0x0b, %cx - repe - cmpsb - je isrt - - xorw %bp, %bp -isrt: ret - -idrtvga: .ascii "REALTEK VGA" - -realtek_md: - .byte 0x1a, 0x3c, 0x50 - .byte 0x1b, 0x19, 0x84 - .byte 0x1c, 0x1e, 0x84 - .byte 0x1d, 0x2b, 0x84 - .byte 0x1e, 0x3c, 0x84 - .byte 0 - .ascii "REALTEK" - .byte 0 - -#endif /* CONFIG_VIDEO_SVGA */ - -# User-defined local mode table (VGA only) -#ifdef CONFIG_VIDEO_LOCAL -local_modes: - leaw local_mode_table, %si -locm1: lodsw - orw %ax, %ax - jz locm2 - - stosw - movsw - jmp locm1 - -locm2: ret - -# This is the table of local video modes which can be supplied manually -# by the user. Each entry consists of mode ID (word) and dimensions -# (byte for column count and another byte for row count). These modes -# are placed before all SVGA and VESA modes and override them if table -# compacting is enabled. The table must end with a zero word followed -# by NUL-terminated video adapter name. -local_mode_table: - .word 0x0100 # Example: 40x25 - .byte 25,40 - .word 0 - .ascii "Local" - .byte 0 -#endif /* CONFIG_VIDEO_LOCAL */ - -# Read a key and return the ASCII code in al, scan code in ah -getkey: xorb %ah, %ah - int $0x16 - ret - -# Read a key with a timeout of 30 seconds. -# The hardware clock is used to get the time. -getkt: call gettime - addb $30, %al # Wait 30 seconds - cmpb $60, %al - jl lminute - - subb $60, %al -lminute: - movb %al, %cl -again: movb $0x01, %ah - int $0x16 - jnz getkey # key pressed, so get it - - call gettime - cmpb %cl, %al - jne again - - movb $0x20, %al # timeout, return `space' - ret - -# Flush the keyboard buffer -flush: movb $0x01, %ah - int $0x16 - jz empty - - xorb %ah, %ah - int $0x16 - jmp flush - -empty: ret - -# Print hexadecimal number. -prthw: pushw %ax - movb %ah, %al - call prthb - popw %ax -prthb: pushw %ax - shrb $4, %al - call prthn - popw %ax - andb $0x0f, %al -prthn: cmpb $0x0a, %al - jc prth1 - - addb $0x07, %al -prth1: addb $0x30, %al - jmp prtchr - -# Print decimal number in al -prtdec: pushw %ax - pushw %cx - xorb %ah, %ah - movb $0x0a, %cl - idivb %cl - cmpb $0x09, %al - jbe lt100 - - call prtdec - jmp skip10 - -lt100: addb $0x30, %al - call prtchr -skip10: movb %ah, %al - addb $0x30, %al - call prtchr - popw %cx - popw %ax - ret - -store_edid: -#ifdef CONFIG_FIRMWARE_EDID - pushw %es # just save all registers - pushw %ax - pushw %bx - pushw %cx - pushw %dx - pushw %di - - pushw %fs - popw %es - - movl $0x13131313, %eax # memset block with 0x13 - movw $32, %cx - movw $0x140, %di - cld - rep - stosl - - cmpw $0x0200, vbe_version # only do EDID on >= VBE2.0 - jl no_edid - - pushw %es # save ES - xorw %di, %di # Report Capability - pushw %di - popw %es # ES:DI must be 0:0 - movw $0x4f15, %ax - xorw %bx, %bx - xorw %cx, %cx - int $0x10 - popw %es # restore ES - - cmpb $0x00, %ah # call successful - jne no_edid - - cmpb $0x4f, %al # function supported - jne no_edid - - movw $0x4f15, %ax # do VBE/DDC - movw $0x01, %bx - movw $0x00, %cx - movw $0x00, %dx - movw $0x140, %di - int $0x10 - -no_edid: - popw %di # restore all registers - popw %dx - popw %cx - popw %bx - popw %ax - popw %es -#endif - ret - -# VIDEO_SELECT-only variables -mt_end: .word 0 # End of video mode table if built -edit_buf: .space 6 # Line editor buffer -card_name: .word 0 # Pointer to adapter name -scanning: .byte 0 # Performing mode scan -do_restore: .byte 0 # Screen contents altered during mode change -svga_prefix: .byte VIDEO_FIRST_BIOS>>8 # Default prefix for BIOS modes -graphic_mode: .byte 0 # Graphic mode with a linear frame buffer -dac_size: .byte 6 # DAC bit depth -vbe_version: .word 0 # VBE bios version - -# Status messages -keymsg: .ascii "Press <RETURN> to see video modes available, " - .ascii "<SPACE> to continue or wait 30 secs" - .byte 0x0d, 0x0a, 0 - -listhdr: .byte 0x0d, 0x0a - .ascii "Mode: COLSxROWS:" - -crlft: .byte 0x0d, 0x0a, 0 - -prompt: .byte 0x0d, 0x0a - .asciz "Enter mode number or `scan': " - -unknt: .asciz "Unknown mode ID. Try again." - -badmdt: .ascii "You passed an undefined mode number." - .byte 0x0d, 0x0a, 0 - -vesaer: .ascii "Error: Scanning of VESA modes failed. Please " - .ascii "report to <mj@ucw.cz>." - .byte 0x0d, 0x0a, 0 - -old_name: .asciz "CGA/MDA/HGA" - -ega_name: .asciz "EGA" - -svga_name: .ascii " " - -vga_name: .asciz "VGA" - -vesa_name: .asciz "VESA" - -name_bann: .asciz "Video adapter: " -#endif /* CONFIG_VIDEO_SELECT */ - -# Other variables: -adapter: .byte 0 # Video adapter: 0=CGA/MDA/HGA,1=EGA,2=VGA -video_segment: .word 0xb800 # Video memory segment -force_size: .word 0 # Use this size instead of the one in BIOS vars diff --git a/arch/i386/boot/video.c b/arch/i386/boot/video.c new file mode 100644 index 00000000000..958130ef004 --- /dev/null +++ b/arch/i386/boot/video.c @@ -0,0 +1,461 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright 2007 rPath, Inc. - All Rights Reserved + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + * + * ----------------------------------------------------------------------- */ + +/* + * arch/i386/boot/video.c + * + * Select video mode + */ + +#include "boot.h" +#include "video.h" +#include "vesa.h" + +/* + * Mode list variables + */ +static struct card_info cards[]; /* List of cards to probe for */ + +/* + * Common variables + */ +int adapter; /* 0=CGA/MDA/HGC, 1=EGA, 2=VGA+ */ +u16 video_segment; +int force_x, force_y; /* Don't query the BIOS for cols/rows */ + +int do_restore = 0; /* Screen contents changed during mode flip */ +int graphic_mode; /* Graphic mode with linear frame buffer */ + +static void store_cursor_position(void) +{ + u16 curpos; + u16 ax, bx; + + ax = 0x0300; + bx = 0; + asm(INT10 + : "=d" (curpos), "+a" (ax), "+b" (bx) + : : "ecx", "esi", "edi"); + + boot_params.screen_info.orig_x = curpos; + boot_params.screen_info.orig_y = curpos >> 8; +} + +static void store_video_mode(void) +{ + u16 ax, page; + + /* N.B.: the saving of the video page here is a bit silly, + since we pretty much assume page 0 everywhere. */ + ax = 0x0f00; + asm(INT10 + : "+a" (ax), "=b" (page) + : : "ecx", "edx", "esi", "edi"); + + /* Not all BIOSes are clean with respect to the top bit */ + boot_params.screen_info.orig_video_mode = ax & 0x7f; + boot_params.screen_info.orig_video_page = page; +} + +/* + * Store the video mode parameters for later usage by the kernel. + * This is done by asking the BIOS except for the rows/columns + * parameters in the default 80x25 mode -- these are set directly, + * because some very obscure BIOSes supply insane values. + */ +static void store_mode_params(void) +{ + u16 font_size; + int x, y; + + /* For graphics mode, it is up to the mode-setting driver + (currently only video-vesa.c) to store the parameters */ + if (graphic_mode) + return; + + store_cursor_position(); + store_video_mode(); + + if (boot_params.screen_info.orig_video_mode == 0x07) { + /* MDA, HGC, or VGA in monochrome mode */ + video_segment = 0xb000; + } else { + /* CGA, EGA, VGA and so forth */ + video_segment = 0xb800; + } + + set_fs(0); + font_size = rdfs16(0x485); /* Font size, BIOS area */ + boot_params.screen_info.orig_video_points = font_size; + + x = rdfs16(0x44a); + y = (adapter == ADAPTER_CGA) ? 25 : rdfs8(0x484)+1; + + if (force_x) + x = force_x; + if (force_y) + y = force_y; + + boot_params.screen_info.orig_video_cols = x; + boot_params.screen_info.orig_video_lines = y; +} + +/* Probe the video drivers and have them generate their mode lists. */ +static void probe_cards(int unsafe) +{ + struct card_info *card; + static u8 probed[2]; + + if (probed[unsafe]) + return; + + probed[unsafe] = 1; + + for (card = video_cards; card < video_cards_end; card++) { + if (card->unsafe == unsafe) { + if (card->probe) + card->nmodes = card->probe(); + else + card->nmodes = 0; + } + } +} + +/* Test if a mode is defined */ +int mode_defined(u16 mode) +{ + struct card_info *card; + struct mode_info *mi; + int i; + + for (card = video_cards; card < video_cards_end; card++) { + mi = card->modes; + for (i = 0; i < card->nmodes; i++, mi++) { + if (mi->mode == mode) + return 1; + } + } + + return 0; +} + +/* Set mode (without recalc) */ +static int raw_set_mode(u16 mode) +{ + int nmode, i; + struct card_info *card; + struct mode_info *mi; + + /* Drop the recalc bit if set */ + mode &= ~VIDEO_RECALC; + + /* Scan for mode based on fixed ID, position, or resolution */ + nmode = 0; + for (card = video_cards; card < video_cards_end; card++) { + mi = card->modes; + for (i = 0; i < card->nmodes; i++, mi++) { + int visible = mi->x || mi->y; + + if ((mode == nmode && visible) || + mode == mi->mode || + mode == (mi->y << 8)+mi->x) + return card->set_mode(mi); + + if (visible) + nmode++; + } + } + + /* Nothing found? Is it an "exceptional" (unprobed) mode? */ + for (card = video_cards; card < video_cards_end; card++) { + if (mode >= card->xmode_first && + mode < card->xmode_first+card->xmode_n) { + struct mode_info mix; + mix.mode = mode; + mix.x = mix.y = 0; + return card->set_mode(&mix); + } + } + + /* Otherwise, failure... */ + return -1; +} + +/* + * Recalculate the vertical video cutoff (hack!) + */ +static void vga_recalc_vertical(void) +{ + unsigned int font_size, rows; + u16 crtc; + u8 pt, ov; + + set_fs(0); + font_size = rdfs8(0x485); /* BIOS: font size (pixels) */ + rows = force_y ? force_y : rdfs8(0x484)+1; /* Text rows */ + + rows *= font_size; /* Visible scan lines */ + rows--; /* ... minus one */ + + crtc = vga_crtc(); + + pt = in_idx(crtc, 0x11); + pt &= ~0x80; /* Unlock CR0-7 */ + out_idx(pt, crtc, 0x11); + + out_idx((u8)rows, crtc, 0x12); /* Lower height register */ + + ov = in_idx(crtc, 0x07); /* Overflow register */ + ov &= 0xbd; + ov |= (rows >> (8-1)) & 0x02; + ov |= (rows >> (9-6)) & 0x40; + out_idx(ov, crtc, 0x07); +} + +/* Set mode (with recalc if specified) */ +static int set_mode(u16 mode) +{ + int rv; + + /* Very special mode numbers... */ + if (mode == VIDEO_CURRENT_MODE) + return 0; /* Nothing to do... */ + else if (mode == NORMAL_VGA) + mode = VIDEO_80x25; + else if (mode == EXTENDED_VGA) + mode = VIDEO_8POINT; + + rv = raw_set_mode(mode); + if (rv) + return rv; + + if (mode & VIDEO_RECALC) + vga_recalc_vertical(); + + return 0; +} + +static unsigned int get_entry(void) +{ + char entry_buf[4]; + int i, len = 0; + int key; + unsigned int v; + + do { + key = getchar(); + + if (key == '\b') { + if (len > 0) { + puts("\b \b"); + len--; + } + } else if ((key >= '0' && key <= '9') || + (key >= 'A' && key <= 'Z') || + (key >= 'a' && key <= 'z')) { + if (len < sizeof entry_buf) { + entry_buf[len++] = key; + putchar(key); + } + } + } while (key != '\r'); + putchar('\n'); + + if (len == 0) + return VIDEO_CURRENT_MODE; /* Default */ + + v = 0; + for (i = 0; i < len; i++) { + v <<= 4; + key = entry_buf[i] | 0x20; + v += (key > '9') ? key-'a'+10 : key-'0'; + } + + return v; +} + +static void display_menu(void) +{ + struct card_info *card; + struct mode_info *mi; + char ch; + int i; + + puts("Mode: COLSxROWS:\n"); + + ch = '0'; + for (card = video_cards; card < video_cards_end; card++) { + mi = card->modes; + for (i = 0; i < card->nmodes; i++, mi++) { + int visible = mi->x && mi->y; + u16 mode_id = mi->mode ? mi->mode : + (mi->y << 8)+mi->x; + + if (!visible) + continue; /* Hidden mode */ + + printf("%c %04X %3dx%-3d %s\n", + ch, mode_id, mi->x, mi->y, card->card_name); + + if (ch == '9') + ch = 'a'; + else if (ch == 'z' || ch == ' ') + ch = ' '; /* Out of keys... */ + else + ch++; + } + } +} + +#define H(x) ((x)-'a'+10) +#define SCAN ((H('s')<<12)+(H('c')<<8)+(H('a')<<4)+H('n')) + +static unsigned int mode_menu(void) +{ + int key; + unsigned int sel; + + puts("Press <ENTER> to see video modes available, " + "<SPACE> to continue, or wait 30 sec\n"); + + kbd_flush(); + while (1) { + key = getchar_timeout(); + if (key == ' ' || key == 0) + return VIDEO_CURRENT_MODE; /* Default */ + if (key == '\r') + break; + putchar('\a'); /* Beep! */ + } + + + for (;;) { + display_menu(); + + puts("Enter a video mode or \"scan\" to scan for " + "additional modes: "); + sel = get_entry(); + if (sel != SCAN) + return sel; + + probe_cards(1); + } +} + +#ifdef CONFIG_VIDEO_RETAIN +/* Save screen content to the heap */ +struct saved_screen { + int x, y; + int curx, cury; + u16 *data; +} saved; + +static void save_screen(void) +{ + /* Should be called after store_mode_params() */ + saved.x = boot_params.screen_info.orig_video_cols; + saved.y = boot_params.screen_info.orig_video_lines; + saved.curx = boot_params.screen_info.orig_x; + saved.cury = boot_params.screen_info.orig_y; + + if (heap_free() < saved.x*saved.y*sizeof(u16)+512) + return; /* Not enough heap to save the screen */ + + saved.data = GET_HEAP(u16, saved.x*saved.y); + + set_fs(video_segment); + copy_from_fs(saved.data, 0, saved.x*saved.y*sizeof(u16)); +} + +static void restore_screen(void) +{ + /* Should be called after store_mode_params() */ + int xs = boot_params.screen_info.orig_video_cols; + int ys = boot_params.screen_info.orig_video_lines; + int y; + addr_t dst = 0; + u16 *src = saved.data; + u16 ax, bx, dx; + + if (graphic_mode) + return; /* Can't restore onto a graphic mode */ + + if (!src) + return; /* No saved screen contents */ + + /* Restore screen contents */ + + set_fs(video_segment); + for (y = 0; y < ys; y++) { + int npad; + + if (y < saved.y) { + int copy = (xs < saved.x) ? xs : saved.x; + copy_to_fs(dst, src, copy*sizeof(u16)); + dst += copy*sizeof(u16); + src += saved.x; + npad = (xs < saved.x) ? 0 : xs-saved.x; + } else { + npad = xs; + } + + /* Writes "npad" blank characters to + video_segment:dst and advances dst */ + asm volatile("pushw %%es ; " + "movw %2,%%es ; " + "shrw %%cx ; " + "jnc 1f ; " + "stosw \n\t" + "1: rep;stosl ; " + "popw %%es" + : "+D" (dst), "+c" (npad) + : "bdS" (video_segment), + "a" (0x07200720)); + } + + /* Restore cursor position */ + ax = 0x0200; /* Set cursor position */ + bx = 0; /* Page number (<< 8) */ + dx = (saved.cury << 8)+saved.curx; + asm volatile(INT10 + : "+a" (ax), "+b" (bx), "+d" (dx) + : : "ecx", "esi", "edi"); +} +#else +#define save_screen() ((void)0) +#define restore_screen() ((void)0) +#endif + +void set_video(void) +{ + u16 mode = boot_params.hdr.vid_mode; + + RESET_HEAP(); + + store_mode_params(); + save_screen(); + probe_cards(0); + + for (;;) { + if (mode == ASK_VGA) + mode = mode_menu(); + + if (!set_mode(mode)) + break; + + printf("Undefined video mode number: %x\n", mode); + mode = ASK_VGA; + } + vesa_store_edid(); + store_mode_params(); + + if (do_restore) + restore_screen(); +} diff --git a/arch/i386/boot/video.h b/arch/i386/boot/video.h new file mode 100644 index 00000000000..b92447d5121 --- /dev/null +++ b/arch/i386/boot/video.h @@ -0,0 +1,152 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright 2007 rPath, Inc. - All Rights Reserved + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + * + * ----------------------------------------------------------------------- */ + +/* + * arch/i386/boot/video.h + * + * Header file for the real-mode video probing code + */ + +#ifndef BOOT_VIDEO_H +#define BOOT_VIDEO_H + +#include <linux/types.h> + +/* Enable autodetection of SVGA adapters and modes. */ +#undef CONFIG_VIDEO_SVGA + +/* Enable autodetection of VESA modes */ +#define CONFIG_VIDEO_VESA + +/* Retain screen contents when switching modes */ +#define CONFIG_VIDEO_RETAIN + +/* Force 400 scan lines for standard modes (hack to fix bad BIOS behaviour */ +#undef CONFIG_VIDEO_400_HACK + +/* This code uses an extended set of video mode numbers. These include: + * Aliases for standard modes + * NORMAL_VGA (-1) + * EXTENDED_VGA (-2) + * ASK_VGA (-3) + * Video modes numbered by menu position -- NOT RECOMMENDED because of lack + * of compatibility when extending the table. These are between 0x00 and 0xff. + */ +#define VIDEO_FIRST_MENU 0x0000 + +/* Standard BIOS video modes (BIOS number + 0x0100) */ +#define VIDEO_FIRST_BIOS 0x0100 + +/* VESA BIOS video modes (VESA number + 0x0200) */ +#define VIDEO_FIRST_VESA 0x0200 + +/* Video7 special modes (BIOS number + 0x0900) */ +#define VIDEO_FIRST_V7 0x0900 + +/* Special video modes */ +#define VIDEO_FIRST_SPECIAL 0x0f00 +#define VIDEO_80x25 0x0f00 +#define VIDEO_8POINT 0x0f01 +#define VIDEO_80x43 0x0f02 +#define VIDEO_80x28 0x0f03 +#define VIDEO_CURRENT_MODE 0x0f04 +#define VIDEO_80x30 0x0f05 +#define VIDEO_80x34 0x0f06 +#define VIDEO_80x60 0x0f07 +#define VIDEO_GFX_HACK 0x0f08 +#define VIDEO_LAST_SPECIAL 0x0f09 + +/* Video modes given by resolution */ +#define VIDEO_FIRST_RESOLUTION 0x1000 + +/* The "recalculate timings" flag */ +#define VIDEO_RECALC 0x8000 + +/* Define DO_STORE according to CONFIG_VIDEO_RETAIN */ +#ifdef CONFIG_VIDEO_RETAIN +void store_screen(void); +#define DO_STORE() store_screen() +#else +#define DO_STORE() ((void)0) +#endif /* CONFIG_VIDEO_RETAIN */ + +/* + * Mode table structures + */ + +struct mode_info { + u16 mode; /* Mode number (vga= style) */ + u8 x, y; /* Width, height */ +}; + +struct card_info { + const char *card_name; + int (*set_mode)(struct mode_info *mode); + int (*probe)(void); + struct mode_info *modes; + int nmodes; /* Number of probed modes so far */ + int unsafe; /* Probing is unsafe, only do after "scan" */ + u16 xmode_first; /* Unprobed modes to try to call anyway */ + u16 xmode_n; /* Size of unprobed mode range */ +}; + +#define __videocard struct card_info __attribute__((section(".videocards"))) +extern struct card_info video_cards[], video_cards_end[]; + +int mode_defined(u16 mode); /* video.c */ + +/* Basic video information */ +#define ADAPTER_CGA 0 /* CGA/MDA/HGC */ +#define ADAPTER_EGA 1 +#define ADAPTER_VGA 2 + +extern int adapter; +extern u16 video_segment; +extern int force_x, force_y; /* Don't query the BIOS for cols/rows */ +extern int do_restore; /* Restore screen contents */ +extern int graphic_mode; /* Graphics mode with linear frame buffer */ + +/* + * int $0x10 is notorious for touching registers it shouldn't. + * gcc doesn't like %ebp being clobbered, so define it as a push/pop + * sequence here. + * + * A number of systems, including the original PC can clobber %bp in + * certain circumstances, like when scrolling. There exists at least + * one Trident video card which could clobber DS under a set of + * circumstances that we are unlikely to encounter (scrolling when + * using an extended graphics mode of more than 800x600 pixels), but + * it's cheap insurance to deal with that here. + */ +#define INT10 "pushl %%ebp; pushw %%ds; int $0x10; popw %%ds; popl %%ebp" + +/* Accessing VGA indexed registers */ +static inline u8 in_idx(u16 port, u8 index) +{ + outb(index, port); + return inb(port+1); +} + +static inline void out_idx(u8 v, u16 port, u8 index) +{ + outw(index+(v << 8), port); +} + +/* Writes a value to an indexed port and then reads the port again */ +static inline u8 tst_idx(u8 v, u16 port, u8 index) +{ + out_idx(port, index, v); + return in_idx(port, index); +} + +/* Get the I/O port of the VGA CRTC */ +u16 vga_crtc(void); /* video-vga.c */ + +#endif /* BOOT_VIDEO_H */ diff --git a/arch/i386/boot/voyager.c b/arch/i386/boot/voyager.c new file mode 100644 index 00000000000..61c8fe0453b --- /dev/null +++ b/arch/i386/boot/voyager.c @@ -0,0 +1,46 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright 2007 rPath, Inc. - All Rights Reserved + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + * + * ----------------------------------------------------------------------- */ + +/* + * arch/i386/boot/voyager.c + * + * Get the Voyager config information + */ + +#include "boot.h" + +#ifdef CONFIG_X86_VOYAGER + +int query_voyager(void) +{ + u8 err; + u16 es, di; + /* Abuse the apm_bios_info area for this */ + u8 *data_ptr = (u8 *)&boot_params.apm_bios_info; + + data_ptr[0] = 0xff; /* Flag on config not found(?) */ + + asm("pushw %%es ; " + "int $0x15 ; " + "setc %0 ; " + "movw %%es, %1 ; " + "popw %%es" + : "=q" (err), "=r" (es), "=D" (di) + : "a" (0xffc0)); + + if (err) + return -1; /* Not Voyager */ + + set_fs(es); + copy_from_fs(data_ptr, di, 7); /* Table is 7 bytes apparently */ + return 0; +} + +#endif /* CONFIG_X86_VOYAGER */ diff --git a/arch/i386/defconfig b/arch/i386/defconfig index 1a3a2217b7c..54ee1764fda 100644 --- a/arch/i386/defconfig +++ b/arch/i386/defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.22-rc2 -# Mon May 21 13:23:44 2007 +# Linux kernel version: 2.6.22-git14 +# Fri Jul 20 09:53:15 2007 # CONFIG_X86_32=y CONFIG_GENERIC_TIME=y @@ -37,19 +37,18 @@ CONFIG_LOCALVERSION="" CONFIG_LOCALVERSION_AUTO=y CONFIG_SWAP=y CONFIG_SYSVIPC=y -# CONFIG_IPC_NS is not set CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y # CONFIG_BSD_PROCESS_ACCT is not set # CONFIG_TASKSTATS is not set -# CONFIG_UTS_NS is not set +# CONFIG_USER_NS is not set # CONFIG_AUDIT is not set CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=18 # CONFIG_CPUSETS is not set CONFIG_SYSFS_DEPRECATED=y -# CONFIG_RELAY is not set +CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y CONFIG_INITRAMFS_SOURCE="" CONFIG_CC_OPTIMIZE_FOR_SIZE=y @@ -73,16 +72,13 @@ CONFIG_TIMERFD=y CONFIG_EVENTFD=y CONFIG_SHMEM=y CONFIG_VM_EVENT_COUNTERS=y -CONFIG_SLAB=y -# CONFIG_SLUB is not set +CONFIG_SLUB_DEBUG=y +# CONFIG_SLAB is not set +CONFIG_SLUB=y # CONFIG_SLOB is not set CONFIG_RT_MUTEXES=y # CONFIG_TINY_SHMEM is not set CONFIG_BASE_SMALL=0 - -# -# Loadable module support -# CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y @@ -90,14 +86,11 @@ CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_MODULE_SRCVERSION_ALL is not set # CONFIG_KMOD is not set CONFIG_STOP_MACHINE=y - -# -# Block layer -# CONFIG_BLOCK=y CONFIG_LBD=y # CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_LSF is not set +# CONFIG_BLK_DEV_BSG is not set # # IO Schedulers @@ -166,7 +159,6 @@ CONFIG_X86_WP_WORKS_OK=y CONFIG_X86_INVLPG=y CONFIG_X86_BSWAP=y CONFIG_X86_POPAD_OK=y -CONFIG_X86_CMPXCHG64=y CONFIG_X86_GOOD_APIC=y CONFIG_X86_INTEL_USERCOPY=y CONFIG_X86_USE_PPRO_CHECKSUM=y @@ -202,6 +194,7 @@ CONFIG_X86_CPUID=y # CONFIG_EDD is not set # CONFIG_DELL_RBU is not set # CONFIG_DCDBAS is not set +CONFIG_DMIID=y # CONFIG_NOHIGHMEM is not set CONFIG_HIGHMEM4G=y # CONFIG_HIGHMEM64G is not set @@ -218,7 +211,9 @@ CONFIG_FLAT_NODE_MEM_MAP=y CONFIG_SPLIT_PTLOCK_CPUS=4 CONFIG_RESOURCES_64BIT=y CONFIG_ZONE_DMA_FLAG=1 +CONFIG_BOUNCE=y CONFIG_NR_QUICK=1 +CONFIG_VIRT_TO_BUS=y # CONFIG_HIGHPTE is not set # CONFIG_MATH_EMULATION is not set CONFIG_MTRR=y @@ -245,7 +240,6 @@ CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y CONFIG_PM=y CONFIG_PM_LEGACY=y # CONFIG_PM_DEBUG is not set -# CONFIG_PM_SYSFS_DEPRECATED is not set # # ACPI (Advanced Configuration and Power Interface) Support @@ -285,7 +279,7 @@ CONFIG_CPU_FREQ_GOV_PERFORMANCE=y # CONFIG_CPU_FREQ_GOV_POWERSAVE is not set CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_GOV_ONDEMAND=y -# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set +CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y # # CPUFreq processor drivers @@ -326,7 +320,7 @@ CONFIG_PCI_MMCONFIG=y CONFIG_ARCH_SUPPORTS_MSI=y CONFIG_PCI_MSI=y # CONFIG_PCI_DEBUG is not set -CONFIG_HT_IRQ=y +# CONFIG_HT_IRQ is not set CONFIG_ISA_DMA_API=y # CONFIG_ISA is not set # CONFIG_MCA is not set @@ -382,7 +376,7 @@ CONFIG_IP_PNP_DHCP=y CONFIG_INET_TUNNEL=y CONFIG_INET_XFRM_MODE_TRANSPORT=y CONFIG_INET_XFRM_MODE_TUNNEL=y -CONFIG_INET_XFRM_MODE_BEET=y +# CONFIG_INET_XFRM_MODE_BEET is not set CONFIG_INET_DIAG=y CONFIG_INET_TCP_DIAG=y # CONFIG_TCP_CONG_ADVANCED is not set @@ -401,27 +395,15 @@ CONFIG_IPV6=y # CONFIG_INET6_TUNNEL is not set CONFIG_INET6_XFRM_MODE_TRANSPORT=y CONFIG_INET6_XFRM_MODE_TUNNEL=y -CONFIG_INET6_XFRM_MODE_BEET=y +# CONFIG_INET6_XFRM_MODE_BEET is not set # CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set CONFIG_IPV6_SIT=y # CONFIG_IPV6_TUNNEL is not set # CONFIG_IPV6_MULTIPLE_TABLES is not set # CONFIG_NETWORK_SECMARK is not set # CONFIG_NETFILTER is not set - -# -# DCCP Configuration (EXPERIMENTAL) -# # CONFIG_IP_DCCP is not set - -# -# SCTP Configuration (EXPERIMENTAL) -# # CONFIG_IP_SCTP is not set - -# -# TIPC Configuration (EXPERIMENTAL) -# # CONFIG_TIPC is not set # CONFIG_ATM is not set # CONFIG_BRIDGE is not set @@ -458,6 +440,7 @@ CONFIG_IPV6_SIT=y # CONFIG_MAC80211 is not set # CONFIG_IEEE80211 is not set # CONFIG_RFKILL is not set +# CONFIG_NET_9P is not set # # Device Drivers @@ -472,21 +455,9 @@ CONFIG_FW_LOADER=y # CONFIG_DEBUG_DRIVER is not set # CONFIG_DEBUG_DEVRES is not set # CONFIG_SYS_HYPERVISOR is not set - -# -# Connector - unified userspace <-> kernelspace linker -# # CONFIG_CONNECTOR is not set # CONFIG_MTD is not set - -# -# Parallel port support -# # CONFIG_PARPORT is not set - -# -# Plug and Play support -# CONFIG_PNP=y # CONFIG_PNP_DEBUG is not set @@ -494,10 +465,7 @@ CONFIG_PNP=y # Protocols # CONFIG_PNPACPI=y - -# -# Block devices -# +CONFIG_BLK_DEV=y CONFIG_BLK_DEV_FD=y # CONFIG_BLK_CPQ_DA is not set # CONFIG_BLK_CPQ_CISS_DA is not set @@ -515,17 +483,14 @@ CONFIG_BLK_DEV_RAM_SIZE=4096 CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024 # CONFIG_CDROM_PKTCDVD is not set # CONFIG_ATA_OVER_ETH is not set - -# -# Misc devices -# +CONFIG_MISC_DEVICES=y # CONFIG_IBM_ASM is not set # CONFIG_PHANTOM is not set +# CONFIG_EEPROM_93CX6 is not set # CONFIG_SGI_IOC4 is not set # CONFIG_TIFM_CORE is not set # CONFIG_SONY_LAPTOP is not set # CONFIG_THINKPAD_ACPI is not set -# CONFIG_BLINK is not set CONFIG_IDE=y CONFIG_BLK_DEV_IDE=y @@ -597,6 +562,7 @@ CONFIG_BLK_DEV_IDEDMA=y # # CONFIG_RAID_ATTRS is not set CONFIG_SCSI=y +CONFIG_SCSI_DMA=y # CONFIG_SCSI_TGT is not set CONFIG_SCSI_NETLINK=y # CONFIG_SCSI_PROC_FS is not set @@ -607,8 +573,9 @@ CONFIG_SCSI_NETLINK=y CONFIG_BLK_DEV_SD=y # CONFIG_CHR_DEV_ST is not set # CONFIG_CHR_DEV_OSST is not set -# CONFIG_BLK_DEV_SR is not set -# CONFIG_CHR_DEV_SG is not set +CONFIG_BLK_DEV_SR=y +# CONFIG_BLK_DEV_SR_VENDOR is not set +CONFIG_CHR_DEV_SG=y # CONFIG_CHR_DEV_SCH is not set # @@ -668,6 +635,7 @@ CONFIG_AIC79XX_DEBUG_MASK=0 # CONFIG_SCSI_INIA100 is not set # CONFIG_SCSI_STEX is not set # CONFIG_SCSI_SYM53C8XX_2 is not set +# CONFIG_SCSI_IPR is not set # CONFIG_SCSI_QLOGIC_1280 is not set # CONFIG_SCSI_QLA_FC is not set # CONFIG_SCSI_QLA_ISCSI is not set @@ -676,14 +644,73 @@ CONFIG_AIC79XX_DEBUG_MASK=0 # CONFIG_SCSI_DC390T is not set # CONFIG_SCSI_NSP32 is not set # CONFIG_SCSI_DEBUG is not set -# CONFIG_SCSI_ESP_CORE is not set # CONFIG_SCSI_SRP is not set -# CONFIG_ATA is not set - -# -# Multi-device support (RAID and LVM) -# -# CONFIG_MD is not set +CONFIG_ATA=y +# CONFIG_ATA_NONSTANDARD is not set +CONFIG_ATA_ACPI=y +CONFIG_SATA_AHCI=y +CONFIG_SATA_SVW=y +CONFIG_ATA_PIIX=y +# CONFIG_SATA_MV is not set +CONFIG_SATA_NV=y +# CONFIG_PDC_ADMA is not set +# CONFIG_SATA_QSTOR is not set +# CONFIG_SATA_PROMISE is not set +# CONFIG_SATA_SX4 is not set +CONFIG_SATA_SIL=y +# CONFIG_SATA_SIL24 is not set +# CONFIG_SATA_SIS is not set +# CONFIG_SATA_ULI is not set +CONFIG_SATA_VIA=y +# CONFIG_SATA_VITESSE is not set +# CONFIG_SATA_INIC162X is not set +# CONFIG_PATA_ALI is not set +# CONFIG_PATA_AMD is not set +# CONFIG_PATA_ARTOP is not set +# CONFIG_PATA_ATIIXP is not set +# CONFIG_PATA_CMD640_PCI is not set +# CONFIG_PATA_CMD64X is not set +# CONFIG_PATA_CS5520 is not set +# CONFIG_PATA_CS5530 is not set +# CONFIG_PATA_CS5535 is not set +# CONFIG_PATA_CYPRESS is not set +# CONFIG_PATA_EFAR is not set +# CONFIG_ATA_GENERIC is not set +# CONFIG_PATA_HPT366 is not set +# CONFIG_PATA_HPT37X is not set +# CONFIG_PATA_HPT3X2N is not set +# CONFIG_PATA_HPT3X3 is not set +# CONFIG_PATA_IT821X is not set +# CONFIG_PATA_IT8213 is not set +# CONFIG_PATA_JMICRON is not set +# CONFIG_PATA_TRIFLEX is not set +# CONFIG_PATA_MARVELL is not set +# CONFIG_PATA_MPIIX is not set +# CONFIG_PATA_OLDPIIX is not set +# CONFIG_PATA_NETCELL is not set +# CONFIG_PATA_NS87410 is not set +# CONFIG_PATA_OPTI is not set +# CONFIG_PATA_OPTIDMA is not set +# CONFIG_PATA_PDC_OLD is not set +# CONFIG_PATA_RADISYS is not set +# CONFIG_PATA_RZ1000 is not set +# CONFIG_PATA_SC1200 is not set +# CONFIG_PATA_SERVERWORKS is not set +# CONFIG_PATA_PDC2027X is not set +# CONFIG_PATA_SIL680 is not set +# CONFIG_PATA_SIS is not set +# CONFIG_PATA_VIA is not set +# CONFIG_PATA_WINBOND is not set +CONFIG_MD=y +# CONFIG_BLK_DEV_MD is not set +CONFIG_BLK_DEV_DM=y +# CONFIG_DM_DEBUG is not set +# CONFIG_DM_CRYPT is not set +# CONFIG_DM_SNAPSHOT is not set +# CONFIG_DM_MIRROR is not set +# CONFIG_DM_ZERO is not set +# CONFIG_DM_MULTIPATH is not set +# CONFIG_DM_DELAY is not set # # Fusion MPT device support @@ -724,42 +751,27 @@ CONFIG_IEEE1394_OHCI1394=y # CONFIG_IEEE1394_ETH1394 is not set # CONFIG_IEEE1394_DV1394 is not set CONFIG_IEEE1394_RAWIO=y - -# -# I2O device support -# # CONFIG_I2O is not set -# CONFIG_MACINTOSH_DRIVERS is not set - -# -# Network device support -# +CONFIG_MACINTOSH_DRIVERS=y +# CONFIG_MAC_EMUMOUSEBTN is not set CONFIG_NETDEVICES=y +CONFIG_NETDEVICES_MULTIQUEUE=y # CONFIG_DUMMY is not set # CONFIG_BONDING is not set +# CONFIG_MACVLAN is not set # CONFIG_EQUALIZER is not set # CONFIG_TUN is not set # CONFIG_NET_SB1000 is not set - -# -# ARCnet devices -# # CONFIG_ARCNET is not set # CONFIG_PHYLIB is not set - -# -# Ethernet (10 or 100Mbit) -# CONFIG_NET_ETHERNET=y CONFIG_MII=y # CONFIG_HAPPYMEAL is not set # CONFIG_SUNGEM is not set # CONFIG_CASSINI is not set -# CONFIG_NET_VENDOR_3COM is not set - -# -# Tulip family network device support -# +CONFIG_NET_VENDOR_3COM=y +CONFIG_VORTEX=y +# CONFIG_TYPHOON is not set CONFIG_NET_TULIP=y # CONFIG_DE2104X is not set CONFIG_TULIP=y @@ -810,7 +822,6 @@ CONFIG_R8169=y # CONFIG_SIS190 is not set # CONFIG_SKGE is not set CONFIG_SKY2=y -# CONFIG_SK98LIN is not set # CONFIG_VIA_VELOCITY is not set CONFIG_TIGON3=y CONFIG_BNX2=y @@ -824,10 +835,6 @@ CONFIG_NETDEV_10000=y # CONFIG_MYRI10GE is not set # CONFIG_NETXEN_NIC is not set # CONFIG_MLX4_CORE is not set - -# -# Token Ring devices -# # CONFIG_TR is not set # @@ -856,15 +863,7 @@ CONFIG_NETCONSOLE=y CONFIG_NETPOLL=y # CONFIG_NETPOLL_TRAP is not set CONFIG_NET_POLL_CONTROLLER=y - -# -# ISDN subsystem -# # CONFIG_ISDN is not set - -# -# Telephony Support -# # CONFIG_PHONE is not set # @@ -872,6 +871,7 @@ CONFIG_NET_POLL_CONTROLLER=y # CONFIG_INPUT=y # CONFIG_INPUT_FF_MEMLESS is not set +# CONFIG_INPUT_POLLDEV is not set # # Userland interfaces @@ -937,6 +937,7 @@ CONFIG_HW_CONSOLE=y # CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_FIX_EARLYCON_MEM=y CONFIG_SERIAL_8250_PCI=y CONFIG_SERIAL_8250_PNP=y CONFIG_SERIAL_8250_NR_UARTS=4 @@ -952,10 +953,6 @@ CONFIG_SERIAL_CORE_CONSOLE=y CONFIG_UNIX98_PTYS=y CONFIG_LEGACY_PTYS=y CONFIG_LEGACY_PTY_COUNT=256 - -# -# IPMI -# # CONFIG_IPMI_HANDLER is not set # CONFIG_WATCHDOG is not set CONFIG_HW_RANDOM=y @@ -989,11 +986,7 @@ CONFIG_MAX_RAW_DEVS=256 CONFIG_HPET=y # CONFIG_HPET_RTC_IRQ is not set CONFIG_HPET_MMAP=y -CONFIG_HANGCHECK_TIMER=y - -# -# TPM devices -# +# CONFIG_HANGCHECK_TIMER is not set # CONFIG_TCG_TPM is not set # CONFIG_TELCLOCK is not set CONFIG_DEVPORT=y @@ -1004,11 +997,8 @@ CONFIG_DEVPORT=y # # CONFIG_SPI is not set # CONFIG_SPI_MASTER is not set - -# -# Dallas's 1-wire bus -# # CONFIG_W1 is not set +# CONFIG_POWER_SUPPLY is not set # CONFIG_HWMON is not set # @@ -1042,7 +1032,7 @@ CONFIG_DAB=y CONFIG_VGA_CONSOLE=y CONFIG_VGACON_SOFT_SCROLLBACK=y CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=128 -# CONFIG_VIDEO_SELECT is not set +CONFIG_VIDEO_SELECT=y CONFIG_DUMMY_CONSOLE=y # @@ -1059,15 +1049,11 @@ CONFIG_SOUND=y # Open Sound System # CONFIG_SOUND_PRIME=y -# CONFIG_OSS_OBSOLETE is not set # CONFIG_SOUND_TRIDENT is not set # CONFIG_SOUND_MSNDCLAS is not set # CONFIG_SOUND_MSNDPIN is not set # CONFIG_SOUND_OSS is not set - -# -# HID Devices -# +CONFIG_HID_SUPPORT=y CONFIG_HID=y # CONFIG_HID_DEBUG is not set @@ -1078,10 +1064,7 @@ CONFIG_USB_HID=y # CONFIG_USB_HIDINPUT_POWERBOOK is not set # CONFIG_HID_FF is not set # CONFIG_USB_HIDDEV is not set - -# -# USB support -# +CONFIG_USB_SUPPORT=y CONFIG_USB_ARCH_HAS_HCD=y CONFIG_USB_ARCH_HAS_OHCI=y CONFIG_USB_ARCH_HAS_EHCI=y @@ -1095,6 +1078,7 @@ CONFIG_USB_DEVICEFS=y # CONFIG_USB_DEVICE_CLASS is not set # CONFIG_USB_DYNAMIC_MINORS is not set # CONFIG_USB_SUSPEND is not set +# CONFIG_USB_PERSIST is not set # CONFIG_USB_OTG is not set # @@ -1104,7 +1088,6 @@ CONFIG_USB_EHCI_HCD=y # CONFIG_USB_EHCI_SPLIT_ISO is not set # CONFIG_USB_EHCI_ROOT_HUB_TT is not set # CONFIG_USB_EHCI_TT_NEWSCHED is not set -# CONFIG_USB_EHCI_BIG_ENDIAN_MMIO is not set # CONFIG_USB_ISP116X_HCD is not set CONFIG_USB_OHCI_HCD=y # CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set @@ -1112,6 +1095,7 @@ CONFIG_USB_OHCI_HCD=y CONFIG_USB_OHCI_LITTLE_ENDIAN=y CONFIG_USB_UHCI_HCD=y # CONFIG_USB_SL811_HCD is not set +# CONFIG_USB_R8A66597_HCD is not set # # USB Device Class drivers @@ -1202,15 +1186,7 @@ CONFIG_USB_MON=y # # LED Triggers # - -# -# InfiniBand support -# # CONFIG_INFINIBAND is not set - -# -# EDAC - error detection and reporting (RAS) (EXPERIMENTAL) -# # CONFIG_EDAC is not set # @@ -1230,11 +1206,13 @@ CONFIG_USB_MON=y # # DMA Devices # +CONFIG_VIRTUALIZATION=y +# CONFIG_KVM is not set # -# Virtualization +# Userspace I/O # -# CONFIG_KVM is not set +# CONFIG_UIO is not set # # File systems @@ -1272,6 +1250,7 @@ CONFIG_DNOTIFY=y # CONFIG_AUTOFS_FS is not set CONFIG_AUTOFS4_FS=y # CONFIG_FUSE_FS is not set +CONFIG_GENERIC_ACL=y # # CD-ROM/DVD Filesystems @@ -1299,7 +1278,7 @@ CONFIG_PROC_KCORE=y CONFIG_PROC_SYSCTL=y CONFIG_SYSFS=y CONFIG_TMPFS=y -# CONFIG_TMPFS_POSIX_ACL is not set +CONFIG_TMPFS_POSIX_ACL=y CONFIG_HUGETLBFS=y CONFIG_HUGETLB_PAGE=y CONFIG_RAMFS=y @@ -1349,7 +1328,6 @@ CONFIG_SUNRPC=y # CONFIG_NCP_FS is not set # CONFIG_CODA_FS is not set # CONFIG_AFS_FS is not set -# CONFIG_9P_FS is not set # # Partition Types @@ -1405,10 +1383,7 @@ CONFIG_NLS_UTF8=y # Distributed Lock Manager # # CONFIG_DLM is not set - -# -# Instrumentation Support -# +CONFIG_INSTRUMENTATION=y CONFIG_PROFILING=y CONFIG_OPROFILE=y CONFIG_KPROBES=y @@ -1418,7 +1393,7 @@ CONFIG_KPROBES=y # CONFIG_TRACE_IRQFLAGS_SUPPORT=y # CONFIG_PRINTK_TIME is not set -CONFIG_ENABLE_MUST_CHECK=y +# CONFIG_ENABLE_MUST_CHECK is not set CONFIG_MAGIC_SYSRQ=y CONFIG_UNUSED_SYMBOLS=y # CONFIG_DEBUG_FS is not set @@ -1426,15 +1401,17 @@ CONFIG_UNUSED_SYMBOLS=y CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_SHIRQ is not set CONFIG_DETECT_SOFTLOCKUP=y +# CONFIG_SCHED_DEBUG is not set # CONFIG_SCHEDSTATS is not set -# CONFIG_TIMER_STATS is not set -# CONFIG_DEBUG_SLAB is not set +CONFIG_TIMER_STATS=y +# CONFIG_SLUB_DEBUG_ON is not set # CONFIG_DEBUG_RT_MUTEXES is not set # CONFIG_RT_MUTEX_TESTER is not set # CONFIG_DEBUG_SPINLOCK is not set # CONFIG_DEBUG_MUTEXES is not set # CONFIG_DEBUG_LOCK_ALLOC is not set # CONFIG_PROVE_LOCKING is not set +# CONFIG_LOCK_STAT is not set # CONFIG_DEBUG_SPINLOCK_SLEEP is not set # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set # CONFIG_DEBUG_KOBJECT is not set @@ -1444,7 +1421,6 @@ CONFIG_DEBUG_BUGVERBOSE=y # CONFIG_DEBUG_VM is not set # CONFIG_DEBUG_LIST is not set # CONFIG_FRAME_POINTER is not set -# CONFIG_UNWIND_INFO is not set # CONFIG_FORCED_INLINING is not set # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_LKDTM is not set @@ -1463,10 +1439,6 @@ CONFIG_DOUBLEFAULT=y # # CONFIG_KEYS is not set # CONFIG_SECURITY is not set - -# -# Cryptographic options -# # CONFIG_CRYPTO is not set # @@ -1477,6 +1449,7 @@ CONFIG_BITREVERSE=y # CONFIG_CRC16 is not set # CONFIG_CRC_ITU_T is not set CONFIG_CRC32=y +# CONFIG_CRC7 is not set # CONFIG_LIBCRC32C is not set CONFIG_ZLIB_INFLATE=y CONFIG_PLIST=y diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 06da59f6f83..dbe5e87e0d6 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -40,6 +40,7 @@ obj-$(CONFIG_VM86) += vm86.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_K8_NB) += k8.o +obj-$(CONFIG_MGEODE_LX) += geode.o obj-$(CONFIG_VMI) += vmi.o vmiclock.o obj-$(CONFIG_PARAVIRT) += paravirt.o diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index a574cd2c8b6..cacdd883bf2 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -618,6 +618,8 @@ static int __init acpi_parse_sbf(struct acpi_table_header *table) #ifdef CONFIG_HPET_TIMER #include <asm/hpet.h> +static struct __initdata resource *hpet_res; + static int __init acpi_parse_hpet(struct acpi_table_header *table) { struct acpi_table_hpet *hpet_tbl; @@ -638,8 +640,42 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table) printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", hpet_tbl->id, hpet_address); + /* + * Allocate and initialize the HPET firmware resource for adding into + * the resource tree during the lateinit timeframe. + */ +#define HPET_RESOURCE_NAME_SIZE 9 + hpet_res = alloc_bootmem(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE); + + if (!hpet_res) + return 0; + + memset(hpet_res, 0, sizeof(*hpet_res)); + hpet_res->name = (void *)&hpet_res[1]; + hpet_res->flags = IORESOURCE_MEM; + snprintf((char *)hpet_res->name, HPET_RESOURCE_NAME_SIZE, "HPET %u", + hpet_tbl->sequence); + + hpet_res->start = hpet_address; + hpet_res->end = hpet_address + (1 * 1024) - 1; + return 0; } + +/* + * hpet_insert_resource inserts the HPET resources used into the resource + * tree. + */ +static __init int hpet_insert_resource(void) +{ + if (!hpet_res) + return 1; + + return insert_resource(&iomem_resource, hpet_res); +} + +late_initcall(hpet_insert_resource); + #else #define acpi_parse_hpet NULL #endif @@ -950,14 +986,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { }, { .callback = force_acpi_ht, - .ident = "DELL GX240", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Dell Computer Corporation"), - DMI_MATCH(DMI_BOARD_NAME, "OptiPlex GX240"), - }, - }, - { - .callback = force_acpi_ht, .ident = "HP VISUALIZE NT Workstation", .matches = { DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"), diff --git a/arch/i386/kernel/acpi/sleep.c b/arch/i386/kernel/acpi/sleep.c index 4ee83577bf6..c42b5ab49de 100644 --- a/arch/i386/kernel/acpi/sleep.c +++ b/arch/i386/kernel/acpi/sleep.c @@ -14,7 +14,7 @@ /* address in low memory of the wakeup routine. */ unsigned long acpi_wakeup_address = 0; -unsigned long acpi_video_flags; +unsigned long acpi_realmode_flags; extern char wakeup_start, wakeup_end; extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long)); @@ -68,9 +68,11 @@ static int __init acpi_sleep_setup(char *str) { while ((str != NULL) && (*str != '\0')) { if (strncmp(str, "s3_bios", 7) == 0) - acpi_video_flags = 1; + acpi_realmode_flags |= 1; if (strncmp(str, "s3_mode", 7) == 0) - acpi_video_flags |= 2; + acpi_realmode_flags |= 2; + if (strncmp(str, "s3_beep", 7) == 0) + acpi_realmode_flags |= 4; str = strchr(str, ','); if (str != NULL) str += strspn(str, ", \t"); @@ -80,9 +82,11 @@ static int __init acpi_sleep_setup(char *str) __setup("acpi_sleep=", acpi_sleep_setup); +/* Ouch, we want to delete this. We already have better version in userspace, in + s2ram from suspend.sf.net project */ static __init int reset_videomode_after_s3(struct dmi_system_id *d) { - acpi_video_flags |= 2; + acpi_realmode_flags |= 2; return 0; } diff --git a/arch/i386/kernel/acpi/wakeup.S b/arch/i386/kernel/acpi/wakeup.S index a2295a34b2c..ed0a0f2c159 100644 --- a/arch/i386/kernel/acpi/wakeup.S +++ b/arch/i386/kernel/acpi/wakeup.S @@ -13,6 +13,21 @@ # cs = 0x1234, eip = 0x05 # +#define BEEP \ + inb $97, %al; \ + outb %al, $0x80; \ + movb $3, %al; \ + outb %al, $97; \ + outb %al, $0x80; \ + movb $-74, %al; \ + outb %al, $67; \ + outb %al, $0x80; \ + movb $-119, %al; \ + outb %al, $66; \ + outb %al, $0x80; \ + movb $15, %al; \ + outb %al, $66; + ALIGN .align 4096 ENTRY(wakeup_start) @@ -31,6 +46,11 @@ wakeup_code: movw %cs, %ax movw %ax, %ds # Make ds:0 point to wakeup_start movw %ax, %ss + + testl $4, realmode_flags - wakeup_code + jz 1f + BEEP +1: mov $(wakeup_stack - wakeup_code), %sp # Private stack is needed for ASUS board movw $0x0e00 + 'S', %fs:(0x12) @@ -41,7 +61,7 @@ wakeup_code: cmpl $0x12345678, %eax jne bogus_real_magic - testl $1, video_flags - wakeup_code + testl $1, realmode_flags - wakeup_code jz 1f lcall $0xc000,$3 movw %cs, %ax @@ -49,7 +69,7 @@ wakeup_code: movw %ax, %ss 1: - testl $2, video_flags - wakeup_code + testl $2, realmode_flags - wakeup_code jz 1f mov video_mode - wakeup_code, %ax call mode_set @@ -88,7 +108,11 @@ wakeup_code: cmpl $0x12345678, %eax jne bogus_real_magic - ljmpl $__KERNEL_CS,$wakeup_pmode_return + testl $8, realmode_flags - wakeup_code + jz 1f + BEEP +1: + ljmpl $__KERNEL_CS, $wakeup_pmode_return real_save_gdt: .word 0 .long 0 @@ -97,7 +121,8 @@ real_save_cr3: .long 0 real_save_cr4: .long 0 real_magic: .long 0 video_mode: .long 0 -video_flags: .long 0 +realmode_flags: .long 0 +beep_flags: .long 0 real_efer_save_restore: .long 0 real_save_efer_edx: .long 0 real_save_efer_eax: .long 0 @@ -260,8 +285,8 @@ ENTRY(acpi_copy_wakeup_routine) movl saved_videomode, %edx movl %edx, video_mode - wakeup_start (%eax) - movl acpi_video_flags, %edx - movl %edx, video_flags - wakeup_start (%eax) + movl acpi_realmode_flags, %edx + movl %edx, realmode_flags - wakeup_start (%eax) movl $0x12345678, real_magic - wakeup_start (%eax) movl $0x12345678, saved_magic popl %ebx diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c index d8cda14fff8..c3750c2c411 100644 --- a/arch/i386/kernel/alternative.c +++ b/arch/i386/kernel/alternative.c @@ -2,12 +2,17 @@ #include <linux/sched.h> #include <linux/spinlock.h> #include <linux/list.h> +#include <linux/kprobes.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> #include <asm/alternative.h> #include <asm/sections.h> +#include <asm/pgtable.h> +#include <asm/mce.h> +#include <asm/nmi.h> -static int noreplace_smp = 0; -static int smp_alt_once = 0; -static int debug_alternative = 0; +#ifdef CONFIG_HOTPLUG_CPU +static int smp_alt_once; static int __init bootonly(char *str) { @@ -15,6 +20,11 @@ static int __init bootonly(char *str) return 1; } __setup("smp-alt-boot", bootonly); +#else +#define smp_alt_once 1 +#endif + +static int debug_alternative; static int __init debug_alt(char *str) { @@ -23,6 +33,8 @@ static int __init debug_alt(char *str) } __setup("debug-alternative", debug_alt); +static int noreplace_smp; + static int __init setup_noreplace_smp(char *str) { noreplace_smp = 1; @@ -144,7 +156,7 @@ static void nop_out(void *insns, unsigned int len) unsigned int noplen = len; if (noplen > ASM_NOP_MAX) noplen = ASM_NOP_MAX; - memcpy(insns, noptable[noplen], noplen); + text_poke(insns, noptable[noplen], noplen); insns += noplen; len -= noplen; } @@ -196,7 +208,7 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end) continue; if (*ptr > text_end) continue; - **ptr = 0xf0; /* lock prefix */ + text_poke(*ptr, ((unsigned char []){0xf0}), 1); /* add lock prefix */ }; } @@ -354,10 +366,6 @@ void apply_paravirt(struct paravirt_patch_site *start, /* Pad the rest with nops */ nop_out(p->instr + used, p->len - used); } - - /* Sync to be conservative, in case we patched following - * instructions */ - sync_core(); } extern struct paravirt_patch_site __start_parainstructions[], __stop_parainstructions[]; @@ -367,6 +375,14 @@ void __init alternative_instructions(void) { unsigned long flags; + /* The patching is not fully atomic, so try to avoid local interruptions + that might execute the to be patched code. + Other CPUs are not running. */ + stop_nmi(); +#ifdef CONFIG_MCE + stop_mce(); +#endif + local_irq_save(flags); apply_alternatives(__alt_instructions, __alt_instructions_end); @@ -376,8 +392,6 @@ void __init alternative_instructions(void) #ifdef CONFIG_HOTPLUG_CPU if (num_possible_cpus() < 2) smp_alt_once = 1; -#else - smp_alt_once = 1; #endif #ifdef CONFIG_SMP @@ -401,4 +415,37 @@ void __init alternative_instructions(void) #endif apply_paravirt(__parainstructions, __parainstructions_end); local_irq_restore(flags); + + restart_nmi(); +#ifdef CONFIG_MCE + restart_mce(); +#endif +} + +/* + * Warning: + * When you use this code to patch more than one byte of an instruction + * you need to make sure that other CPUs cannot execute this code in parallel. + * Also no thread must be currently preempted in the middle of these instructions. + * And on the local CPU you need to be protected again NMI or MCE handlers + * seeing an inconsistent instruction while you patch. + */ +void __kprobes text_poke(void *oaddr, unsigned char *opcode, int len) +{ + u8 *addr = oaddr; + if (!pte_write(*lookup_address((unsigned long)addr))) { + struct page *p[2] = { virt_to_page(addr), virt_to_page(addr+PAGE_SIZE) }; + addr = vmap(p, 2, VM_MAP, PAGE_KERNEL); + if (!addr) + return; + addr += ((unsigned long)oaddr) % PAGE_SIZE; + } + memcpy(addr, opcode, len); + sync_core(); + /* Not strictly needed, but can speed CPU recovery up. Ignore cross cacheline + case. */ + if (cpu_has_clflush) + asm("clflush (%0) " :: "r" (oaddr) : "memory"); + if (addr != oaddr) + vunmap(addr); } diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index 67824f3bb97..bfc6cb7df7e 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -263,6 +263,9 @@ static void lapic_timer_setup(enum clock_event_mode mode, v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); apic_write_around(APIC_LVTT, v); break; + case CLOCK_EVT_MODE_RESUME: + /* Nothing to do here */ + break; } local_irq_restore(flags); @@ -315,7 +318,7 @@ static void __devinit setup_APIC_timer(void) #define LAPIC_CAL_LOOPS (HZ/10) -static __initdata volatile int lapic_cal_loops = -1; +static __initdata int lapic_cal_loops = -1; static __initdata long lapic_cal_t1, lapic_cal_t2; static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2; static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2; @@ -485,7 +488,7 @@ void __init setup_boot_APIC_clock(void) /* Let the interrupts run */ local_irq_enable(); - while(lapic_cal_loops <= LAPIC_CAL_LOOPS) + while (lapic_cal_loops <= LAPIC_CAL_LOOPS) cpu_relax(); local_irq_disable(); @@ -521,6 +524,9 @@ void __init setup_boot_APIC_clock(void) */ if (nmi_watchdog != NMI_IO_APIC) lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; + else + printk(KERN_WARNING "APIC timer registered as dummy," + " due to nmi_watchdog=1!\n"); } /* Setup the lapic or request the broadcast */ diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c index 4112afe712b..47001d50a08 100644 --- a/arch/i386/kernel/apm.c +++ b/arch/i386/kernel/apm.c @@ -222,6 +222,7 @@ #include <linux/capability.h> #include <linux/device.h> #include <linux/kernel.h> +#include <linux/freezer.h> #include <linux/smp.h> #include <linux/dmi.h> #include <linux/suspend.h> @@ -2311,7 +2312,6 @@ static int __init apm_init(void) remove_proc_entry("apm", NULL); return err; } - kapmd_task->flags |= PF_NOFREEZE; wake_up_process(kapmd_task); if (num_online_cpus() > 1 && !smp ) { diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index 27a776c9044..7288ac88d74 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -17,6 +17,13 @@ #include <asm/thread_info.h> #include <asm/elf.h> +#include <xen/interface/xen.h> + +#ifdef CONFIG_LGUEST_GUEST +#include <linux/lguest.h> +#include "../../../drivers/lguest/lg.h" +#endif + #define DEFINE(sym, val) \ asm volatile("\n->" #sym " %0 " #val : : "i" (val)) @@ -59,6 +66,7 @@ void foo(void) OFFSET(TI_addr_limit, thread_info, addr_limit); OFFSET(TI_restart_block, thread_info, restart_block); OFFSET(TI_sysenter_return, thread_info, sysenter_return); + OFFSET(TI_cpu, thread_info, cpu); BLANK(); OFFSET(GDS_size, Xgt_desc_struct, size); @@ -115,4 +123,25 @@ void foo(void) OFFSET(PARAVIRT_iret, paravirt_ops, iret); OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0); #endif + +#ifdef CONFIG_XEN + BLANK(); + OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); + OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); +#endif + +#ifdef CONFIG_LGUEST_GUEST + BLANK(); + OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); + OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); + OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc); + OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3); + OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp); + OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc); + OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc); + OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt); + OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum); + OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); + OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); +#endif } diff --git a/arch/i386/kernel/cpu/Makefile b/arch/i386/kernel/cpu/Makefile index 74f27a463db..778396c78d6 100644 --- a/arch/i386/kernel/cpu/Makefile +++ b/arch/i386/kernel/cpu/Makefile @@ -8,8 +8,7 @@ obj-y += amd.o obj-y += cyrix.o obj-y += centaur.o obj-y += transmeta.o -obj-y += intel.o intel_cacheinfo.o -obj-y += rise.o +obj-y += intel.o intel_cacheinfo.o addon_cpuid_features.o obj-y += nexgen.o obj-y += umc.o diff --git a/arch/i386/kernel/cpu/addon_cpuid_features.c b/arch/i386/kernel/cpu/addon_cpuid_features.c new file mode 100644 index 00000000000..3e91d3ee26e --- /dev/null +++ b/arch/i386/kernel/cpu/addon_cpuid_features.c @@ -0,0 +1,50 @@ + +/* + * Routines to indentify additional cpu features that are scattered in + * cpuid space. + */ + +#include <linux/cpu.h> + +#include <asm/processor.h> + +struct cpuid_bit { + u16 feature; + u8 reg; + u8 bit; + u32 level; +}; + +enum cpuid_regs { + CR_EAX = 0, + CR_ECX, + CR_EDX, + CR_EBX +}; + +void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) +{ + u32 max_level; + u32 regs[4]; + const struct cpuid_bit *cb; + + static const struct cpuid_bit cpuid_bits[] = { + { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, + { 0, 0, 0, 0 } + }; + + for (cb = cpuid_bits; cb->feature; cb++) { + + /* Verify that the level is valid */ + max_level = cpuid_eax(cb->level & 0xffff0000); + if (max_level < cb->level || + max_level > (cb->level | 0xffff)) + continue; + + cpuid(cb->level, ®s[CR_EAX], ®s[CR_EBX], + ®s[CR_ECX], ®s[CR_EDX]); + + if (regs[cb->reg] & (1 << cb->bit)) + set_bit(cb->feature, c->x86_capability); + } +} diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c index 6f47eeeb93e..c7ba455d5ac 100644 --- a/arch/i386/kernel/cpu/amd.c +++ b/arch/i386/kernel/cpu/amd.c @@ -231,6 +231,9 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) switch (c->x86) { case 15: + /* Use K8 tuning for Fam10h and Fam11h */ + case 0x10: + case 0x11: set_bit(X86_FEATURE_K8, c->x86_capability); break; case 6: @@ -272,8 +275,12 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) } #endif - if (cpuid_eax(0x80000000) >= 0x80000006) - num_cache_leaves = 3; + if (cpuid_eax(0x80000000) >= 0x80000006) { + if ((c->x86 == 0x10) && (cpuid_edx(0x80000006) & 0xf000)) + num_cache_leaves = 4; + else + num_cache_leaves = 3; + } if (amd_apic_timer_broken()) set_bit(X86_FEATURE_LAPIC_TIMER_BROKEN, c->x86_capability); diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 794d593c47e..d506201d397 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -353,6 +353,8 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 * c) if ( xlvl >= 0x80000004 ) get_model_name(c); /* Default name */ } + + init_scattered_cpuid_features(c); } early_intel_workaround(c); @@ -604,7 +606,6 @@ extern int nsc_init_cpu(void); extern int amd_init_cpu(void); extern int centaur_init_cpu(void); extern int transmeta_init_cpu(void); -extern int rise_init_cpu(void); extern int nexgen_init_cpu(void); extern int umc_init_cpu(void); @@ -616,7 +617,6 @@ void __init early_cpu_init(void) amd_init_cpu(); centaur_init_cpu(); transmeta_init_cpu(); - rise_init_cpu(); nexgen_init_cpu(); umc_init_cpu(); early_cpu_detect(); diff --git a/arch/i386/kernel/cpu/cpufreq/Kconfig b/arch/i386/kernel/cpu/cpufreq/Kconfig index e912aae9473..094118ba00d 100644 --- a/arch/i386/kernel/cpu/cpufreq/Kconfig +++ b/arch/i386/kernel/cpu/cpufreq/Kconfig @@ -90,10 +90,17 @@ config X86_POWERNOW_K8 If in doubt, say N. config X86_POWERNOW_K8_ACPI - bool - depends on X86_POWERNOW_K8 && ACPI_PROCESSOR - depends on !(X86_POWERNOW_K8 = y && ACPI_PROCESSOR = m) + bool "ACPI Support" + select ACPI_PROCESSOR + depends on X86_POWERNOW_K8 default y + help + This provides access to the K8s Processor Performance States via ACPI. + This driver is probably required for CPUFreq to work with multi-socket and + SMP systems. It is not required on at least some single-socket yet + multi-core systems, even if SMP is enabled. + + It is safe to say Y here. config X86_GX_SUSPMOD tristate "Cyrix MediaGX/NatSemi Geode Suspend Modulation" @@ -109,7 +116,7 @@ config X86_GX_SUSPMOD config X86_SPEEDSTEP_CENTRINO tristate "Intel Enhanced SpeedStep" select CPU_FREQ_TABLE - select X86_SPEEDSTEP_CENTRINO_TABLE if (!X86_SPEEDSTEP_CENTRINO_ACPI) + select X86_SPEEDSTEP_CENTRINO_TABLE help This adds the CPUFreq driver for Enhanced SpeedStep enabled mobile CPUs. This means Intel Pentium M (Centrino) CPUs. However, @@ -121,20 +128,6 @@ config X86_SPEEDSTEP_CENTRINO If in doubt, say N. -config X86_SPEEDSTEP_CENTRINO_ACPI - bool "Use ACPI tables to decode valid frequency/voltage (deprecated)" - depends on X86_SPEEDSTEP_CENTRINO && ACPI_PROCESSOR - depends on !(X86_SPEEDSTEP_CENTRINO = y && ACPI_PROCESSOR = m) - help - This is deprecated and this functionality is now merged into - acpi_cpufreq (X86_ACPI_CPUFREQ). Use that driver instead of - speedstep_centrino. - Use primarily the information provided in the BIOS ACPI tables - to determine valid CPU frequency and voltage pairings. It is - required for the driver to work on non-Banias CPUs. - - If in doubt, say Y. - config X86_SPEEDSTEP_CENTRINO_TABLE bool "Built-in tables for Banias CPUs" depends on X86_SPEEDSTEP_CENTRINO @@ -230,7 +223,7 @@ comment "shared options" config X86_ACPI_CPUFREQ_PROC_INTF bool "/proc/acpi/processor/../performance interface (deprecated)" depends on PROC_FS - depends on X86_ACPI_CPUFREQ || X86_SPEEDSTEP_CENTRINO_ACPI || X86_POWERNOW_K7_ACPI || X86_POWERNOW_K8_ACPI + depends on X86_ACPI_CPUFREQ || X86_POWERNOW_K7_ACPI || X86_POWERNOW_K8_ACPI help This enables the deprecated /proc/acpi/processor/../performance interface. While it is helpful for debugging, the generic, diff --git a/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c index 10baa3501ed..6f846bee210 100644 --- a/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -167,11 +167,13 @@ static void do_drv_read(struct drv_cmd *cmd) static void do_drv_write(struct drv_cmd *cmd) { - u32 h = 0; + u32 lo, hi; switch (cmd->type) { case SYSTEM_INTEL_MSR_CAPABLE: - wrmsr(cmd->addr.msr.reg, cmd->val, h); + rdmsr(cmd->addr.msr.reg, lo, hi); + lo = (lo & ~INTEL_MSR_RANGE) | (cmd->val & INTEL_MSR_RANGE); + wrmsr(cmd->addr.msr.reg, lo, hi); break; case SYSTEM_IO_CAPABLE: acpi_os_write_port((acpi_io_address)cmd->addr.io.port, @@ -372,7 +374,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, struct cpufreq_freqs freqs; cpumask_t online_policy_cpus; struct drv_cmd cmd; - unsigned int msr; unsigned int next_state = 0; /* Index into freq_table */ unsigned int next_perf_state = 0; /* Index into perf table */ unsigned int i; @@ -417,11 +418,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, case SYSTEM_INTEL_MSR_CAPABLE: cmd.type = SYSTEM_INTEL_MSR_CAPABLE; cmd.addr.msr.reg = MSR_IA32_PERF_CTL; - msr = - (u32) perf->states[next_perf_state]. - control & INTEL_MSR_RANGE; - cmd.val = get_cur_val(online_policy_cpus); - cmd.val = (cmd.val & ~INTEL_MSR_RANGE) | msr; + cmd.val = (u32) perf->states[next_perf_state].control; break; case SYSTEM_IO_CAPABLE: cmd.type = SYSTEM_IO_CAPABLE; @@ -668,8 +665,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) data->max_freq = perf->states[0].core_frequency * 1000; /* table init */ for (i=0; i<perf->state_count; i++) { - if (i>0 && perf->states[i].core_frequency == - perf->states[i-1].core_frequency) + if (i>0 && perf->states[i].core_frequency >= + data->freq_table[valid_states-1].frequency / 1000) continue; data->freq_table[valid_states].index = i; diff --git a/arch/i386/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/i386/kernel/cpu/cpufreq/cpufreq-nforce2.c index 0d49d73d1b7..66acd503991 100644 --- a/arch/i386/kernel/cpu/cpufreq/cpufreq-nforce2.c +++ b/arch/i386/kernel/cpu/cpufreq/cpufreq-nforce2.c @@ -391,8 +391,6 @@ static struct cpufreq_driver nforce2_driver = { */ static unsigned int nforce2_detect_chipset(void) { - u8 revision; - nforce2_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE2, PCI_ANY_ID, PCI_ANY_ID, NULL); @@ -400,10 +398,8 @@ static unsigned int nforce2_detect_chipset(void) if (nforce2_chipset_dev == NULL) return -ENODEV; - pci_read_config_byte(nforce2_chipset_dev, PCI_REVISION_ID, &revision); - printk(KERN_INFO "cpufreq: Detected nForce2 chipset revision %X\n", - revision); + nforce2_chipset_dev->revision); printk(KERN_INFO "cpufreq: FSB changing is maybe unstable and can lead to crashes and data loss.\n"); diff --git a/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c b/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c index 6667e9cceb9..461dabc4e49 100644 --- a/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c +++ b/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c @@ -79,7 +79,7 @@ #include <linux/smp.h> #include <linux/cpufreq.h> #include <linux/pci.h> -#include <asm/processor.h> +#include <asm/processor-cyrix.h> #include <asm/errno.h> /* PCI config registers, all at F0 */ @@ -115,7 +115,6 @@ struct gxfreq_params { u8 pci_suscfg; u8 pci_pmer1; u8 pci_pmer2; - u8 pci_rev; struct pci_dev *cs55x0; }; @@ -276,7 +275,7 @@ static void gx_set_cpuspeed(unsigned int khz) pci_write_config_byte(gx_params->cs55x0, PCI_VIDTC, 100);/* typical 50 to 100ms */ pci_write_config_byte(gx_params->cs55x0, PCI_PMER1, pmer1); - if (gx_params->pci_rev < 0x10) { /* CS5530(rev 1.2, 1.3) */ + if (gx_params->cs55x0->revision < 0x10) { /* CS5530(rev 1.2, 1.3) */ suscfg = gx_params->pci_suscfg | SUSMOD; } else { /* CS5530A,B.. */ suscfg = gx_params->pci_suscfg | SUSMOD | PWRSVE; @@ -471,7 +470,6 @@ static int __init cpufreq_gx_init(void) pci_read_config_byte(params->cs55x0, PCI_PMER2, &(params->pci_pmer2)); pci_read_config_byte(params->cs55x0, PCI_MODON, &(params->on_duration)); pci_read_config_byte(params->cs55x0, PCI_MODOFF, &(params->off_duration)); - pci_read_config_byte(params->cs55x0, PCI_REVISION_ID, ¶ms->pci_rev); if ((ret = cpufreq_register_driver(&gx_suspmod_driver))) { kfree(params); diff --git a/arch/i386/kernel/cpu/cpufreq/longhaul.c b/arch/i386/kernel/cpu/cpufreq/longhaul.c index a3df9c039bd..ef8f0bc3fc7 100644 --- a/arch/i386/kernel/cpu/cpufreq/longhaul.c +++ b/arch/i386/kernel/cpu/cpufreq/longhaul.c @@ -29,6 +29,7 @@ #include <linux/pci.h> #include <linux/slab.h> #include <linux/string.h> +#include <linux/delay.h> #include <asm/msr.h> #include <asm/timex.h> @@ -55,7 +56,6 @@ /* Flags */ #define USE_ACPI_C3 (1 << 1) #define USE_NORTHBRIDGE (1 << 2) -#define USE_VT8235 (1 << 3) static int cpu_model; static unsigned int numscales=16; @@ -63,22 +63,19 @@ static unsigned int fsb; static const struct mV_pos *vrm_mV_table; static const unsigned char *mV_vrm_table; -struct f_msr { - u8 vrm; - u8 pos; -}; -static struct f_msr f_msr_table[32]; static unsigned int highest_speed, lowest_speed; /* kHz */ static unsigned int minmult, maxmult; static int can_scale_voltage; static struct acpi_processor *pr = NULL; static struct acpi_processor_cx *cx = NULL; +static u32 acpi_regs_addr; static u8 longhaul_flags; -static u8 longhaul_pos; +static unsigned int longhaul_index; /* Module parameters */ static int scale_voltage; +static int disable_acpi_c3; #define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "longhaul", msg) @@ -144,7 +141,7 @@ static void do_longhaul1(unsigned int clock_ratio_index) rdmsrl(MSR_VIA_BCR2, bcr2.val); /* Enable software clock multiplier */ bcr2.bits.ESOFTBF = 1; - bcr2.bits.CLOCKMUL = clock_ratio_index; + bcr2.bits.CLOCKMUL = clock_ratio_index & 0xff; /* Sync to timer tick */ safe_halt(); @@ -163,14 +160,12 @@ static void do_longhaul1(unsigned int clock_ratio_index) /* For processor with Longhaul MSR */ -static void do_powersaver(int cx_address, unsigned int clock_ratio_index) +static void do_powersaver(int cx_address, unsigned int clock_ratio_index, + unsigned int dir) { union msr_longhaul longhaul; - u8 dest_pos; u32 t; - dest_pos = f_msr_table[clock_ratio_index].pos; - rdmsrl(MSR_VIA_LONGHAUL, longhaul.val); /* Setup new frequency */ longhaul.bits.RevisionKey = longhaul.bits.RevisionID; @@ -178,11 +173,11 @@ static void do_powersaver(int cx_address, unsigned int clock_ratio_index) longhaul.bits.SoftBusRatio4 = (clock_ratio_index & 0x10) >> 4; /* Setup new voltage */ if (can_scale_voltage) - longhaul.bits.SoftVID = f_msr_table[clock_ratio_index].vrm; + longhaul.bits.SoftVID = (clock_ratio_index >> 8) & 0x1f; /* Sync to timer tick */ safe_halt(); /* Raise voltage if necessary */ - if (can_scale_voltage && longhaul_pos < dest_pos) { + if (can_scale_voltage && dir) { longhaul.bits.EnableSoftVID = 1; wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); /* Change voltage */ @@ -199,7 +194,6 @@ static void do_powersaver(int cx_address, unsigned int clock_ratio_index) } longhaul.bits.EnableSoftVID = 0; wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); - longhaul_pos = dest_pos; } /* Change frequency on next halt or sleep */ @@ -220,7 +214,7 @@ static void do_powersaver(int cx_address, unsigned int clock_ratio_index) wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); /* Reduce voltage if necessary */ - if (can_scale_voltage && longhaul_pos > dest_pos) { + if (can_scale_voltage && !dir) { longhaul.bits.EnableSoftVID = 1; wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); /* Change voltage */ @@ -237,7 +231,6 @@ static void do_powersaver(int cx_address, unsigned int clock_ratio_index) } longhaul.bits.EnableSoftVID = 0; wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); - longhaul_pos = dest_pos; } } @@ -248,25 +241,28 @@ static void do_powersaver(int cx_address, unsigned int clock_ratio_index) * Sets a new clock ratio. */ -static void longhaul_setstate(unsigned int clock_ratio_index) +static void longhaul_setstate(unsigned int table_index) { + unsigned int clock_ratio_index; int speed, mult; struct cpufreq_freqs freqs; - static unsigned int old_ratio=-1; unsigned long flags; unsigned int pic1_mask, pic2_mask; + u16 bm_status = 0; + u32 bm_timeout = 1000; + unsigned int dir = 0; - if (old_ratio == clock_ratio_index) - return; - old_ratio = clock_ratio_index; - - mult = clock_ratio[clock_ratio_index]; + clock_ratio_index = longhaul_table[table_index].index; + /* Safety precautions */ + mult = clock_ratio[clock_ratio_index & 0x1f]; if (mult == -1) return; - speed = calc_speed(mult); if ((speed > highest_speed) || (speed < lowest_speed)) return; + /* Voltage transition before frequency transition? */ + if (can_scale_voltage && longhaul_index < table_index) + dir = 1; freqs.old = calc_speed(longhaul_get_cpu_mult()); freqs.new = speed; @@ -285,11 +281,24 @@ static void longhaul_setstate(unsigned int clock_ratio_index) outb(0xFF,0xA1); /* Overkill */ outb(0xFE,0x21); /* TMR0 only */ + /* Wait while PCI bus is busy. */ + if (acpi_regs_addr && (longhaul_flags & USE_NORTHBRIDGE + || ((pr != NULL) && pr->flags.bm_control))) { + bm_status = inw(acpi_regs_addr); + bm_status &= 1 << 4; + while (bm_status && bm_timeout) { + outw(1 << 4, acpi_regs_addr); + bm_timeout--; + bm_status = inw(acpi_regs_addr); + bm_status &= 1 << 4; + } + } + if (longhaul_flags & USE_NORTHBRIDGE) { /* Disable AGP and PCI arbiters */ outb(3, 0x22); } else if ((pr != NULL) && pr->flags.bm_control) { - /* Disable bus master arbitration */ + /* Disable bus master arbitration */ acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1); } switch (longhaul_version) { @@ -314,9 +323,9 @@ static void longhaul_setstate(unsigned int clock_ratio_index) if (longhaul_flags & USE_ACPI_C3) { /* Don't allow wakeup */ acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0); - do_powersaver(cx->address, clock_ratio_index); + do_powersaver(cx->address, clock_ratio_index, dir); } else { - do_powersaver(0, clock_ratio_index); + do_powersaver(0, clock_ratio_index, dir); } break; } @@ -336,6 +345,9 @@ static void longhaul_setstate(unsigned int clock_ratio_index) freqs.new = calc_speed(longhaul_get_cpu_mult()); cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + + if (!bm_timeout) + printk(KERN_INFO PFX "Warning: Timeout while waiting for idle PCI bus.\n"); } /* @@ -369,7 +381,8 @@ static int guess_fsb(int mult) static int __init longhaul_get_ranges(void) { - unsigned int j, k = 0; + unsigned int i, j, k = 0; + unsigned int ratio; int mult; /* Get current frequency */ @@ -423,8 +436,7 @@ static int __init longhaul_get_ranges(void) if(!longhaul_table) return -ENOMEM; - for (j=0; j < numscales; j++) { - unsigned int ratio; + for (j = 0; j < numscales; j++) { ratio = clock_ratio[j]; if (ratio == -1) continue; @@ -434,13 +446,41 @@ static int __init longhaul_get_ranges(void) longhaul_table[k].index = j; k++; } + if (k <= 1) { + kfree(longhaul_table); + return -ENODEV; + } + /* Sort */ + for (j = 0; j < k - 1; j++) { + unsigned int min_f, min_i; + min_f = longhaul_table[j].frequency; + min_i = j; + for (i = j + 1; i < k; i++) { + if (longhaul_table[i].frequency < min_f) { + min_f = longhaul_table[i].frequency; + min_i = i; + } + } + if (min_i != j) { + unsigned int temp; + temp = longhaul_table[j].frequency; + longhaul_table[j].frequency = longhaul_table[min_i].frequency; + longhaul_table[min_i].frequency = temp; + temp = longhaul_table[j].index; + longhaul_table[j].index = longhaul_table[min_i].index; + longhaul_table[min_i].index = temp; + } + } longhaul_table[k].frequency = CPUFREQ_TABLE_END; - if (!k) { - kfree (longhaul_table); - return -EINVAL; - } + /* Find index we are running on */ + for (j = 0; j < k; j++) { + if (clock_ratio[longhaul_table[j].index & 0x1f] == mult) { + longhaul_index = j; + break; + } + } return 0; } @@ -448,7 +488,7 @@ static int __init longhaul_get_ranges(void) static void __init longhaul_setup_voltagescaling(void) { union msr_longhaul longhaul; - struct mV_pos minvid, maxvid; + struct mV_pos minvid, maxvid, vid; unsigned int j, speed, pos, kHz_step, numvscales; int min_vid_speed; @@ -459,11 +499,11 @@ static void __init longhaul_setup_voltagescaling(void) } if (!longhaul.bits.VRMRev) { - printk (KERN_INFO PFX "VRM 8.5\n"); + printk(KERN_INFO PFX "VRM 8.5\n"); vrm_mV_table = &vrm85_mV[0]; mV_vrm_table = &mV_vrm85[0]; } else { - printk (KERN_INFO PFX "Mobile VRM\n"); + printk(KERN_INFO PFX "Mobile VRM\n"); if (cpu_model < CPU_NEHEMIAH) return; vrm_mV_table = &mobilevrm_mV[0]; @@ -523,7 +563,6 @@ static void __init longhaul_setup_voltagescaling(void) /* Calculate kHz for one voltage step */ kHz_step = (highest_speed - min_vid_speed) / numvscales; - j = 0; while (longhaul_table[j].frequency != CPUFREQ_TABLE_END) { speed = longhaul_table[j].frequency; @@ -531,15 +570,14 @@ static void __init longhaul_setup_voltagescaling(void) pos = (speed - min_vid_speed) / kHz_step + minvid.pos; else pos = minvid.pos; - f_msr_table[longhaul_table[j].index].vrm = mV_vrm_table[pos]; - f_msr_table[longhaul_table[j].index].pos = pos; + longhaul_table[j].index |= mV_vrm_table[pos] << 8; + vid = vrm_mV_table[mV_vrm_table[pos]]; + printk(KERN_INFO PFX "f: %d kHz, index: %d, vid: %d mV\n", speed, j, vid.mV); j++; } - longhaul_pos = maxvid.pos; can_scale_voltage = 1; - printk(KERN_INFO PFX "Voltage scaling enabled. " - "Use of \"conservative\" governor is highly recommended.\n"); + printk(KERN_INFO PFX "Voltage scaling enabled.\n"); } @@ -553,15 +591,44 @@ static int longhaul_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { unsigned int table_index = 0; - unsigned int new_clock_ratio = 0; + unsigned int i; + unsigned int dir = 0; + u8 vid, current_vid; if (cpufreq_frequency_table_target(policy, longhaul_table, target_freq, relation, &table_index)) return -EINVAL; - new_clock_ratio = longhaul_table[table_index].index & 0xFF; - - longhaul_setstate(new_clock_ratio); + /* Don't set same frequency again */ + if (longhaul_index == table_index) + return 0; + if (!can_scale_voltage) + longhaul_setstate(table_index); + else { + /* On test system voltage transitions exceeding single + * step up or down were turning motherboard off. Both + * "ondemand" and "userspace" are unsafe. C7 is doing + * this in hardware, C3 is old and we need to do this + * in software. */ + i = longhaul_index; + current_vid = (longhaul_table[longhaul_index].index >> 8) & 0x1f; + if (table_index > longhaul_index) + dir = 1; + while (i != table_index) { + vid = (longhaul_table[i].index >> 8) & 0x1f; + if (vid != current_vid) { + longhaul_setstate(i); + current_vid = vid; + msleep(200); + } + if (dir) + i++; + else + i--; + } + longhaul_setstate(table_index); + } + longhaul_index = table_index; return 0; } @@ -590,11 +657,10 @@ static acpi_status longhaul_walk_callback(acpi_handle obj_handle, static int enable_arbiter_disable(void) { struct pci_dev *dev; - int status; + int status = 1; int reg; u8 pci_cmd; - status = 1; /* Find PLE133 host bridge */ reg = 0x78; dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8601_0, @@ -627,13 +693,17 @@ static int enable_arbiter_disable(void) return 0; } -static int longhaul_setup_vt8235(void) +static int longhaul_setup_southbridge(void) { struct pci_dev *dev; u8 pci_cmd; /* Find VT8235 southbridge */ dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, NULL); + if (dev == NULL) + /* Find VT8237 southbridge */ + dev = pci_get_device(PCI_VENDOR_ID_VIA, + PCI_DEVICE_ID_VIA_8237, NULL); if (dev != NULL) { /* Set transition time to max */ pci_read_config_byte(dev, 0xec, &pci_cmd); @@ -645,6 +715,14 @@ static int longhaul_setup_vt8235(void) pci_read_config_byte(dev, 0xe5, &pci_cmd); pci_cmd |= 1 << 7; pci_write_config_byte(dev, 0xe5, pci_cmd); + /* Get address of ACPI registers block*/ + pci_read_config_byte(dev, 0x81, &pci_cmd); + if (pci_cmd & 1 << 7) { + pci_read_config_dword(dev, 0x88, &acpi_regs_addr); + acpi_regs_addr &= 0xff00; + printk(KERN_INFO PFX "ACPI I/O at 0x%x\n", acpi_regs_addr); + } + pci_dev_put(dev); return 1; } @@ -657,7 +735,6 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy) char *cpuname=NULL; int ret; u32 lo, hi; - int vt8235_present; /* Check what we have on this motherboard */ switch (c->x86_model) { @@ -755,7 +832,7 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy) }; /* Doesn't hurt */ - vt8235_present = longhaul_setup_vt8235(); + longhaul_setup_southbridge(); /* Find ACPI data for processor */ acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, @@ -765,35 +842,29 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy) /* Check ACPI support for C3 state */ if (pr != NULL && longhaul_version == TYPE_POWERSAVER) { cx = &pr->power.states[ACPI_STATE_C3]; - if (cx->address > 0 && cx->latency <= 1000) { + if (cx->address > 0 && cx->latency <= 1000) longhaul_flags |= USE_ACPI_C3; - goto print_support_type; - } } + /* Disable if it isn't working */ + if (disable_acpi_c3) + longhaul_flags &= ~USE_ACPI_C3; /* Check if northbridge is friendly */ - if (enable_arbiter_disable()) { + if (enable_arbiter_disable()) longhaul_flags |= USE_NORTHBRIDGE; - goto print_support_type; - } - /* Use VT8235 southbridge if present */ - if (longhaul_version == TYPE_POWERSAVER && vt8235_present) { - longhaul_flags |= USE_VT8235; - goto print_support_type; - } + /* Check ACPI support for bus master arbiter disable */ - if ((pr == NULL) || !(pr->flags.bm_control)) { + if (!(longhaul_flags & USE_ACPI_C3 + || longhaul_flags & USE_NORTHBRIDGE) + && ((pr == NULL) || !(pr->flags.bm_control))) { printk(KERN_ERR PFX "No ACPI support. Unsupported northbridge.\n"); return -ENODEV; } -print_support_type: if (longhaul_flags & USE_NORTHBRIDGE) - printk (KERN_INFO PFX "Using northbridge support.\n"); - else if (longhaul_flags & USE_VT8235) - printk (KERN_INFO PFX "Using VT8235 support.\n"); - else - printk (KERN_INFO PFX "Using ACPI support.\n"); + printk(KERN_INFO PFX "Using northbridge support.\n"); + if (longhaul_flags & USE_ACPI_C3) + printk(KERN_INFO PFX "Using ACPI support.\n"); ret = longhaul_get_ranges(); if (ret != 0) @@ -885,6 +956,9 @@ static void __exit longhaul_exit(void) kfree(longhaul_table); } +module_param (disable_acpi_c3, int, 0644); +MODULE_PARM_DESC(disable_acpi_c3, "Don't use ACPI C3 support"); + module_param (scale_voltage, int, 0644); MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor"); diff --git a/arch/i386/kernel/cpu/cpufreq/longhaul.h b/arch/i386/kernel/cpu/cpufreq/longhaul.h index 102548f1284..4fcc320997d 100644 --- a/arch/i386/kernel/cpu/cpufreq/longhaul.h +++ b/arch/i386/kernel/cpu/cpufreq/longhaul.h @@ -180,7 +180,7 @@ static const int __initdata ezrat_clock_ratio[32] = { -1, /* 0000 -> RESERVED (10.0x) */ 110, /* 0001 -> 11.0x */ - 120, /* 0010 -> 12.0x */ + -1, /* 0010 -> 12.0x */ -1, /* 0011 -> RESERVED (9.0x)*/ 105, /* 0100 -> 10.5x */ 115, /* 0101 -> 11.5x */ @@ -237,7 +237,7 @@ static const int __initdata ezrat_eblcr[32] = { static const int __initdata nehemiah_clock_ratio[32] = { 100, /* 0000 -> 10.0x */ - 160, /* 0001 -> 16.0x */ + -1, /* 0001 -> 16.0x */ 40, /* 0010 -> 4.0x */ 90, /* 0011 -> 9.0x */ 95, /* 0100 -> 9.5x */ @@ -252,10 +252,10 @@ static const int __initdata nehemiah_clock_ratio[32] = { 75, /* 1101 -> 7.5x */ 85, /* 1110 -> 8.5x */ 120, /* 1111 -> 12.0x */ - 100, /* 0000 -> 10.0x */ + -1, /* 0000 -> 10.0x */ 110, /* 0001 -> 11.0x */ - 120, /* 0010 -> 12.0x */ - 90, /* 0011 -> 9.0x */ + -1, /* 0010 -> 12.0x */ + -1, /* 0011 -> 9.0x */ 105, /* 0100 -> 10.5x */ 115, /* 0101 -> 11.5x */ 125, /* 0110 -> 12.5x */ @@ -267,7 +267,7 @@ static const int __initdata nehemiah_clock_ratio[32] = { 145, /* 1100 -> 14.5x */ 155, /* 1101 -> 15.5x */ -1, /* 1110 -> RESERVED (13.0x) */ - 120, /* 1111 -> 12.0x */ + -1, /* 1111 -> 12.0x */ }; static const int __initdata nehemiah_eblcr[32] = { diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c index 4ade55c5f33..34ed53a0673 100644 --- a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c @@ -599,14 +599,17 @@ static void print_basics(struct powernow_k8_data *data) for (j = 0; j < data->numps; j++) { if (data->powernow_table[j].frequency != CPUFREQ_ENTRY_INVALID) { if (cpu_family == CPU_HW_PSTATE) { - printk(KERN_INFO PFX " %d : fid 0x%x gid 0x%x (%d MHz)\n", j, (data->powernow_table[j].index & 0xff00) >> 8, - (data->powernow_table[j].index & 0xff0000) >> 16, - data->powernow_table[j].frequency/1000); + printk(KERN_INFO PFX " %d : fid 0x%x did 0x%x (%d MHz)\n", + j, + (data->powernow_table[j].index & 0xff00) >> 8, + (data->powernow_table[j].index & 0xff0000) >> 16, + data->powernow_table[j].frequency/1000); } else { - printk(KERN_INFO PFX " %d : fid 0x%x (%d MHz), vid 0x%x\n", j, - data->powernow_table[j].index & 0xff, - data->powernow_table[j].frequency/1000, - data->powernow_table[j].index >> 8); + printk(KERN_INFO PFX " %d : fid 0x%x (%d MHz), vid 0x%x\n", + j, + data->powernow_table[j].index & 0xff, + data->powernow_table[j].frequency/1000, + data->powernow_table[j].index >> 8); } } } @@ -1086,7 +1089,7 @@ static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsi if (cpu_family == CPU_HW_PSTATE) dprintk("targ: curr fid 0x%x, did 0x%x\n", - data->currfid, data->currvid); + data->currfid, data->currdid); else { dprintk("targ: curr fid 0x%x, vid 0x%x\n", data->currfid, data->currvid); @@ -1322,16 +1325,22 @@ static struct cpufreq_driver cpufreq_amd64_driver = { static int __cpuinit powernowk8_init(void) { unsigned int i, supported_cpus = 0; + unsigned int booted_cores = 1; for_each_online_cpu(i) { if (check_supported_cpu(i)) supported_cpus++; } +#ifdef CONFIG_SMP + booted_cores = cpu_data[0].booted_cores; +#endif + if (supported_cpus == num_online_cpus()) { printk(KERN_INFO PFX "Found %d %s " - "processors (" VERSION ")\n", supported_cpus, - boot_cpu_data.x86_model_id); + "processors (%d cpu cores) (" VERSION ")\n", + supported_cpus/booted_cores, + boot_cpu_data.x86_model_id, supported_cpus); return cpufreq_register_driver(&cpufreq_amd64_driver); } diff --git a/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c index 35489fd6885..6c5dc2c85ae 100644 --- a/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c @@ -21,12 +21,6 @@ #include <linux/delay.h> #include <linux/compiler.h> -#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI -#include <linux/acpi.h> -#include <linux/dmi.h> -#include <acpi/processor.h> -#endif - #include <asm/msr.h> #include <asm/processor.h> #include <asm/cpufeature.h> @@ -257,9 +251,7 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy) /* Matched a non-match */ dprintk("no table support for CPU model \"%s\"\n", cpu->x86_model_id); -#ifndef CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI - dprintk("try compiling with CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI enabled\n"); -#endif + dprintk("try using the acpi-cpufreq driver\n"); return -ENOENT; } @@ -346,213 +338,6 @@ static unsigned int get_cur_freq(unsigned int cpu) } -#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI - -static struct acpi_processor_performance *acpi_perf_data[NR_CPUS]; - -/* - * centrino_cpu_early_init_acpi - Do the preregistering with ACPI P-States - * library - * - * Before doing the actual init, we need to do _PSD related setup whenever - * supported by the BIOS. These are handled by this early_init routine. - */ -static int centrino_cpu_early_init_acpi(void) -{ - unsigned int i, j; - struct acpi_processor_performance *data; - - for_each_possible_cpu(i) { - data = kzalloc(sizeof(struct acpi_processor_performance), - GFP_KERNEL); - if (!data) { - for_each_possible_cpu(j) { - kfree(acpi_perf_data[j]); - acpi_perf_data[j] = NULL; - } - return (-ENOMEM); - } - acpi_perf_data[i] = data; - } - - acpi_processor_preregister_performance(acpi_perf_data); - return 0; -} - - -#ifdef CONFIG_SMP -/* - * Some BIOSes do SW_ANY coordination internally, either set it up in hw - * or do it in BIOS firmware and won't inform about it to OS. If not - * detected, this has a side effect of making CPU run at a different speed - * than OS intended it to run at. Detect it and handle it cleanly. - */ -static int bios_with_sw_any_bug; -static int sw_any_bug_found(struct dmi_system_id *d) -{ - bios_with_sw_any_bug = 1; - return 0; -} - -static struct dmi_system_id sw_any_bug_dmi_table[] = { - { - .callback = sw_any_bug_found, - .ident = "Supermicro Server X6DLP", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Supermicro"), - DMI_MATCH(DMI_BIOS_VERSION, "080010"), - DMI_MATCH(DMI_PRODUCT_NAME, "X6DLP"), - }, - }, - { } -}; -#endif - -/* - * centrino_cpu_init_acpi - register with ACPI P-States library - * - * Register with the ACPI P-States library (part of drivers/acpi/processor.c) - * in order to determine correct frequency and voltage pairings by reading - * the _PSS of the ACPI DSDT or SSDT tables. - */ -static int centrino_cpu_init_acpi(struct cpufreq_policy *policy) -{ - unsigned long cur_freq; - int result = 0, i; - unsigned int cpu = policy->cpu; - struct acpi_processor_performance *p; - - p = acpi_perf_data[cpu]; - - /* register with ACPI core */ - if (acpi_processor_register_performance(p, cpu)) { - dprintk(PFX "obtaining ACPI data failed\n"); - return -EIO; - } - - policy->shared_type = p->shared_type; - /* - * Will let policy->cpus know about dependency only when software - * coordination is required. - */ - if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL || - policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) { - policy->cpus = p->shared_cpu_map; - } - -#ifdef CONFIG_SMP - dmi_check_system(sw_any_bug_dmi_table); - if (bios_with_sw_any_bug && cpus_weight(policy->cpus) == 1) { - policy->shared_type = CPUFREQ_SHARED_TYPE_ALL; - policy->cpus = cpu_core_map[cpu]; - } -#endif - - /* verify the acpi_data */ - if (p->state_count <= 1) { - dprintk("No P-States\n"); - result = -ENODEV; - goto err_unreg; - } - - if ((p->control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) || - (p->status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) { - dprintk("Invalid control/status registers (%x - %x)\n", - p->control_register.space_id, p->status_register.space_id); - result = -EIO; - goto err_unreg; - } - - for (i=0; i<p->state_count; i++) { - if ((p->states[i].control & INTEL_MSR_RANGE) != - (p->states[i].status & INTEL_MSR_RANGE)) { - dprintk("Different MSR bits in control (%llu) and status (%llu)\n", - p->states[i].control, p->states[i].status); - result = -EINVAL; - goto err_unreg; - } - - if (!p->states[i].core_frequency) { - dprintk("Zero core frequency for state %u\n", i); - result = -EINVAL; - goto err_unreg; - } - - if (p->states[i].core_frequency > p->states[0].core_frequency) { - dprintk("P%u has larger frequency (%llu) than P0 (%llu), skipping\n", i, - p->states[i].core_frequency, p->states[0].core_frequency); - p->states[i].core_frequency = 0; - continue; - } - } - - centrino_model[cpu] = kzalloc(sizeof(struct cpu_model), GFP_KERNEL); - if (!centrino_model[cpu]) { - result = -ENOMEM; - goto err_unreg; - } - - centrino_model[cpu]->model_name=NULL; - centrino_model[cpu]->max_freq = p->states[0].core_frequency * 1000; - centrino_model[cpu]->op_points = kmalloc(sizeof(struct cpufreq_frequency_table) * - (p->state_count + 1), GFP_KERNEL); - if (!centrino_model[cpu]->op_points) { - result = -ENOMEM; - goto err_kfree; - } - - for (i=0; i<p->state_count; i++) { - centrino_model[cpu]->op_points[i].index = p->states[i].control & INTEL_MSR_RANGE; - centrino_model[cpu]->op_points[i].frequency = p->states[i].core_frequency * 1000; - dprintk("adding state %i with frequency %u and control value %04x\n", - i, centrino_model[cpu]->op_points[i].frequency, centrino_model[cpu]->op_points[i].index); - } - centrino_model[cpu]->op_points[p->state_count].frequency = CPUFREQ_TABLE_END; - - cur_freq = get_cur_freq(cpu); - - for (i=0; i<p->state_count; i++) { - if (!p->states[i].core_frequency) { - dprintk("skipping state %u\n", i); - centrino_model[cpu]->op_points[i].frequency = CPUFREQ_ENTRY_INVALID; - continue; - } - - if (extract_clock(centrino_model[cpu]->op_points[i].index, cpu, 0) != - (centrino_model[cpu]->op_points[i].frequency)) { - dprintk("Invalid encoded frequency (%u vs. %u)\n", - extract_clock(centrino_model[cpu]->op_points[i].index, cpu, 0), - centrino_model[cpu]->op_points[i].frequency); - result = -EINVAL; - goto err_kfree_all; - } - - if (cur_freq == centrino_model[cpu]->op_points[i].frequency) - p->state = i; - } - - /* notify BIOS that we exist */ - acpi_processor_notify_smm(THIS_MODULE); - printk("speedstep-centrino with X86_SPEEDSTEP_CENTRINO_ACPI " - "config is deprecated.\n " - "Use X86_ACPI_CPUFREQ (acpi-cpufreq) instead.\n" ); - - return 0; - - err_kfree_all: - kfree(centrino_model[cpu]->op_points); - err_kfree: - kfree(centrino_model[cpu]); - err_unreg: - acpi_processor_unregister_performance(p, cpu); - dprintk(PFX "invalid ACPI data\n"); - return (result); -} -#else -static inline int centrino_cpu_init_acpi(struct cpufreq_policy *policy) { return -ENODEV; } -static inline int centrino_cpu_early_init_acpi(void) { return 0; } -#endif - static int centrino_cpu_init(struct cpufreq_policy *policy) { struct cpuinfo_x86 *cpu = &cpu_data[policy->cpu]; @@ -568,27 +353,25 @@ static int centrino_cpu_init(struct cpufreq_policy *policy) if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC)) centrino_driver.flags |= CPUFREQ_CONST_LOOPS; - if (centrino_cpu_init_acpi(policy)) { - if (policy->cpu != 0) - return -ENODEV; + if (policy->cpu != 0) + return -ENODEV; - for (i = 0; i < N_IDS; i++) - if (centrino_verify_cpu_id(cpu, &cpu_ids[i])) - break; + for (i = 0; i < N_IDS; i++) + if (centrino_verify_cpu_id(cpu, &cpu_ids[i])) + break; - if (i != N_IDS) - centrino_cpu[policy->cpu] = &cpu_ids[i]; + if (i != N_IDS) + centrino_cpu[policy->cpu] = &cpu_ids[i]; - if (!centrino_cpu[policy->cpu]) { - dprintk("found unsupported CPU with " - "Enhanced SpeedStep: send /proc/cpuinfo to " - MAINTAINER "\n"); - return -ENODEV; - } + if (!centrino_cpu[policy->cpu]) { + dprintk("found unsupported CPU with " + "Enhanced SpeedStep: send /proc/cpuinfo to " + MAINTAINER "\n"); + return -ENODEV; + } - if (centrino_cpu_init_table(policy)) { - return -ENODEV; - } + if (centrino_cpu_init_table(policy)) { + return -ENODEV; } /* Check to see if Enhanced SpeedStep is enabled, and try to @@ -634,20 +417,6 @@ static int centrino_cpu_exit(struct cpufreq_policy *policy) cpufreq_frequency_table_put_attr(cpu); -#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI - if (!centrino_model[cpu]->model_name) { - static struct acpi_processor_performance *p; - - if (acpi_perf_data[cpu]) { - p = acpi_perf_data[cpu]; - dprintk("unregistering and freeing ACPI data\n"); - acpi_processor_unregister_performance(p, cpu); - kfree(centrino_model[cpu]->op_points); - kfree(centrino_model[cpu]); - } - } -#endif - centrino_model[cpu] = NULL; return 0; @@ -849,25 +618,12 @@ static int __init centrino_init(void) if (!cpu_has(cpu, X86_FEATURE_EST)) return -ENODEV; - centrino_cpu_early_init_acpi(); - return cpufreq_register_driver(¢rino_driver); } static void __exit centrino_exit(void) { -#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI - unsigned int j; -#endif - cpufreq_unregister_driver(¢rino_driver); - -#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI - for_each_possible_cpu(j) { - kfree(acpi_perf_data[j]); - acpi_perf_data[j] = NULL; - } -#endif } MODULE_AUTHOR ("Jeremy Fitzhardinge <jeremy@goop.org>"); diff --git a/arch/i386/kernel/cpu/cpufreq/speedstep-ich.c b/arch/i386/kernel/cpu/cpufreq/speedstep-ich.c index 698f980eb44..a5b2346faf1 100644 --- a/arch/i386/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/i386/kernel/cpu/cpufreq/speedstep-ich.c @@ -205,7 +205,6 @@ static unsigned int speedstep_detect_chipset (void) * host brige. Abort on these systems. */ static struct pci_dev *hostbridge; - u8 rev = 0; hostbridge = pci_get_subsys(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82815_MC, @@ -216,8 +215,7 @@ static unsigned int speedstep_detect_chipset (void) if (!hostbridge) return 2; /* 2-M */ - pci_read_config_byte(hostbridge, PCI_REVISION_ID, &rev); - if (rev < 5) { + if (hostbridge->revision < 5) { dprintk("hostbridge does not support speedstep\n"); speedstep_chipset_dev = NULL; pci_dev_put(hostbridge); diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c index e88d2fba156..122d2d75aa9 100644 --- a/arch/i386/kernel/cpu/cyrix.c +++ b/arch/i386/kernel/cpu/cyrix.c @@ -4,7 +4,7 @@ #include <linux/pci.h> #include <asm/dma.h> #include <asm/io.h> -#include <asm/processor.h> +#include <asm/processor-cyrix.h> #include <asm/timer.h> #include <asm/pci-direct.h> #include <asm/tsc.h> diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c index e5be819492e..d5a456d27d8 100644 --- a/arch/i386/kernel/cpu/intel_cacheinfo.c +++ b/arch/i386/kernel/cpu/intel_cacheinfo.c @@ -4,7 +4,7 @@ * Changes: * Venkatesh Pallipadi : Adding cache identification through cpuid(4) * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure. - * Andi Kleen : CPUID4 emulation on AMD. + * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. */ #include <linux/init.h> @@ -135,7 +135,7 @@ unsigned short num_cache_leaves; /* AMD doesn't have CPUID4. Emulate it here to report the same information to the user. This makes some assumptions about the machine: - No L3, L2 not shared, no SMT etc. that is currently true on AMD CPUs. + L2 not shared, no SMT etc. that is currently true on AMD CPUs. In theory the TLBs could be reported as fake type (they are in "dummy"). Maybe later */ @@ -159,13 +159,26 @@ union l2_cache { unsigned val; }; +union l3_cache { + struct { + unsigned line_size : 8; + unsigned lines_per_tag : 4; + unsigned assoc : 4; + unsigned res : 2; + unsigned size_encoded : 14; + }; + unsigned val; +}; + static const unsigned short assocs[] = { [1] = 1, [2] = 2, [4] = 4, [6] = 8, - [8] = 16, + [8] = 16, [0xa] = 32, [0xb] = 48, + [0xc] = 64, [0xf] = 0xffff // ?? - }; -static const unsigned char levels[] = { 1, 1, 2 }; -static const unsigned char types[] = { 1, 2, 3 }; +}; + +static const unsigned char levels[] = { 1, 1, 2, 3 }; +static const unsigned char types[] = { 1, 2, 3, 3 }; static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, union _cpuid4_leaf_ebx *ebx, @@ -175,37 +188,58 @@ static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, unsigned line_size, lines_per_tag, assoc, size_in_kb; union l1_cache l1i, l1d; union l2_cache l2; + union l3_cache l3; + union l1_cache *l1 = &l1d; eax->full = 0; ebx->full = 0; ecx->full = 0; cpuid(0x80000005, &dummy, &dummy, &l1d.val, &l1i.val); - cpuid(0x80000006, &dummy, &dummy, &l2.val, &dummy); - - if (leaf > 2 || !l1d.val || !l1i.val || !l2.val) - return; - - eax->split.is_self_initializing = 1; - eax->split.type = types[leaf]; - eax->split.level = levels[leaf]; - eax->split.num_threads_sharing = 0; - eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; - - if (leaf <= 1) { - union l1_cache *l1 = leaf == 0 ? &l1d : &l1i; + cpuid(0x80000006, &dummy, &dummy, &l2.val, &l3.val); + + switch (leaf) { + case 1: + l1 = &l1i; + case 0: + if (!l1->val) + return; assoc = l1->assoc; line_size = l1->line_size; lines_per_tag = l1->lines_per_tag; size_in_kb = l1->size_in_kb; - } else { + break; + case 2: + if (!l2.val) + return; assoc = l2.assoc; line_size = l2.line_size; lines_per_tag = l2.lines_per_tag; /* cpu_data has errata corrections for K7 applied */ size_in_kb = current_cpu_data.x86_cache_size; + break; + case 3: + if (!l3.val) + return; + assoc = l3.assoc; + line_size = l3.line_size; + lines_per_tag = l3.lines_per_tag; + size_in_kb = l3.size_encoded * 512; + break; + default: + return; } + eax->split.is_self_initializing = 1; + eax->split.type = types[leaf]; + eax->split.level = levels[leaf]; + if (leaf == 3) + eax->split.num_threads_sharing = current_cpu_data.x86_max_cores - 1; + else + eax->split.num_threads_sharing = 0; + eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; + + if (assoc == 0xf) eax->split.is_fully_associative = 1; ebx->split.coherency_line_size = line_size - 1; @@ -239,8 +273,7 @@ static int __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_le return 0; } -/* will only be called once; __init is safe here */ -static int __init find_num_cache_leaves(void) +static int __cpuinit find_num_cache_leaves(void) { unsigned int eax, ebx, ecx, edx; union _cpuid4_leaf_eax cache_eax; @@ -710,7 +743,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) return retval; } -static void __cpuexit cache_remove_dev(struct sys_device * sys_dev) +static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) { unsigned int cpu = sys_dev->id; unsigned long i; diff --git a/arch/i386/kernel/cpu/mcheck/mce.c b/arch/i386/kernel/cpu/mcheck/mce.c index 56cd485b127..34c781eddee 100644 --- a/arch/i386/kernel/cpu/mcheck/mce.c +++ b/arch/i386/kernel/cpu/mcheck/mce.c @@ -60,6 +60,20 @@ void mcheck_init(struct cpuinfo_x86 *c) } } +static unsigned long old_cr4 __initdata; + +void __init stop_mce(void) +{ + old_cr4 = read_cr4(); + clear_in_cr4(X86_CR4_MCE); +} + +void __init restart_mce(void) +{ + if (old_cr4 & X86_CR4_MCE) + set_in_cr4(X86_CR4_MCE); +} + static int __init mcheck_disable(char *str) { mce_disabled = 1; diff --git a/arch/i386/kernel/cpu/mcheck/non-fatal.c b/arch/i386/kernel/cpu/mcheck/non-fatal.c index 6b5d3518a1c..bf39409b383 100644 --- a/arch/i386/kernel/cpu/mcheck/non-fatal.c +++ b/arch/i386/kernel/cpu/mcheck/non-fatal.c @@ -57,7 +57,7 @@ static DECLARE_DELAYED_WORK(mce_work, mce_work_fn); static void mce_work_fn(struct work_struct *work) { on_each_cpu(mce_checkregs, NULL, 1, 1); - schedule_delayed_work(&mce_work, MCE_RATE); + schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); } static int __init init_nonfatal_mce_checker(void) @@ -82,7 +82,7 @@ static int __init init_nonfatal_mce_checker(void) /* * Check for non-fatal errors every MCE_RATE s */ - schedule_delayed_work(&mce_work, MCE_RATE); + schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); printk(KERN_INFO "Machine check exception polling timer started.\n"); return 0; } diff --git a/arch/i386/kernel/cpu/mcheck/therm_throt.c b/arch/i386/kernel/cpu/mcheck/therm_throt.c index 7ba7c3abd3a..1203dc5ab87 100644 --- a/arch/i386/kernel/cpu/mcheck/therm_throt.c +++ b/arch/i386/kernel/cpu/mcheck/therm_throt.c @@ -134,19 +134,21 @@ static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb, int err; sys_dev = get_cpu_sysdev(cpu); - mutex_lock(&therm_cpu_lock); switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: + mutex_lock(&therm_cpu_lock); err = thermal_throttle_add_dev(sys_dev); + mutex_unlock(&therm_cpu_lock); WARN_ON(err); break; case CPU_DEAD: case CPU_DEAD_FROZEN: + mutex_lock(&therm_cpu_lock); thermal_throttle_remove_dev(sys_dev); + mutex_unlock(&therm_cpu_lock); break; } - mutex_unlock(&therm_cpu_lock); return NOTIFY_OK; } diff --git a/arch/i386/kernel/cpu/mtrr/cyrix.c b/arch/i386/kernel/cpu/mtrr/cyrix.c index 1001f1e0fe6..2287d4863a8 100644 --- a/arch/i386/kernel/cpu/mtrr/cyrix.c +++ b/arch/i386/kernel/cpu/mtrr/cyrix.c @@ -3,6 +3,7 @@ #include <asm/mtrr.h> #include <asm/msr.h> #include <asm/io.h> +#include <asm/processor-cyrix.h> #include "mtrr.h" int arr3_protected; diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c index f6e46943e6e..56f64e34829 100644 --- a/arch/i386/kernel/cpu/mtrr/generic.c +++ b/arch/i386/kernel/cpu/mtrr/generic.c @@ -79,7 +79,7 @@ static void print_fixed(unsigned base, unsigned step, const mtrr_type*types) } /* Grab all of the MTRR state for this CPU into *state */ -void get_mtrr_state(void) +void __init get_mtrr_state(void) { unsigned int i; struct mtrr_var_range *vrs; diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c index 75dc6d5214b..c48b6fea5ab 100644 --- a/arch/i386/kernel/cpu/mtrr/main.c +++ b/arch/i386/kernel/cpu/mtrr/main.c @@ -643,7 +643,7 @@ static struct sysdev_driver mtrr_sysdev_driver = { * initialized (i.e. before smp_init()). * */ -__init void mtrr_bp_init(void) +void __init mtrr_bp_init(void) { init_ifs(); diff --git a/arch/i386/kernel/cpu/mtrr/state.c b/arch/i386/kernel/cpu/mtrr/state.c index 7b39a2f954d..c9014ca4a57 100644 --- a/arch/i386/kernel/cpu/mtrr/state.c +++ b/arch/i386/kernel/cpu/mtrr/state.c @@ -3,6 +3,7 @@ #include <asm/io.h> #include <asm/mtrr.h> #include <asm/msr.h> +#include <asm-i386/processor-cyrix.h> #include "mtrr.h" diff --git a/arch/i386/kernel/cpu/perfctr-watchdog.c b/arch/i386/kernel/cpu/perfctr-watchdog.c index 4d26d514c56..4be488e73be 100644 --- a/arch/i386/kernel/cpu/perfctr-watchdog.c +++ b/arch/i386/kernel/cpu/perfctr-watchdog.c @@ -325,7 +325,7 @@ static struct wd_ops k7_wd_ops = { .stop = single_msr_stop_watchdog, .perfctr = MSR_K7_PERFCTR0, .evntsel = MSR_K7_EVNTSEL0, - .checkbit = 1ULL<<63, + .checkbit = 1ULL<<47, }; /* Intel Model 6 (PPro+,P2,P3,P-M,Core1) */ @@ -346,7 +346,9 @@ static int setup_p6_watchdog(unsigned nmi_hz) perfctr_msr = MSR_P6_PERFCTR0; evntsel_msr = MSR_P6_EVNTSEL0; - wrmsrl(perfctr_msr, 0UL); + /* KVM doesn't implement this MSR */ + if (wrmsr_safe(perfctr_msr, 0, 0) < 0) + return 0; evntsel = P6_EVNTSEL_INT | P6_EVNTSEL_OS @@ -599,8 +601,8 @@ static struct wd_ops intel_arch_wd_ops = { .setup = setup_intel_arch_watchdog, .rearm = p6_rearm, .stop = single_msr_stop_watchdog, - .perfctr = MSR_ARCH_PERFMON_PERFCTR0, - .evntsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR1, + .evntsel = MSR_ARCH_PERFMON_EVENTSEL1, }; static void probe_nmi_watchdog(void) diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c index 89d91e6cc97..1e31b6caffb 100644 --- a/arch/i386/kernel/cpu/proc.c +++ b/arch/i386/kernel/cpu/proc.c @@ -29,7 +29,8 @@ static int show_cpuinfo(struct seq_file *m, void *v) NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL, - NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", "3dnowext", "3dnow", + NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", + "3dnowext", "3dnow", /* Transmeta-defined */ "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, @@ -40,8 +41,9 @@ static int show_cpuinfo(struct seq_file *m, void *v) /* Other (Linux-defined) */ "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr", NULL, NULL, NULL, NULL, - "constant_tsc", "up", NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "constant_tsc", "up", NULL, "arch_perfmon", + "pebs", "bts", NULL, "sync_rdtsc", + "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* Intel-defined (#2) */ @@ -57,9 +59,16 @@ static int show_cpuinfo(struct seq_file *m, void *v) NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* AMD-defined (#2) */ - "lahf_lm", "cmp_legacy", "svm", "extapic", "cr8legacy", "abm", - "sse4a", "misalignsse", - "3dnowprefetch", "osvw", "ibs", NULL, NULL, NULL, NULL, NULL, + "lahf_lm", "cmp_legacy", "svm", "extapic", "cr8_legacy", + "altmovcr8", "abm", "sse4a", + "misalignsse", "3dnowprefetch", + "osvw", "ibs", NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + /* Auxiliary (Linux-defined) */ + "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; diff --git a/arch/i386/kernel/cpu/rise.c b/arch/i386/kernel/cpu/rise.c deleted file mode 100644 index 50076f22e90..00000000000 --- a/arch/i386/kernel/cpu/rise.c +++ /dev/null @@ -1,52 +0,0 @@ -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/bitops.h> -#include <asm/processor.h> - -#include "cpu.h" - -static void __cpuinit init_rise(struct cpuinfo_x86 *c) -{ - printk("CPU: Rise iDragon"); - if (c->x86_model > 2) - printk(" II"); - printk("\n"); - - /* Unhide possibly hidden capability flags - The mp6 iDragon family don't have MSRs. - We switch on extra features with this cpuid weirdness: */ - __asm__ ( - "movl $0x6363452a, %%eax\n\t" - "movl $0x3231206c, %%ecx\n\t" - "movl $0x2a32313a, %%edx\n\t" - "cpuid\n\t" - "movl $0x63634523, %%eax\n\t" - "movl $0x32315f6c, %%ecx\n\t" - "movl $0x2333313a, %%edx\n\t" - "cpuid\n\t" : : : "eax", "ebx", "ecx", "edx" - ); - set_bit(X86_FEATURE_CX8, c->x86_capability); -} - -static struct cpu_dev rise_cpu_dev __cpuinitdata = { - .c_vendor = "Rise", - .c_ident = { "RiseRiseRise" }, - .c_models = { - { .vendor = X86_VENDOR_RISE, .family = 5, .model_names = - { - [0] = "iDragon", - [2] = "iDragon", - [8] = "iDragon II", - [9] = "iDragon II" - } - }, - }, - .c_init = init_rise, -}; - -int __init rise_init_cpu(void) -{ - cpu_devs[X86_VENDOR_RISE] = &rise_cpu_dev; - return 0; -} - diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c index 9645bb51f76..e60cddbc4cf 100644 --- a/arch/i386/kernel/e820.c +++ b/arch/i386/kernel/e820.c @@ -10,6 +10,7 @@ #include <linux/efi.h> #include <linux/pfn.h> #include <linux/uaccess.h> +#include <linux/suspend.h> #include <asm/pgtable.h> #include <asm/page.h> @@ -320,6 +321,37 @@ static int __init request_standard_resources(void) subsys_initcall(request_standard_resources); +#if defined(CONFIG_PM) && defined(CONFIG_SOFTWARE_SUSPEND) +/** + * e820_mark_nosave_regions - Find the ranges of physical addresses that do not + * correspond to e820 RAM areas and mark the corresponding pages as nosave for + * hibernation. + * + * This function requires the e820 map to be sorted and without any + * overlapping entries and assumes the first e820 area to be RAM. + */ +void __init e820_mark_nosave_regions(void) +{ + int i; + unsigned long pfn; + + pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size); + for (i = 1; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (pfn < PFN_UP(ei->addr)) + register_nosave_region(pfn, PFN_UP(ei->addr)); + + pfn = PFN_DOWN(ei->addr + ei->size); + if (ei->type != E820_RAM) + register_nosave_region(PFN_UP(ei->addr), pfn); + + if (pfn >= max_low_pfn) + break; + } +} +#endif + void __init add_memory_region(unsigned long long start, unsigned long long size, int type) { @@ -734,7 +766,7 @@ void __init print_memory_map(char *who) case E820_NVS: printk("(ACPI NVS)\n"); break; - default: printk("type %lu\n", e820.map[i].type); + default: printk("type %u\n", e820.map[i].type); break; } } diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c index a1808022ea1..2452c6fbe99 100644 --- a/arch/i386/kernel/efi.c +++ b/arch/i386/kernel/efi.c @@ -278,7 +278,7 @@ void efi_memmap_walk(efi_freemem_callback_t callback, void *arg) struct range { unsigned long start; unsigned long end; - } prev, curr; + } uninitialized_var(prev), curr; efi_memory_desc_t *md; unsigned long start, end; void *p; diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 3c3c220488c..a714d6b4350 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -409,8 +409,6 @@ restore_nocheck_notrace: 1: INTERRUPT_RETURN .section .fixup,"ax" iret_exc: - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) pushl $0 # no error code pushl $do_iret_error jmp error_code @@ -1023,6 +1021,91 @@ ENTRY(kernel_thread_helper) CFI_ENDPROC ENDPROC(kernel_thread_helper) +#ifdef CONFIG_XEN +ENTRY(xen_hypervisor_callback) + CFI_STARTPROC + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + TRACE_IRQS_OFF + + /* Check to see if we got the event in the critical + region in xen_iret_direct, after we've reenabled + events and checked for pending events. This simulates + iret instruction's behaviour where it delivers a + pending interrupt when enabling interrupts. */ + movl PT_EIP(%esp),%eax + cmpl $xen_iret_start_crit,%eax + jb 1f + cmpl $xen_iret_end_crit,%eax + jae 1f + + call xen_iret_crit_fixup + +1: mov %esp, %eax + call xen_evtchn_do_upcall + jmp ret_from_intr + CFI_ENDPROC +ENDPROC(xen_hypervisor_callback) + +# Hypervisor uses this for application faults while it executes. +# We get here for two reasons: +# 1. Fault while reloading DS, ES, FS or GS +# 2. Fault while executing IRET +# Category 1 we fix up by reattempting the load, and zeroing the segment +# register if the load fails. +# Category 2 we fix up by jumping to do_iret_error. We cannot use the +# normal Linux return path in this case because if we use the IRET hypercall +# to pop the stack frame we end up in an infinite loop of failsafe callbacks. +# We distinguish between categories by maintaining a status value in EAX. +ENTRY(xen_failsafe_callback) + CFI_STARTPROC + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + movl $1,%eax +1: mov 4(%esp),%ds +2: mov 8(%esp),%es +3: mov 12(%esp),%fs +4: mov 16(%esp),%gs + testl %eax,%eax + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + lea 16(%esp),%esp + CFI_ADJUST_CFA_OFFSET -16 + jz 5f + addl $16,%esp + jmp iret_exc # EAX != 0 => Category 2 (Bad IRET) +5: pushl $0 # EAX == 0 => Category 1 (Bad segment) + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + jmp ret_from_exception + CFI_ENDPROC + +.section .fixup,"ax" +6: xorl %eax,%eax + movl %eax,4(%esp) + jmp 1b +7: xorl %eax,%eax + movl %eax,8(%esp) + jmp 2b +8: xorl %eax,%eax + movl %eax,12(%esp) + jmp 3b +9: xorl %eax,%eax + movl %eax,16(%esp) + jmp 4b +.previous +.section __ex_table,"a" + .align 4 + .long 1b,6b + .long 2b,7b + .long 3b,8b + .long 4b,9b +.previous +ENDPROC(xen_failsafe_callback) + +#endif /* CONFIG_XEN */ + .section .rodata,"a" #include "syscall_table.S" diff --git a/arch/i386/kernel/geode.c b/arch/i386/kernel/geode.c new file mode 100644 index 00000000000..41e8aec4c61 --- /dev/null +++ b/arch/i386/kernel/geode.c @@ -0,0 +1,155 @@ +/* + * AMD Geode southbridge support code + * Copyright (C) 2006, Advanced Micro Devices, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/ioport.h> +#include <linux/io.h> +#include <asm/msr.h> +#include <asm/geode.h> + +static struct { + char *name; + u32 msr; + int size; + u32 base; +} lbars[] = { + { "geode-pms", MSR_LBAR_PMS, LBAR_PMS_SIZE, 0 }, + { "geode-acpi", MSR_LBAR_ACPI, LBAR_ACPI_SIZE, 0 }, + { "geode-gpio", MSR_LBAR_GPIO, LBAR_GPIO_SIZE, 0 }, + { "geode-mfgpt", MSR_LBAR_MFGPT, LBAR_MFGPT_SIZE, 0 } +}; + +static void __init init_lbars(void) +{ + u32 lo, hi; + int i; + + for (i = 0; i < ARRAY_SIZE(lbars); i++) { + rdmsr(lbars[i].msr, lo, hi); + if (hi & 0x01) + lbars[i].base = lo & 0x0000ffff; + + if (lbars[i].base == 0) + printk(KERN_ERR "geode: Couldn't initialize '%s'\n", + lbars[i].name); + } +} + +int geode_get_dev_base(unsigned int dev) +{ + BUG_ON(dev >= ARRAY_SIZE(lbars)); + return lbars[dev].base; +} +EXPORT_SYMBOL_GPL(geode_get_dev_base); + +/* === GPIO API === */ + +void geode_gpio_set(unsigned int gpio, unsigned int reg) +{ + u32 base = geode_get_dev_base(GEODE_DEV_GPIO); + + if (!base) + return; + + if (gpio < 16) + outl(1 << gpio, base + reg); + else + outl(1 << (gpio - 16), base + 0x80 + reg); +} +EXPORT_SYMBOL_GPL(geode_gpio_set); + +void geode_gpio_clear(unsigned int gpio, unsigned int reg) +{ + u32 base = geode_get_dev_base(GEODE_DEV_GPIO); + + if (!base) + return; + + if (gpio < 16) + outl(1 << (gpio + 16), base + reg); + else + outl(1 << gpio, base + 0x80 + reg); +} +EXPORT_SYMBOL_GPL(geode_gpio_clear); + +int geode_gpio_isset(unsigned int gpio, unsigned int reg) +{ + u32 base = geode_get_dev_base(GEODE_DEV_GPIO); + + if (!base) + return 0; + + if (gpio < 16) + return (inl(base + reg) & (1 << gpio)) ? 1 : 0; + else + return (inl(base + 0x80 + reg) & (1 << (gpio - 16))) ? 1 : 0; +} +EXPORT_SYMBOL_GPL(geode_gpio_isset); + +void geode_gpio_set_irq(unsigned int group, unsigned int irq) +{ + u32 lo, hi; + + if (group > 7 || irq > 15) + return; + + rdmsr(MSR_PIC_ZSEL_HIGH, lo, hi); + + lo &= ~(0xF << (group * 4)); + lo |= (irq & 0xF) << (group * 4); + + wrmsr(MSR_PIC_ZSEL_HIGH, lo, hi); +} +EXPORT_SYMBOL_GPL(geode_gpio_set_irq); + +void geode_gpio_setup_event(unsigned int gpio, int pair, int pme) +{ + u32 base = geode_get_dev_base(GEODE_DEV_GPIO); + u32 offset, shift, val; + + if (gpio >= 24) + offset = GPIO_MAP_W; + else if (gpio >= 16) + offset = GPIO_MAP_Z; + else if (gpio >= 8) + offset = GPIO_MAP_Y; + else + offset = GPIO_MAP_X; + + shift = (gpio % 8) * 4; + + val = inl(base + offset); + + /* Clear whatever was there before */ + val &= ~(0xF << shift); + + /* And set the new value */ + + val |= ((pair & 7) << shift); + + /* Set the PME bit if this is a PME event */ + + if (pme) + val |= (1 << (shift + 3)); + + outl(val, base + offset); +} +EXPORT_SYMBOL_GPL(geode_gpio_setup_event); + +static int __init geode_southbridge_init(void) +{ + if (!is_geode()) + return -ENODEV; + + init_lbars(); + return 0; +} + +postcore_initcall(geode_southbridge_init); diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index f74dfc419b5..7c52b222207 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -168,6 +168,12 @@ page_pde_offset = (__PAGE_OFFSET >> 20); .section .init.text,"ax",@progbits #endif + /* Do an early initialization of the fixmap area */ + movl $(swapper_pg_dir - __PAGE_OFFSET), %edx + movl $(swapper_pg_pmd - __PAGE_OFFSET), %eax + addl $0x007, %eax /* 0x007 = PRESENT+RW+USER */ + movl %eax, 4092(%edx) + #ifdef CONFIG_SMP ENTRY(startup_32_smp) cld @@ -504,9 +510,12 @@ ENTRY(_stext) /* * BSS section */ -.section ".bss.page_aligned","w" +.section ".bss.page_aligned","wa" + .align PAGE_SIZE_asm ENTRY(swapper_pg_dir) .fill 1024,4,0 +ENTRY(swapper_pg_pmd) + .fill 1024,4,0 ENTRY(empty_zero_page) .fill 4096,1,0 @@ -530,6 +539,8 @@ fault_msg: .ascii "Int %d: CR2 %p err %p EIP %p CS %p flags %p\n" .asciz "Stack: %p %p %p %p %p %p %p %p\n" +#include "../xen/xen-head.S" + /* * The IDT and GDT 'descriptors' are a strange 48-bit object * only used by the lidt and lgdt instructions. They are not diff --git a/arch/i386/kernel/hpet.c b/arch/i386/kernel/hpet.c index 17d73459fc5..533d4932bc7 100644 --- a/arch/i386/kernel/hpet.c +++ b/arch/i386/kernel/hpet.c @@ -5,6 +5,7 @@ #include <linux/init.h> #include <linux/sysdev.h> #include <linux/pm.h> +#include <linux/delay.h> #include <asm/hpet.h> #include <asm/io.h> @@ -187,6 +188,10 @@ static void hpet_set_mode(enum clock_event_mode mode, cfg &= ~HPET_TN_ENABLE; hpet_writel(cfg, HPET_T0_CFG); break; + + case CLOCK_EVT_MODE_RESUME: + hpet_enable_int(); + break; } } @@ -217,6 +222,7 @@ static struct clocksource clocksource_hpet = { .mask = HPET_MASK, .shift = HPET_SHIFT, .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .resume = hpet_start_counter, }; /* @@ -226,7 +232,8 @@ int __init hpet_enable(void) { unsigned long id; uint64_t hpet_freq; - u64 tmp; + u64 tmp, start, now; + cycle_t t1; if (!is_hpet_capable()) return 0; @@ -273,6 +280,27 @@ int __init hpet_enable(void) /* Start the counter */ hpet_start_counter(); + /* Verify whether hpet counter works */ + t1 = read_hpet(); + rdtscll(start); + + /* + * We don't know the TSC frequency yet, but waiting for + * 200000 TSC cycles is safe: + * 4 GHz == 50us + * 1 GHz == 200us + */ + do { + rep_nop(); + rdtscll(now); + } while ((now - start) < 200000UL); + + if (t1 == read_hpet()) { + printk(KERN_WARNING + "HPET counter not counting. HPET disabled\n"); + goto out_nohpet; + } + /* Initialize and register HPET clocksource * * hpet period is in femto seconds per cycle @@ -291,7 +319,6 @@ int __init hpet_enable(void) clocksource_register(&clocksource_hpet); - if (id & HPET_ID_LEGSUP) { hpet_enable_int(); hpet_reserve_platform_timers(id); @@ -299,7 +326,7 @@ int __init hpet_enable(void) * Start hpet with the boot cpu mask and make it * global after the IO_APIC has been initialized. */ - hpet_clockevent.cpumask =cpumask_of_cpu(0); + hpet_clockevent.cpumask = cpumask_of_cpu(smp_processor_id()); clockevents_register_device(&hpet_clockevent); global_clock_event = &hpet_clockevent; return 1; @@ -524,68 +551,3 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } #endif - - -/* - * Suspend/resume part - */ - -#ifdef CONFIG_PM - -static int hpet_suspend(struct sys_device *sys_device, pm_message_t state) -{ - unsigned long cfg = hpet_readl(HPET_CFG); - - cfg &= ~(HPET_CFG_ENABLE|HPET_CFG_LEGACY); - hpet_writel(cfg, HPET_CFG); - - return 0; -} - -static int hpet_resume(struct sys_device *sys_device) -{ - unsigned int id; - - hpet_start_counter(); - - id = hpet_readl(HPET_ID); - - if (id & HPET_ID_LEGSUP) - hpet_enable_int(); - - return 0; -} - -static struct sysdev_class hpet_class = { - set_kset_name("hpet"), - .suspend = hpet_suspend, - .resume = hpet_resume, -}; - -static struct sys_device hpet_device = { - .id = 0, - .cls = &hpet_class, -}; - - -static __init int hpet_register_sysfs(void) -{ - int err; - - if (!is_hpet_capable()) - return 0; - - err = sysdev_class_register(&hpet_class); - - if (!err) { - err = sysdev_register(&hpet_device); - if (err) - sysdev_class_unregister(&hpet_class); - } - - return err; -} - -device_initcall(hpet_register_sysfs); - -#endif diff --git a/arch/i386/kernel/i8253.c b/arch/i386/kernel/i8253.c index f8a3c4054c7..6d839f2f1b1 100644 --- a/arch/i386/kernel/i8253.c +++ b/arch/i386/kernel/i8253.c @@ -3,18 +3,17 @@ * */ #include <linux/clockchips.h> -#include <linux/spinlock.h> +#include <linux/init.h> +#include <linux/interrupt.h> #include <linux/jiffies.h> -#include <linux/sysdev.h> #include <linux/module.h> -#include <linux/init.h> +#include <linux/spinlock.h> #include <asm/smp.h> #include <asm/delay.h> #include <asm/i8253.h> #include <asm/io.h> - -#include "io_ports.h" +#include <asm/timer.h> DEFINE_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); @@ -41,26 +40,27 @@ static void init_pit_timer(enum clock_event_mode mode, case CLOCK_EVT_MODE_PERIODIC: /* binary, mode 2, LSB/MSB, ch 0 */ outb_p(0x34, PIT_MODE); - udelay(10); outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ - udelay(10); outb(LATCH >> 8 , PIT_CH0); /* MSB */ break; - /* - * Avoid unnecessary state transitions, as it confuses - * Geode / Cyrix based boxen. - */ case CLOCK_EVT_MODE_SHUTDOWN: - if (evt->mode == CLOCK_EVT_MODE_UNUSED) - break; case CLOCK_EVT_MODE_UNUSED: - if (evt->mode == CLOCK_EVT_MODE_SHUTDOWN) - break; + if (evt->mode == CLOCK_EVT_MODE_PERIODIC || + evt->mode == CLOCK_EVT_MODE_ONESHOT) { + outb_p(0x30, PIT_MODE); + outb_p(0, PIT_CH0); + outb_p(0, PIT_CH0); + } + break; + case CLOCK_EVT_MODE_ONESHOT: /* One shot setup */ outb_p(0x38, PIT_MODE); - udelay(10); + break; + + case CLOCK_EVT_MODE_RESUME: + /* Nothing to do here */ break; } spin_unlock_irqrestore(&i8253_lock, flags); diff --git a/arch/i386/kernel/init_task.c b/arch/i386/kernel/init_task.c index cff95d10a4d..d26fc063a76 100644 --- a/arch/i386/kernel/init_task.c +++ b/arch/i386/kernel/init_task.c @@ -42,5 +42,5 @@ EXPORT_SYMBOL(init_task); * per-CPU TSS segments. Threads are completely 'soft' on Linux, * no more per-task TSS's. */ -DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS; +DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 7f8b7af2b95..893df828075 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -353,14 +353,6 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) # include <linux/slab.h> /* kmalloc() */ # include <linux/timer.h> /* time_after() */ -#ifdef CONFIG_BALANCED_IRQ_DEBUG -# define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0) -# define Dprintk(x...) do { TDprintk(x); } while (0) -# else -# define TDprintk(x...) -# define Dprintk(x...) -# endif - #define IRQBALANCE_CHECK_ARCH -999 #define MAX_BALANCED_IRQ_INTERVAL (5*HZ) #define MIN_BALANCED_IRQ_INTERVAL (HZ/2) @@ -443,7 +435,7 @@ static inline void balance_irq(int cpu, int irq) static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) { int i, j; - Dprintk("Rotating IRQs among CPUs.\n"); + for_each_online_cpu(i) { for (j = 0; j < NR_IRQS; j++) { if (!irq_desc[j].action) @@ -560,19 +552,11 @@ tryanothercpu: max_loaded = tmp_loaded; /* processor */ imbalance = (max_cpu_irq - min_cpu_irq) / 2; - Dprintk("max_loaded cpu = %d\n", max_loaded); - Dprintk("min_loaded cpu = %d\n", min_loaded); - Dprintk("max_cpu_irq load = %ld\n", max_cpu_irq); - Dprintk("min_cpu_irq load = %ld\n", min_cpu_irq); - Dprintk("load imbalance = %lu\n", imbalance); - /* if imbalance is less than approx 10% of max load, then * observe diminishing returns action. - quit */ - if (imbalance < (max_cpu_irq >> 3)) { - Dprintk("Imbalance too trivial\n"); + if (imbalance < (max_cpu_irq >> 3)) goto not_worth_the_effort; - } tryanotherirq: /* if we select an IRQ to move that can't go where we want, then @@ -629,9 +613,6 @@ tryanotherirq: cpus_and(tmp, target_cpu_mask, allowed_mask); if (!cpus_empty(tmp)) { - - Dprintk("irq = %d moved to cpu = %d\n", - selected_irq, min_loaded); /* mark for change destination */ set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded)); @@ -651,7 +632,6 @@ not_worth_the_effort: */ balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL, balanced_irq_interval + BALANCED_IRQ_MORE_DELTA); - Dprintk("IRQ worth rotating not found\n"); return; } @@ -667,6 +647,7 @@ static int balanced_irq(void *unused) set_pending_irq(i, cpumask_of_cpu(0)); } + set_freezable(); for ( ; ; ) { time_remaining = schedule_timeout_interruptible(time_remaining); try_to_freeze(); @@ -1901,7 +1882,7 @@ __setup("no_timer_check", notimercheck); * - if this function detects that timer IRQs are defunct, then we fall * back to ISA timer IRQs */ -int __init timer_irq_works(void) +static int __init timer_irq_works(void) { unsigned long t1 = jiffies; diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c index d2daf672f4a..dd2b97fc00b 100644 --- a/arch/i386/kernel/irq.c +++ b/arch/i386/kernel/irq.c @@ -21,7 +21,7 @@ #include <asm/apic.h> #include <asm/uaccess.h> -DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp; +DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); EXPORT_PER_CPU_SYMBOL(irq_stat); DEFINE_PER_CPU(struct pt_regs *, irq_regs); @@ -149,15 +149,11 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs) #ifdef CONFIG_4KSTACKS -/* - * These should really be __section__(".bss.page_aligned") as well, but - * gcc's 3.0 and earlier don't handle that correctly. - */ static char softirq_stack[NR_CPUS * THREAD_SIZE] - __attribute__((__aligned__(THREAD_SIZE))); + __attribute__((__section__(".bss.page_aligned"))); static char hardirq_stack[NR_CPUS * THREAD_SIZE] - __attribute__((__aligned__(THREAD_SIZE))); + __attribute__((__section__(".bss.page_aligned"))); /* * allocate per-cpu stacks for hardirq and for softirq processing diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c index dde828a333c..448a50b1324 100644 --- a/arch/i386/kernel/kprobes.c +++ b/arch/i386/kernel/kprobes.c @@ -35,6 +35,7 @@ #include <asm/cacheflush.h> #include <asm/desc.h> #include <asm/uaccess.h> +#include <asm/alternative.h> void jprobe_return_end(void); @@ -169,16 +170,12 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) void __kprobes arch_arm_kprobe(struct kprobe *p) { - *p->addr = BREAKPOINT_INSTRUCTION; - flush_icache_range((unsigned long) p->addr, - (unsigned long) p->addr + sizeof(kprobe_opcode_t)); + text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1); } void __kprobes arch_disarm_kprobe(struct kprobe *p) { - *p->addr = p->opcode; - flush_icache_range((unsigned long) p->addr, - (unsigned long) p->addr + sizeof(kprobe_opcode_t)); + text_poke(p->addr, &p->opcode, 1); } void __kprobes arch_remove_kprobe(struct kprobe *p) diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index fba121f7973..99beac7f96c 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -295,7 +295,7 @@ static unsigned int last_irq_sums [NR_CPUS], alert_counter [NR_CPUS]; -void touch_nmi_watchdog (void) +void touch_nmi_watchdog(void) { if (nmi_watchdog > 0) { unsigned cpu; @@ -304,8 +304,10 @@ void touch_nmi_watchdog (void) * Just reset the alert counters, (other CPUs might be * spinning on locks we hold): */ - for_each_present_cpu (cpu) - alert_counter[cpu] = 0; + for_each_present_cpu(cpu) { + if (alert_counter[cpu]) + alert_counter[cpu] = 0; + } } /* @@ -351,7 +353,7 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) * Take the local apic timer and PIT/HPET into account. We don't * know which one is active, when we have highres/dyntick on */ - sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_irqs(0); + sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_cpu(cpu).irqs[0]; /* if the none of the timers isn't firing, this cpu isn't doing much */ if (!touched && last_irq_sums[cpu] == sum) { diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index faab09abca5..ea962c0667d 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -124,20 +124,28 @@ unsigned paravirt_patch_ignore(unsigned len) return len; } +struct branch { + unsigned char opcode; + u32 delta; +} __attribute__((packed)); + unsigned paravirt_patch_call(void *target, u16 tgt_clobbers, void *site, u16 site_clobbers, unsigned len) { unsigned char *call = site; unsigned long delta = (unsigned long)target - (unsigned long)(call+5); + struct branch b; if (tgt_clobbers & ~site_clobbers) return len; /* target would clobber too much for this site */ if (len < 5) return len; /* call too long for patch site */ - *call++ = 0xe8; /* call */ - *(unsigned long *)call = delta; + b.opcode = 0xe8; /* call */ + b.delta = delta; + BUILD_BUG_ON(sizeof(b) != 5); + text_poke(call, (unsigned char *)&b, 5); return 5; } @@ -146,12 +154,14 @@ unsigned paravirt_patch_jmp(void *target, void *site, unsigned len) { unsigned char *jmp = site; unsigned long delta = (unsigned long)target - (unsigned long)(jmp+5); + struct branch b; if (len < 5) return len; /* call too long for patch site */ - *jmp++ = 0xe9; /* jmp */ - *(unsigned long *)jmp = delta; + b.opcode = 0xe9; /* jmp */ + b.delta = delta; + text_poke(jmp, (unsigned char *)&b, 5); return 5; } @@ -228,6 +238,41 @@ static int __init print_banner(void) } core_initcall(print_banner); +static struct resource reserve_ioports = { + .start = 0, + .end = IO_SPACE_LIMIT, + .name = "paravirt-ioport", + .flags = IORESOURCE_IO | IORESOURCE_BUSY, +}; + +static struct resource reserve_iomem = { + .start = 0, + .end = -1, + .name = "paravirt-iomem", + .flags = IORESOURCE_MEM | IORESOURCE_BUSY, +}; + +/* + * Reserve the whole legacy IO space to prevent any legacy drivers + * from wasting time probing for their hardware. This is a fairly + * brute-force approach to disabling all non-virtual drivers. + * + * Note that this must be called very early to have any effect. + */ +int paravirt_disable_iospace(void) +{ + int ret; + + ret = request_resource(&ioport_resource, &reserve_ioports); + if (ret == 0) { + ret = request_resource(&iomem_resource, &reserve_iomem); + if (ret) + release_resource(&reserve_ioports); + } + + return ret; +} + struct paravirt_ops paravirt_ops = { .name = "bare hardware", .paravirt_enabled = 0, @@ -267,7 +312,7 @@ struct paravirt_ops paravirt_ops = { .write_msr = native_write_msr_safe, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, - .get_scheduled_cycles = native_read_tsc, + .sched_clock = native_sched_clock, .get_cpu_khz = native_calculate_cpu_khz, .load_tr_desc = native_load_tr_desc, .set_ldt = native_set_ldt, diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 06dfa65ad18..84664710b78 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -300,6 +300,7 @@ early_param("idle", idle_setup); void show_regs(struct pt_regs * regs) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; + unsigned long d0, d1, d2, d3, d6, d7; printk("\n"); printk("Pid: %d, comm: %20s\n", current->pid, current->comm); @@ -324,6 +325,17 @@ void show_regs(struct pt_regs * regs) cr3 = read_cr3(); cr4 = read_cr4_safe(); printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); + + get_debugreg(d0, 0); + get_debugreg(d1, 1); + get_debugreg(d2, 2); + get_debugreg(d3, 3); + printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", + d0, d1, d2, d3); + get_debugreg(d6, 6); + get_debugreg(d7, 7); + printk("DR6: %08lx DR7: %08lx\n", d6, d7); + show_trace(NULL, regs, ®s->esp); } @@ -538,8 +550,31 @@ int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) return 1; } -static noinline void __switch_to_xtra(struct task_struct *next_p, - struct tss_struct *tss) +#ifdef CONFIG_SECCOMP +void hard_disable_TSC(void) +{ + write_cr4(read_cr4() | X86_CR4_TSD); +} +void disable_TSC(void) +{ + preempt_disable(); + if (!test_and_set_thread_flag(TIF_NOTSC)) + /* + * Must flip the CPU state synchronously with + * TIF_NOTSC in the current running context. + */ + hard_disable_TSC(); + preempt_enable(); +} +void hard_enable_TSC(void) +{ + write_cr4(read_cr4() & ~X86_CR4_TSD); +} +#endif /* CONFIG_SECCOMP */ + +static noinline void +__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, + struct tss_struct *tss) { struct thread_struct *next; @@ -555,6 +590,17 @@ static noinline void __switch_to_xtra(struct task_struct *next_p, set_debugreg(next->debugreg[7], 7); } +#ifdef CONFIG_SECCOMP + if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ + test_tsk_thread_flag(next_p, TIF_NOTSC)) { + /* prev and next are different */ + if (test_tsk_thread_flag(next_p, TIF_NOTSC)) + hard_disable_TSC(); + else + hard_enable_TSC(); + } +#endif + if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { /* * Disable the bitmap via an invalid offset. We still cache @@ -586,33 +632,6 @@ static noinline void __switch_to_xtra(struct task_struct *next_p, } /* - * This function selects if the context switch from prev to next - * has to tweak the TSC disable bit in the cr4. - */ -static inline void disable_tsc(struct task_struct *prev_p, - struct task_struct *next_p) -{ - struct thread_info *prev, *next; - - /* - * gcc should eliminate the ->thread_info dereference if - * has_secure_computing returns 0 at compile time (SECCOMP=n). - */ - prev = task_thread_info(prev_p); - next = task_thread_info(next_p); - - if (has_secure_computing(prev) || has_secure_computing(next)) { - /* slow path here */ - if (has_secure_computing(prev) && - !has_secure_computing(next)) { - write_cr4(read_cr4() & ~X86_CR4_TSD); - } else if (!has_secure_computing(prev) && - has_secure_computing(next)) - write_cr4(read_cr4() | X86_CR4_TSD); - } -} - -/* * switch_to(x,yn) should switch tasks from x to y. * * We fsave/fwait so that an exception goes off at the right time @@ -689,11 +708,9 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas /* * Now maybe handle debug registers and/or IO bitmaps */ - if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW) - || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))) - __switch_to_xtra(next_p, tss); - - disable_tsc(prev_p, next_p); + if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV || + task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) + __switch_to_xtra(prev_p, next_p, tss); /* * Leave lazy mode, flushing any hypercalls made here. diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c index 0c0ceec5de0..0c8f00e69c4 100644 --- a/arch/i386/kernel/ptrace.c +++ b/arch/i386/kernel/ptrace.c @@ -164,14 +164,22 @@ static unsigned long convert_eip_to_linear(struct task_struct *child, struct pt_ u32 *desc; unsigned long base; - down(&child->mm->context.sem); - desc = child->mm->context.ldt + (seg & ~7); - base = (desc[0] >> 16) | ((desc[1] & 0xff) << 16) | (desc[1] & 0xff000000); + seg &= ~7UL; - /* 16-bit code segment? */ - if (!((desc[1] >> 22) & 1)) - addr &= 0xffff; - addr += base; + down(&child->mm->context.sem); + if (unlikely((seg >> 3) >= child->mm->context.size)) + addr = -1L; /* bogus selector, access would fault */ + else { + desc = child->mm->context.ldt + seg; + base = ((desc[0] >> 16) | + ((desc[1] & 0xff) << 16) | + (desc[1] & 0xff000000)); + + /* 16-bit code segment? */ + if (!((desc[1] >> 22) & 1)) + addr &= 0xffff; + addr += base; + } up(&child->mm->context.sem); } return addr; @@ -358,17 +366,9 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) switch (request) { /* when I and D space are separate, these will need to be fixed. */ case PTRACE_PEEKTEXT: /* read word at location addr. */ - case PTRACE_PEEKDATA: { - unsigned long tmp; - int copied; - - copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0); - ret = -EIO; - if (copied != sizeof(tmp)) - break; - ret = put_user(tmp, datap); + case PTRACE_PEEKDATA: + ret = generic_ptrace_peekdata(child, addr, data); break; - } /* read the word at location addr in the USER area. */ case PTRACE_PEEKUSR: { @@ -395,10 +395,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) /* when I and D space are separate, this will have to be fixed. */ case PTRACE_POKETEXT: /* write the word at location addr. */ case PTRACE_POKEDATA: - ret = 0; - if (access_process_vm(child, addr, &data, sizeof(data), 1) == sizeof(data)) - break; - ret = -EIO; + ret = generic_ptrace_pokedata(child, addr, data); break; case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c index 5513f8d5b5b..0d796248866 100644 --- a/arch/i386/kernel/reboot.c +++ b/arch/i386/kernel/reboot.c @@ -113,6 +113,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"), }, }, + { /* Handle problems with rebooting on Dell Optiplex 745's SFF*/ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 745", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), + DMI_MATCH(DMI_BOARD_NAME, "0WF810"), + }, + }, { /* Handle problems with rebooting on Dell 2400's */ .callback = set_bios_reboot, .ident = "Dell PowerEdge 2400", diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 698c24fe482..d474cd639bc 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -102,19 +102,10 @@ static unsigned int highmem_pages = -1; /* * Setup options */ -struct drive_info_struct { char dummy[32]; } drive_info; -#if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || \ - defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE) -EXPORT_SYMBOL(drive_info); -#endif struct screen_info screen_info; EXPORT_SYMBOL(screen_info); struct apm_info apm_info; EXPORT_SYMBOL(apm_info); -struct sys_desc_table_struct { - unsigned short length; - unsigned char table[0]; -}; struct edid_info edid_info; EXPORT_SYMBOL_GPL(edid_info); struct ist_info ist_info; @@ -134,7 +125,7 @@ unsigned long saved_videomode; static char __initdata command_line[COMMAND_LINE_SIZE]; -unsigned char __initdata boot_params[PARAM_SIZE]; +struct boot_params __initdata boot_params; #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) struct edd edd; @@ -282,18 +273,18 @@ unsigned long __init find_max_low_pfn(void) printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20); if (max_pfn > MAX_NONPAE_PFN) - printk(KERN_WARNING "Use a PAE enabled kernel.\n"); + printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n"); else printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); max_pfn = MAXMEM_PFN; #else /* !CONFIG_HIGHMEM */ -#ifndef CONFIG_X86_PAE +#ifndef CONFIG_HIGHMEM64G if (max_pfn > MAX_NONPAE_PFN) { max_pfn = MAX_NONPAE_PFN; printk(KERN_WARNING "Warning only 4GB will be used.\n"); - printk(KERN_WARNING "Use a PAE enabled kernel.\n"); + printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n"); } -#endif /* !CONFIG_X86_PAE */ +#endif /* !CONFIG_HIGHMEM64G */ #endif /* !CONFIG_HIGHMEM */ } else { if (highmem_pages == -1) @@ -475,7 +466,7 @@ void __init setup_bootmem_allocator(void) * * This should all compile down to nothing when NUMA is off. */ -void __init remapped_pgdat_init(void) +static void __init remapped_pgdat_init(void) { int nid; @@ -528,7 +519,6 @@ void __init setup_arch(char **cmdline_p) #endif ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV); - drive_info = DRIVE_INFO; screen_info = SCREEN_INFO; edid_info = EDID_INFO; apm_info.bios = APM_BIOS_INFO; @@ -611,6 +601,8 @@ void __init setup_arch(char **cmdline_p) * NOTE: at this point the bootmem allocator is fully available. */ + paravirt_post_allocator_init(); + dmi_scan_machine(); #ifdef CONFIG_X86_GENERICARCH @@ -648,6 +640,7 @@ void __init setup_arch(char **cmdline_p) #endif e820_register_memory(); + e820_mark_nosave_regions(); #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c index d574e38f0f7..f5dd85656c1 100644 --- a/arch/i386/kernel/signal.c +++ b/arch/i386/kernel/signal.c @@ -199,6 +199,13 @@ asmlinkage int sys_sigreturn(unsigned long __unused) return eax; badframe: + if (show_unhandled_signals && printk_ratelimit()) + printk("%s%s[%d] bad frame in sigreturn frame:%p eip:%lx" + " esp:%lx oeax:%lx\n", + current->pid > 1 ? KERN_INFO : KERN_EMERG, + current->comm, current->pid, frame, regs->eip, + regs->esp, regs->orig_eax); + force_sig(SIGSEGV, current); return 0; } diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index 6299c080f6e..2d35d850202 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -22,6 +22,7 @@ #include <asm/mtrr.h> #include <asm/tlbflush.h> +#include <asm/mmu_context.h> #include <mach_apic.h> /* @@ -249,13 +250,13 @@ static unsigned long flush_va; static DEFINE_SPINLOCK(tlbstate_lock); /* - * We cannot call mmdrop() because we are in interrupt context, + * We cannot call mmdrop() because we are in interrupt context, * instead update mm->cpu_vm_mask. * * We need to reload %cr3 since the page tables may be going * away from under us.. */ -static inline void leave_mm (unsigned long cpu) +void leave_mm(unsigned long cpu) { if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) BUG(); diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 0b2954534b8..e4f61d1c624 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -148,7 +148,7 @@ void __init smp_alloc_memory(void) * a given CPU */ -static void __cpuinit smp_store_cpu_info(int id) +void __cpuinit smp_store_cpu_info(int id) { struct cpuinfo_x86 *c = cpu_data + id; @@ -308,8 +308,7 @@ cpumask_t cpu_coregroup_map(int cpu) /* representing cpus for which sibling maps can be computed */ static cpumask_t cpu_sibling_setup_map; -static inline void -set_cpu_sibling_map(int cpu) +void __cpuinit set_cpu_sibling_map(int cpu) { int i; struct cpuinfo_x86 *c = cpu_data; @@ -1144,8 +1143,7 @@ void __init native_smp_prepare_boot_cpu(void) } #ifdef CONFIG_HOTPLUG_CPU -static void -remove_siblinginfo(int cpu) +void remove_siblinginfo(int cpu) { int sibling; struct cpuinfo_x86 *c = cpu_data; diff --git a/arch/i386/kernel/smpcommon.c b/arch/i386/kernel/smpcommon.c index 1868ae18eb4..bbfe85a0f69 100644 --- a/arch/i386/kernel/smpcommon.c +++ b/arch/i386/kernel/smpcommon.c @@ -47,7 +47,7 @@ int smp_call_function(void (*func) (void *info), void *info, int nonatomic, EXPORT_SYMBOL(smp_call_function); /** - * smp_call_function_single - Run a function on another CPU + * smp_call_function_single - Run a function on a specific CPU * @cpu: The target CPU. Cannot be the calling CPU. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. @@ -66,9 +66,11 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, int ret; int me = get_cpu(); if (cpu == me) { - WARN_ON(1); + local_irq_disable(); + func(info); + local_irq_enable(); put_cpu(); - return -EBUSY; + return 0; } ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait); diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index bf6adce5226..8344c70adf6 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S @@ -323,3 +323,4 @@ ENTRY(sys_call_table) .long sys_signalfd .long sys_timerfd .long sys_eventfd + .long sys_fallocate diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c index ff4ee6f3326..6deb159d08e 100644 --- a/arch/i386/kernel/sysenter.c +++ b/arch/i386/kernel/sysenter.c @@ -336,7 +336,9 @@ struct vm_area_struct *get_gate_vma(struct task_struct *tsk) int in_gate_area(struct task_struct *task, unsigned long addr) { - return 0; + const struct vm_area_struct *vma = get_gate_vma(task); + + return vma && addr >= vma->vm_start && addr < vma->vm_end; } int in_gate_area_no_task(unsigned long addr) diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index a665df61f08..19a6c678d02 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -207,55 +207,9 @@ unsigned long read_persistent_clock(void) return retval; } -static void sync_cmos_clock(unsigned long dummy); - -static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); -int no_sync_cmos_clock; - -static void sync_cmos_clock(unsigned long dummy) -{ - struct timeval now, next; - int fail = 1; - - /* - * If we have an externally synchronized Linux clock, then update - * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be - * called as close as possible to 500 ms before the new second starts. - * This code is run on a timer. If the clock is set, that timer - * may not expire at the correct time. Thus, we adjust... - */ - if (!ntp_synced()) - /* - * Not synced, exit, do not restart a timer (if one is - * running, let it run out). - */ - return; - - do_gettimeofday(&now); - if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 && - now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2) - fail = set_rtc_mmss(now.tv_sec); - - next.tv_usec = USEC_AFTER - now.tv_usec; - if (next.tv_usec <= 0) - next.tv_usec += USEC_PER_SEC; - - if (!fail) - next.tv_sec = 659; - else - next.tv_sec = 0; - - if (next.tv_usec >= USEC_PER_SEC) { - next.tv_sec++; - next.tv_usec -= USEC_PER_SEC; - } - mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next)); -} - -void notify_arch_cmos_timer(void) +int update_persistent_clock(struct timespec now) { - if (!no_sync_cmos_clock) - mod_timer(&sync_cmos_timer, jiffies + 1); + return set_rtc_mmss(now.tv_sec); } extern void (*late_time_init)(void); diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 90da0575fcf..cfffe3dd9e8 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -41,6 +41,10 @@ #include <linux/mca.h> #endif +#if defined(CONFIG_EDAC) +#include <linux/edac.h> +#endif + #include <asm/processor.h> #include <asm/system.h> #include <asm/io.h> @@ -148,7 +152,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (!stack) { unsigned long dummy; stack = &dummy; - if (task && task != current) + if (task != current) stack = (unsigned long *)task->thread.esp; } @@ -207,6 +211,7 @@ static void print_trace_address(void *data, unsigned long addr) { printk("%s [<%08lx>] ", (char *)data, addr); print_symbol("%s\n", addr); + touch_nmi_watchdog(); } static struct stacktrace_ops print_trace_ops = { @@ -390,7 +395,7 @@ void die(const char * str, struct pt_regs * regs, long err) unsigned long esp; unsigned short ss; - report_bug(regs->eip); + report_bug(regs->eip, regs); printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter); #ifdef CONFIG_PREEMPT @@ -433,6 +438,7 @@ void die(const char * str, struct pt_regs * regs, long err) bust_spinlocks(0); die.lock_owner = -1; + add_taint(TAINT_DIE); spin_unlock_irqrestore(&die.lock, flags); if (!regs) @@ -517,10 +523,12 @@ fastcall void do_##name(struct pt_regs * regs, long error_code) \ do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ } -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \ fastcall void do_##name(struct pt_regs * regs, long error_code) \ { \ siginfo_t info; \ + if (irq) \ + local_irq_enable(); \ info.si_signo = signr; \ info.si_errno = 0; \ info.si_code = sicode; \ @@ -560,13 +568,13 @@ DO_VM86_ERROR( 3, SIGTRAP, "int3", int3) #endif DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow) DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip) +DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip, 0) DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) DO_ERROR(12, SIGBUS, "stack segment", stack_segment) -DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) -DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0) +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) +DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1) fastcall void __kprobes do_general_protection(struct pt_regs * regs, long error_code) @@ -610,6 +618,13 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs, current->thread.error_code = error_code; current->thread.trap_no = 13; + if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) && + printk_ratelimit()) + printk(KERN_INFO + "%s[%d] general protection eip:%lx esp:%lx error:%lx\n", + current->comm, current->pid, + regs->eip, regs->esp, error_code); + force_sig(SIGSEGV, current); return; @@ -635,6 +650,14 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs) printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on " "CPU %d.\n", reason, smp_processor_id()); printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); + +#if defined(CONFIG_EDAC) + if(edac_handler_set()) { + edac_atomic_assert_error(); + return; + } +#endif + if (panic_on_unrecovered_nmi) panic("NMI: Not continuing"); @@ -752,6 +775,8 @@ static __kprobes void default_do_nmi(struct pt_regs * regs) reassert_nmi(); } +static int ignore_nmis; + fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code) { int cpu; @@ -762,11 +787,24 @@ fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code) ++nmi_count(cpu); - default_do_nmi(regs); + if (!ignore_nmis) + default_do_nmi(regs); nmi_exit(); } +void stop_nmi(void) +{ + acpi_nmi_disable(); + ignore_nmis++; +} + +void restart_nmi(void) +{ + ignore_nmis--; + acpi_nmi_enable(); +} + #ifdef CONFIG_KPROBES fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code) { @@ -1053,6 +1091,7 @@ asmlinkage void math_state_restore(void) thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ tsk->fpu_counter++; } +EXPORT_SYMBOL_GPL(math_state_restore); #ifndef CONFIG_MATH_EMULATION diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c index ea63a30ca3e..debd7dbb415 100644 --- a/arch/i386/kernel/tsc.c +++ b/arch/i386/kernel/tsc.c @@ -27,6 +27,7 @@ static int tsc_enabled; * an extra value to store the TSC freq */ unsigned int tsc_khz; +EXPORT_SYMBOL_GPL(tsc_khz); int tsc_disable; @@ -58,10 +59,11 @@ __setup("notsc", tsc_setup); */ static int tsc_unstable; -static inline int check_tsc_unstable(void) +int check_tsc_unstable(void) { return tsc_unstable; } +EXPORT_SYMBOL_GPL(check_tsc_unstable); /* Accellerators for sched_clock() * convert from cycles(64bits) => nanoseconds (64bits) @@ -84,7 +86,7 @@ static inline int check_tsc_unstable(void) * * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ -static unsigned long cyc2ns_scale __read_mostly; +unsigned long cyc2ns_scale __read_mostly; #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ @@ -93,15 +95,10 @@ static inline void set_cyc2ns_scale(unsigned long cpu_khz) cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; } -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; -} - /* * Scheduler clock - returns current time in nanosec units. */ -unsigned long long sched_clock(void) +unsigned long long native_sched_clock(void) { unsigned long long this_offset; @@ -118,12 +115,24 @@ unsigned long long sched_clock(void) return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); /* read the Time Stamp Counter: */ - get_scheduled_cycles(this_offset); + rdtscll(this_offset); /* return the value in ns */ return cycles_2_ns(this_offset); } +/* We need to define a real function for sched_clock, to override the + weak default version */ +#ifdef CONFIG_PARAVIRT +unsigned long long sched_clock(void) +{ + return paravirt_sched_clock(); +} +#else +unsigned long long sched_clock(void) + __attribute__((alias("native_sched_clock"))); +#endif + unsigned long native_calculate_cpu_khz(void) { unsigned long long start, end; diff --git a/arch/i386/kernel/verify_cpu.S b/arch/i386/kernel/verify_cpu.S deleted file mode 100644 index f1d1eacf4ab..00000000000 --- a/arch/i386/kernel/verify_cpu.S +++ /dev/null @@ -1,94 +0,0 @@ -/* Check if CPU has some minimum CPUID bits - This runs in 16bit mode so that the caller can still use the BIOS - to output errors on the screen */ -#include <asm/cpufeature.h> -#include <asm/msr.h> - -verify_cpu: - pushfl # Save caller passed flags - pushl $0 # Kill any dangerous flags - popfl - -#if CONFIG_X86_MINIMUM_CPU_MODEL >= 4 - pushfl - pop %eax - orl $(1<<18),%eax # try setting AC - push %eax - popfl - pushfl - popl %eax - testl $(1<<18),%eax - jz bad -#endif -#if REQUIRED_MASK1 != 0 - pushfl # standard way to check for cpuid - popl %eax - movl %eax,%ebx - xorl $0x200000,%eax - pushl %eax - popfl - pushfl - popl %eax - cmpl %eax,%ebx - pushfl # standard way to check for cpuid - popl %eax - movl %eax,%ebx - xorl $0x200000,%eax - pushl %eax - popfl - pushfl - popl %eax - cmpl %eax,%ebx - jz bad # REQUIRED_MASK1 != 0 requires CPUID - - movl $0x0,%eax # See if cpuid 1 is implemented - cpuid - cmpl $0x1,%eax - jb bad # no cpuid 1 - -#if REQUIRED_MASK1 & NEED_CMPXCHG64 - /* Some VIA C3s need magic MSRs to enable CX64. Do this here */ - cmpl $0x746e6543,%ebx # Cent - jne 1f - cmpl $0x48727561,%edx # aurH - jne 1f - cmpl $0x736c7561,%ecx # auls - jne 1f - movl $1,%eax # check model - cpuid - movl %eax,%ebx - shr $8,%ebx - andl $0xf,%ebx - cmp $6,%ebx # check family == 6 - jne 1f - shr $4,%eax - andl $0xf,%eax - cmpl $6,%eax # check model >= 6 - jb 1f - # assume models >= 6 all support this MSR - movl $MSR_VIA_FCR,%ecx - rdmsr - orl $((1<<1)|(1<<7)),%eax # enable CMPXCHG64 and PGE - wrmsr -1: -#endif - movl $0x1,%eax # Does the cpu have what it takes - cpuid - -#if CONFIG_X86_MINIMUM_CPU_MODEL > 4 -#error add proper model checking here -#endif - - andl $REQUIRED_MASK1,%edx - xorl $REQUIRED_MASK1,%edx - jnz bad -#endif /* REQUIRED_MASK1 */ - - popfl - xor %eax,%eax - ret - -bad: - popfl - movl $1,%eax - ret diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c index c12720d7cbc..72042bb7ec9 100644 --- a/arch/i386/kernel/vmi.c +++ b/arch/i386/kernel/vmi.c @@ -362,7 +362,7 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) } #endif -static void vmi_allocate_pt(u32 pfn) +static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn) { vmi_set_page_type(pfn, VMI_PAGE_L1); vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); @@ -891,7 +891,7 @@ static inline int __init activate_vmi(void) paravirt_ops.setup_boot_clock = vmi_time_bsp_init; paravirt_ops.setup_secondary_clock = vmi_time_ap_init; #endif - paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles; + paravirt_ops.sched_clock = vmi_sched_clock; paravirt_ops.get_cpu_khz = vmi_cpu_khz; /* We have true wallclock functions; disable CMOS clock sync */ diff --git a/arch/i386/kernel/vmiclock.c b/arch/i386/kernel/vmiclock.c index 26a37f8a876..b1b5ab08b26 100644 --- a/arch/i386/kernel/vmiclock.c +++ b/arch/i386/kernel/vmiclock.c @@ -32,6 +32,7 @@ #include <asm/apicdef.h> #include <asm/apic.h> #include <asm/timer.h> +#include <asm/i8253.h> #include <irq_vectors.h> #include "io_ports.h" @@ -64,10 +65,10 @@ int vmi_set_wallclock(unsigned long now) return 0; } -/* paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles */ -unsigned long long vmi_get_sched_cycles(void) +/* paravirt_ops.sched_clock = vmi_sched_clock */ +unsigned long long vmi_sched_clock(void) { - return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE); + return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE)); } /* paravirt_ops.get_cpu_khz = vmi_cpu_khz */ @@ -142,6 +143,7 @@ static void vmi_timer_set_mode(enum clock_event_mode mode, switch (mode) { case CLOCK_EVT_MODE_ONESHOT: + case CLOCK_EVT_MODE_RESUME: break; case CLOCK_EVT_MODE_PERIODIC: cycles_per_hz = vmi_timer_ops.get_cycle_frequency(); diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index aa87b06c7c8..7d72cce0052 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -60,7 +60,9 @@ SECTIONS __stop___ex_table = .; } - BUG_TABLE + NOTES :text :note + + BUG_TABLE :text . = ALIGN(4); .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { @@ -88,6 +90,7 @@ SECTIONS . = ALIGN(4096); .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { + *(.data.page_aligned) *(.data.idt) } @@ -180,6 +183,7 @@ SECTIONS .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { __per_cpu_start = .; *(.data.percpu) + *(.data.percpu.shared_aligned) __per_cpu_end = .; } . = ALIGN(4096); @@ -206,6 +210,4 @@ SECTIONS STABS_DEBUG DWARF_DEBUG - - NOTES } diff --git a/arch/i386/kernel/vsyscall-note.S b/arch/i386/kernel/vsyscall-note.S index d4b5be4f3d5..07c0daf7823 100644 --- a/arch/i386/kernel/vsyscall-note.S +++ b/arch/i386/kernel/vsyscall-note.S @@ -3,23 +3,43 @@ * Here we can supply some information useful to userland. */ -#include <linux/uts.h> #include <linux/version.h> +#include <linux/elfnote.h> -#define ASM_ELF_NOTE_BEGIN(name, flags, vendor, type) \ - .section name, flags; \ - .balign 4; \ - .long 1f - 0f; /* name length */ \ - .long 3f - 2f; /* data length */ \ - .long type; /* note type */ \ -0: .asciz vendor; /* vendor name */ \ -1: .balign 4; \ -2: +/* Ideally this would use UTS_NAME, but using a quoted string here + doesn't work. Remember to change this when changing the + kernel's name. */ +ELFNOTE_START(Linux, 0, "a") + .long LINUX_VERSION_CODE +ELFNOTE_END -#define ASM_ELF_NOTE_END \ -3: .balign 4; /* pad out section */ \ - .previous +#ifdef CONFIG_XEN +/* + * Add a special note telling glibc's dynamic linker a fake hardware + * flavor that it will use to choose the search path for libraries in the + * same way it uses real hardware capabilities like "mmx". + * We supply "nosegneg" as the fake capability, to indicate that we + * do not like negative offsets in instructions using segment overrides, + * since we implement those inefficiently. This makes it possible to + * install libraries optimized to avoid those access patterns in someplace + * like /lib/i686/tls/nosegneg. Note that an /etc/ld.so.conf.d/file + * corresponding to the bits here is needed to make ldconfig work right. + * It should contain: + * hwcap 1 nosegneg + * to match the mapping of bit to name that we give here. + * + * At runtime, the fake hardware feature will be considered to be present + * if its bit is set in the mask word. So, we start with the mask 0, and + * at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen. + */ - ASM_ELF_NOTE_BEGIN(".note.kernel-version", "a", UTS_SYSNAME, 0) - .long LINUX_VERSION_CODE - ASM_ELF_NOTE_END +#include "../xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */ + + .globl VDSO_NOTE_MASK +ELFNOTE_START(GNU, 2, "a") + .long 1 /* ncaps */ +VDSO_NOTE_MASK: + .long 0 /* mask */ + .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */ +ELFNOTE_END +#endif diff --git a/arch/i386/lib/Makefile b/arch/i386/lib/Makefile index 22d8ac5815f..4d105fdfe81 100644 --- a/arch/i386/lib/Makefile +++ b/arch/i386/lib/Makefile @@ -4,7 +4,7 @@ lib-y = checksum.o delay.o usercopy.o getuser.o putuser.o memcpy.o strstr.o \ - bitops.o semaphore.o + bitops.o semaphore.o string.o lib-$(CONFIG_X86_USE_3DNOW) += mmx.o diff --git a/arch/i386/lib/string.c b/arch/i386/lib/string.c new file mode 100644 index 00000000000..2c773fefa3d --- /dev/null +++ b/arch/i386/lib/string.c @@ -0,0 +1,257 @@ +/* + * Most of the string-functions are rather heavily hand-optimized, + * see especially strsep,strstr,str[c]spn. They should work, but are not + * very easy to understand. Everything is done entirely within the register + * set, making the functions fast and clean. String instructions have been + * used through-out, making for "slightly" unclear code :-) + * + * AK: On P4 and K7 using non string instruction implementations might be faster + * for large memory blocks. But most of them are unlikely to be used on large + * strings. + */ + +#include <linux/string.h> +#include <linux/module.h> + +#ifdef __HAVE_ARCH_STRCPY +char *strcpy(char * dest,const char *src) +{ + int d0, d1, d2; + asm volatile( "1:\tlodsb\n\t" + "stosb\n\t" + "testb %%al,%%al\n\t" + "jne 1b" + : "=&S" (d0), "=&D" (d1), "=&a" (d2) + :"0" (src),"1" (dest) : "memory"); + return dest; +} +EXPORT_SYMBOL(strcpy); +#endif + +#ifdef __HAVE_ARCH_STRNCPY +char *strncpy(char * dest,const char *src,size_t count) +{ + int d0, d1, d2, d3; + asm volatile( "1:\tdecl %2\n\t" + "js 2f\n\t" + "lodsb\n\t" + "stosb\n\t" + "testb %%al,%%al\n\t" + "jne 1b\n\t" + "rep\n\t" + "stosb\n" + "2:" + : "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3) + :"0" (src),"1" (dest),"2" (count) : "memory"); + return dest; +} +EXPORT_SYMBOL(strncpy); +#endif + +#ifdef __HAVE_ARCH_STRCAT +char *strcat(char * dest,const char * src) +{ + int d0, d1, d2, d3; + asm volatile( "repne\n\t" + "scasb\n\t" + "decl %1\n" + "1:\tlodsb\n\t" + "stosb\n\t" + "testb %%al,%%al\n\t" + "jne 1b" + : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3) + : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu): "memory"); + return dest; +} +EXPORT_SYMBOL(strcat); +#endif + +#ifdef __HAVE_ARCH_STRNCAT +char *strncat(char * dest,const char * src,size_t count) +{ + int d0, d1, d2, d3; + asm volatile( "repne\n\t" + "scasb\n\t" + "decl %1\n\t" + "movl %8,%3\n" + "1:\tdecl %3\n\t" + "js 2f\n\t" + "lodsb\n\t" + "stosb\n\t" + "testb %%al,%%al\n\t" + "jne 1b\n" + "2:\txorl %2,%2\n\t" + "stosb" + : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3) + : "0" (src),"1" (dest),"2" (0),"3" (0xffffffffu), "g" (count) + : "memory"); + return dest; +} +EXPORT_SYMBOL(strncat); +#endif + +#ifdef __HAVE_ARCH_STRCMP +int strcmp(const char * cs,const char * ct) +{ + int d0, d1; + int res; + asm volatile( "1:\tlodsb\n\t" + "scasb\n\t" + "jne 2f\n\t" + "testb %%al,%%al\n\t" + "jne 1b\n\t" + "xorl %%eax,%%eax\n\t" + "jmp 3f\n" + "2:\tsbbl %%eax,%%eax\n\t" + "orb $1,%%al\n" + "3:" + :"=a" (res), "=&S" (d0), "=&D" (d1) + :"1" (cs),"2" (ct) + :"memory"); + return res; +} +EXPORT_SYMBOL(strcmp); +#endif + +#ifdef __HAVE_ARCH_STRNCMP +int strncmp(const char * cs,const char * ct,size_t count) +{ + int res; + int d0, d1, d2; + asm volatile( "1:\tdecl %3\n\t" + "js 2f\n\t" + "lodsb\n\t" + "scasb\n\t" + "jne 3f\n\t" + "testb %%al,%%al\n\t" + "jne 1b\n" + "2:\txorl %%eax,%%eax\n\t" + "jmp 4f\n" + "3:\tsbbl %%eax,%%eax\n\t" + "orb $1,%%al\n" + "4:" + :"=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2) + :"1" (cs),"2" (ct),"3" (count) + :"memory"); + return res; +} +EXPORT_SYMBOL(strncmp); +#endif + +#ifdef __HAVE_ARCH_STRCHR +char *strchr(const char * s, int c) +{ + int d0; + char * res; + asm volatile( "movb %%al,%%ah\n" + "1:\tlodsb\n\t" + "cmpb %%ah,%%al\n\t" + "je 2f\n\t" + "testb %%al,%%al\n\t" + "jne 1b\n\t" + "movl $1,%1\n" + "2:\tmovl %1,%0\n\t" + "decl %0" + :"=a" (res), "=&S" (d0) + :"1" (s),"0" (c) + :"memory"); + return res; +} +EXPORT_SYMBOL(strchr); +#endif + +#ifdef __HAVE_ARCH_STRRCHR +char *strrchr(const char * s, int c) +{ + int d0, d1; + char * res; + asm volatile( "movb %%al,%%ah\n" + "1:\tlodsb\n\t" + "cmpb %%ah,%%al\n\t" + "jne 2f\n\t" + "leal -1(%%esi),%0\n" + "2:\ttestb %%al,%%al\n\t" + "jne 1b" + :"=g" (res), "=&S" (d0), "=&a" (d1) + :"0" (0),"1" (s),"2" (c) + :"memory"); + return res; +} +EXPORT_SYMBOL(strrchr); +#endif + +#ifdef __HAVE_ARCH_STRLEN +size_t strlen(const char * s) +{ + int d0; + int res; + asm volatile( "repne\n\t" + "scasb\n\t" + "notl %0\n\t" + "decl %0" + :"=c" (res), "=&D" (d0) + :"1" (s),"a" (0), "0" (0xffffffffu) + :"memory"); + return res; +} +EXPORT_SYMBOL(strlen); +#endif + +#ifdef __HAVE_ARCH_MEMCHR +void *memchr(const void *cs,int c,size_t count) +{ + int d0; + void *res; + if (!count) + return NULL; + asm volatile( "repne\n\t" + "scasb\n\t" + "je 1f\n\t" + "movl $1,%0\n" + "1:\tdecl %0" + :"=D" (res), "=&c" (d0) + :"a" (c),"0" (cs),"1" (count) + :"memory"); + return res; +} +EXPORT_SYMBOL(memchr); +#endif + +#ifdef __HAVE_ARCH_MEMSCAN +void *memscan(void * addr, int c, size_t size) +{ + if (!size) + return addr; + asm volatile("repnz; scasb\n\t" + "jnz 1f\n\t" + "dec %%edi\n" + "1:" + : "=D" (addr), "=c" (size) + : "0" (addr), "1" (size), "a" (c) + : "memory"); + return addr; +} +EXPORT_SYMBOL(memscan); +#endif + +#ifdef __HAVE_ARCH_STRNLEN +size_t strnlen(const char *s, size_t count) +{ + int d0; + int res; + asm volatile( "movl %2,%0\n\t" + "jmp 2f\n" + "1:\tcmpb $0,(%0)\n\t" + "je 3f\n\t" + "incl %0\n" + "2:\tdecl %1\n\t" + "cmpl $-1,%1\n\t" + "jne 1b\n" + "3:\tsubl %2,%0" + :"=a" (res), "=&d" (d0) + :"c" (s),"1" (count) + :"memory"); + return res; +} +EXPORT_SYMBOL(strnlen); +#endif diff --git a/arch/i386/mach-generic/es7000.c b/arch/i386/mach-generic/es7000.c index b47f951c0ec..4742626f08c 100644 --- a/arch/i386/mach-generic/es7000.c +++ b/arch/i386/mach-generic/es7000.c @@ -66,4 +66,4 @@ static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) } #endif -struct genapic apic_es7000 = APIC_INIT("es7000", probe_es7000); +struct genapic __initdata_refok apic_es7000 = APIC_INIT("es7000", probe_es7000); diff --git a/arch/i386/mach-visws/traps.c b/arch/i386/mach-visws/traps.c index 5199bd03254..843b67acf43 100644 --- a/arch/i386/mach-visws/traps.c +++ b/arch/i386/mach-visws/traps.c @@ -23,13 +23,13 @@ static __init void lithium_init(void) set_fixmap(FIX_LI_PCIB, LI_PCI_B_PHYS); if ((li_pcia_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) || - (li_pcia_read16(PCI_DEVICE_ID) != PCI_VENDOR_ID_SGI_LITHIUM)) { + (li_pcia_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) { printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'A'); panic("This machine is not SGI Visual Workstation 320/540"); } if ((li_pcib_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) || - (li_pcib_read16(PCI_DEVICE_ID) != PCI_VENDOR_ID_SGI_LITHIUM)) { + (li_pcib_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) { printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'B'); panic("This machine is not SGI Visual Workstation 320/540"); } diff --git a/arch/i386/mach-voyager/voyager_thread.c b/arch/i386/mach-voyager/voyager_thread.c index b4b24e0e45e..f9d59533815 100644 --- a/arch/i386/mach-voyager/voyager_thread.c +++ b/arch/i386/mach-voyager/voyager_thread.c @@ -52,7 +52,7 @@ execute(const char *string) NULL, }; - if ((ret = call_usermodehelper(argv[0], argv, envp, 1)) != 0) { + if ((ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC)) != 0) { printk(KERN_ERR "Voyager failed to run \"%s\": %i\n", string, ret); } diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index 1ecb3e43b52..01ffdd4964f 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c @@ -283,6 +283,8 @@ static inline int vmalloc_fault(unsigned long address) return 0; } +int show_unhandled_signals = 1; + /* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate @@ -303,6 +305,7 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs, struct vm_area_struct * vma; unsigned long address; int write, si_code; + int fault; /* get the address */ address = read_cr2(); @@ -422,20 +425,18 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - switch (handle_mm_fault(mm, vma, address, write)) { - case VM_FAULT_MINOR: - tsk->min_flt++; - break; - case VM_FAULT_MAJOR: - tsk->maj_flt++; - break; - case VM_FAULT_SIGBUS: - goto do_sigbus; - case VM_FAULT_OOM: + fault = handle_mm_fault(mm, vma, address, write); + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) goto out_of_memory; - default: - BUG(); + else if (fault & VM_FAULT_SIGBUS) + goto do_sigbus; + BUG(); } + if (fault & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; /* * Did it hit the DOS screen memory VA from vm86 mode? @@ -470,6 +471,14 @@ bad_area_nosemaphore: if (is_prefetch(regs, address, error_code)) return; + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) { + printk("%s%s[%d]: segfault at %08lx eip %08lx " + "esp %08lx error %lx\n", + tsk->pid > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, tsk->pid, address, regs->eip, + regs->esp, error_code); + } tsk->thread.cr2 = address; /* Kernel addresses are always protection faults */ tsk->thread.error_code = error_code | (address >= TASK_SIZE); diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 7135946d366..c3b9905af2d 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -87,7 +87,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); - paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT); + paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); BUG_ON(page_table != pte_offset_kernel(pmd, 0)); } @@ -471,8 +471,13 @@ void zap_low_mappings (void) flush_tlb_all(); } +int nx_enabled = 0; + +#ifdef CONFIG_X86_PAE + static int disable_nx __initdata = 0; u64 __supported_pte_mask __read_mostly = ~_PAGE_NX; +EXPORT_SYMBOL_GPL(__supported_pte_mask); /* * noexec = on|off @@ -499,9 +504,6 @@ static int __init noexec_setup(char *str) } early_param("noexec", noexec_setup); -int nx_enabled = 0; -#ifdef CONFIG_X86_PAE - static void __init set_nx(void) { unsigned int v[4], l, h; @@ -751,8 +753,7 @@ void __init pgtable_cache_init(void) PTRS_PER_PMD*sizeof(pmd_t), PTRS_PER_PMD*sizeof(pmd_t), SLAB_PANIC, - pmd_ctor, - NULL); + pmd_ctor); if (!SHARED_KERNEL_PMD) { /* If we're in PAE mode and have a non-shared kernel pmd, then the pgd size must be a @@ -799,17 +800,9 @@ void mark_rodata_ro(void) unsigned long start = PFN_ALIGN(_text); unsigned long size = PFN_ALIGN(_etext) - start; -#ifndef CONFIG_KPROBES -#ifdef CONFIG_HOTPLUG_CPU - /* It must still be possible to apply SMP alternatives. */ - if (num_possible_cpus() <= 1) -#endif - { - change_page_attr(virt_to_page(start), - size >> PAGE_SHIFT, PAGE_KERNEL_RX); - printk("Write protecting the kernel text: %luk\n", size >> 10); - } -#endif + change_page_attr(virt_to_page(start), + size >> PAGE_SHIFT, PAGE_KERNEL_RX); + printk("Write protecting the kernel text: %luk\n", size >> 10); start += size; size = (unsigned long)__end_rodata - start; change_page_attr(virt_to_page(start), diff --git a/arch/i386/mm/ioremap.c b/arch/i386/mm/ioremap.c index fff08ae7b5e..0b278315d73 100644 --- a/arch/i386/mm/ioremap.c +++ b/arch/i386/mm/ioremap.c @@ -196,7 +196,7 @@ void iounmap(volatile void __iomem *addr) /* Reset the direct mapping. Can block */ if ((p->flags >> 20) && p->phys_addr < virt_to_phys(high_memory) - 1) { change_page_attr(virt_to_page(__va(p->phys_addr)), - p->size >> PAGE_SHIFT, + get_vm_area_size(p) >> PAGE_SHIFT, PAGE_KERNEL); global_flush_tlb(); } diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c index 2eb14a73be9..8927222b3ab 100644 --- a/arch/i386/mm/pageattr.c +++ b/arch/i386/mm/pageattr.c @@ -60,7 +60,7 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot, address = __pa(address); addr = address & LARGE_PAGE_MASK; pbase = (pte_t *)page_address(base); - paravirt_alloc_pt(page_to_pfn(base)); + paravirt_alloc_pt(&init_mm, page_to_pfn(base)); for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, addr == address ? prot : ref_prot)); @@ -82,7 +82,7 @@ static void flush_kernel_map(void *arg) struct page *p; /* High level code is not ready for clflush yet */ - if (0 && cpu_has_clflush) { + if (cpu_has_clflush) { list_for_each_entry (p, lh, lru) cache_flush_page(p); } else if (boot_cpu_data.x86_model >= 4) @@ -136,6 +136,12 @@ static inline void revert_page(struct page *kpte_page, unsigned long address) ref_prot)); } +static inline void save_page(struct page *kpte_page) +{ + if (!test_and_set_bit(PG_arch_1, &kpte_page->flags)) + list_add(&kpte_page->lru, &df_list); +} + static int __change_page_attr(struct page *page, pgprot_t prot) { @@ -150,6 +156,9 @@ __change_page_attr(struct page *page, pgprot_t prot) if (!kpte) return -EINVAL; kpte_page = virt_to_page(kpte); + BUG_ON(PageLRU(kpte_page)); + BUG_ON(PageCompound(kpte_page)); + if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) { if (!pte_huge(*kpte)) { set_pte_atomic(kpte, mk_pte(page, prot)); @@ -179,11 +188,11 @@ __change_page_attr(struct page *page, pgprot_t prot) * time (not via split_large_page) and in turn we must not * replace it with a largepage. */ + + save_page(kpte_page); if (!PageReserved(kpte_page)) { if (cpu_has_pse && (page_private(kpte_page) == 0)) { - ClearPagePrivate(kpte_page); paravirt_release_pt(page_to_pfn(kpte_page)); - list_add(&kpte_page->lru, &df_list); revert_page(kpte_page, address); } } @@ -236,6 +245,11 @@ void global_flush_tlb(void) spin_unlock_irq(&cpa_lock); flush_map(&l); list_for_each_entry_safe(pg, next, &l, lru) { + list_del(&pg->lru); + clear_bit(PG_arch_1, &pg->flags); + if (PageReserved(pg) || !cpu_has_pse || page_private(pg) != 0) + continue; + ClearPagePrivate(pg); __free_page(pg); } } diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index 8d7c0864cc0..01437c46baa 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -235,7 +235,7 @@ static inline void pgd_list_del(pgd_t *pgd) #if (PTRS_PER_PMD == 1) /* Non-PAE pgd constructor */ -void pgd_ctor(void *pgd) +static void pgd_ctor(void *pgd) { unsigned long flags; @@ -257,7 +257,7 @@ void pgd_ctor(void *pgd) } #else /* PTRS_PER_PMD > 1 */ /* PAE pgd constructor */ -void pgd_ctor(void *pgd) +static void pgd_ctor(void *pgd) { /* PAE, kernel PMD may be shared */ @@ -276,7 +276,7 @@ void pgd_ctor(void *pgd) } #endif /* PTRS_PER_PMD */ -void pgd_dtor(void *pgd) +static void pgd_dtor(void *pgd) { unsigned long flags; /* can be called from interrupt context */ diff --git a/arch/i386/pci/acpi.c b/arch/i386/pci/acpi.c index b33aea845f5..bc8a44bddaa 100644 --- a/arch/i386/pci/acpi.c +++ b/arch/i386/pci/acpi.c @@ -8,20 +8,42 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int domain, int busnum) { struct pci_bus *bus; + struct pci_sysdata *sd; + int pxm; + + /* Allocate per-root-bus (not per bus) arch-specific data. + * TODO: leak; this memory is never freed. + * It's arguable whether it's worth the trouble to care. + */ + sd = kzalloc(sizeof(*sd), GFP_KERNEL); + if (!sd) { + printk(KERN_ERR "PCI: OOM, not probing PCI bus %02x\n", busnum); + return NULL; + } if (domain != 0) { printk(KERN_WARNING "PCI: Multiple domains not supported\n"); + kfree(sd); return NULL; } - bus = pcibios_scan_root(busnum); + sd->node = -1; + + pxm = acpi_get_pxm(device->handle); +#ifdef CONFIG_ACPI_NUMA + if (pxm >= 0) + sd->node = pxm_to_node(pxm); +#endif + + bus = pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd); + if (!bus) + kfree(sd); + #ifdef CONFIG_ACPI_NUMA if (bus != NULL) { - int pxm = acpi_get_pxm(device->handle); if (pxm >= 0) { - bus->sysdata = (void *)(unsigned long)pxm_to_node(pxm); - printk("bus %d -> pxm %d -> node %ld\n", - busnum, pxm, (long)(bus->sysdata)); + printk("bus %d -> pxm %d -> node %d\n", + busnum, pxm, sd->node); } } #endif diff --git a/arch/i386/pci/common.c b/arch/i386/pci/common.c index 3f78d4d8ecf..85503deeda4 100644 --- a/arch/i386/pci/common.c +++ b/arch/i386/pci/common.c @@ -293,6 +293,7 @@ static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = { struct pci_bus * __devinit pcibios_scan_root(int busnum) { struct pci_bus *bus = NULL; + struct pci_sysdata *sd; dmi_check_system(pciprobe_dmi_table); @@ -303,9 +304,19 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum) } } + /* Allocate per-root-bus (not per bus) arch-specific data. + * TODO: leak; this memory is never freed. + * It's arguable whether it's worth the trouble to care. + */ + sd = kzalloc(sizeof(*sd), GFP_KERNEL); + if (!sd) { + printk(KERN_ERR "PCI: OOM, not probing PCI bus %02x\n", busnum); + return NULL; + } + printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busnum); - return pci_scan_bus_parented(NULL, busnum, &pci_root_ops, NULL); + return pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd); } extern u8 pci_cache_line_size; diff --git a/arch/i386/pci/fixup.c b/arch/i386/pci/fixup.c index b95b42950ed..e7306dbf6c4 100644 --- a/arch/i386/pci/fixup.c +++ b/arch/i386/pci/fixup.c @@ -118,12 +118,9 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82371AB_3, pci static void pci_fixup_via_northbridge_bug(struct pci_dev *d) { u8 v; - u8 revision; int where = 0x55; int mask = 0x1f; /* clear bits 5, 6, 7 by default */ - pci_read_config_byte(d, PCI_REVISION_ID, &revision); - if (d->device == PCI_DEVICE_ID_VIA_8367_0) { /* fix pci bus latency issues resulted by NB bios error it appears on bug free^Wreduced kt266x's bios forces @@ -133,8 +130,8 @@ static void pci_fixup_via_northbridge_bug(struct pci_dev *d) where = 0x95; /* the memory write queue timer register is different for the KT266x's: 0x95 not 0x55 */ } else if (d->device == PCI_DEVICE_ID_VIA_8363_0 && - (revision == VIA_8363_KL133_REVISION_ID || - revision == VIA_8363_KM133_REVISION_ID)) { + (d->revision == VIA_8363_KL133_REVISION_ID || + d->revision == VIA_8363_KM133_REVISION_ID)) { mask = 0x3f; /* clear only bits 6 and 7; clearing bit 5 causes screen corruption on the KL133/KM133 */ } @@ -142,7 +139,7 @@ static void pci_fixup_via_northbridge_bug(struct pci_dev *d) pci_read_config_byte(d, where, &v); if (v & ~mask) { printk(KERN_WARNING "Disabling VIA memory write queue (PCI ID %04x, rev %02x): [%02x] %02x & %02x -> %02x\n", \ - d->device, revision, where, v, mask, v & mask); + d->device, d->revision, where, v, mask, v & mask); v &= mask; pci_write_config_byte(d, where, v); } diff --git a/arch/i386/pci/mmconfig-shared.c b/arch/i386/pci/mmconfig-shared.c index c7cabeed4d7..4df637e34f8 100644 --- a/arch/i386/pci/mmconfig-shared.c +++ b/arch/i386/pci/mmconfig-shared.c @@ -24,6 +24,9 @@ DECLARE_BITMAP(pci_mmcfg_fallback_slots, 32*PCI_MMCFG_MAX_CHECK_BUS); +/* Indicate if the mmcfg resources have been placed into the resource table. */ +static int __initdata pci_mmcfg_resources_inserted; + /* K8 systems have some devices (typically in the builtin northbridge) that are only accessible using type1 Normally this can be expressed in the MCFG by not listing them @@ -170,7 +173,7 @@ static int __init pci_mmcfg_check_hostbridge(void) return name != NULL; } -static void __init pci_mmcfg_insert_resources(void) +static void __init pci_mmcfg_insert_resources(unsigned long resource_flags) { #define PCI_MMCFG_RESOURCE_NAME_LEN 19 int i; @@ -194,10 +197,13 @@ static void __init pci_mmcfg_insert_resources(void) cfg->pci_segment); res->start = cfg->address; res->end = res->start + (num_buses << 20) - 1; - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res->flags = IORESOURCE_MEM | resource_flags; insert_resource(&iomem_resource, res); names += PCI_MMCFG_RESOURCE_NAME_LEN; } + + /* Mark that the resources have been inserted. */ + pci_mmcfg_resources_inserted = 1; } static void __init pci_mmcfg_reject_broken(int type) @@ -267,7 +273,43 @@ void __init pci_mmcfg_init(int type) if (type == 1) unreachable_devices(); if (known_bridge) - pci_mmcfg_insert_resources(); + pci_mmcfg_insert_resources(IORESOURCE_BUSY); pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; + } else { + /* + * Signal not to attempt to insert mmcfg resources because + * the architecture mmcfg setup could not initialize. + */ + pci_mmcfg_resources_inserted = 1; } } + +static int __init pci_mmcfg_late_insert_resources(void) +{ + /* + * If resources are already inserted or we are not using MMCONFIG, + * don't insert the resources. + */ + if ((pci_mmcfg_resources_inserted == 1) || + (pci_probe & PCI_PROBE_MMCONF) == 0 || + (pci_mmcfg_config_num == 0) || + (pci_mmcfg_config == NULL) || + (pci_mmcfg_config[0].address == 0)) + return 1; + + /* + * Attempt to insert the mmcfg resources but not with the busy flag + * marked so it won't cause request errors when __request_region is + * called. + */ + pci_mmcfg_insert_resources(0); + + return 0; +} + +/* + * Perform MMCONFIG resource insertion after PCI initialization to allow for + * misprogrammed MCFG tables that state larger sizes but actually conflict + * with other system resources. + */ +late_initcall(pci_mmcfg_late_insert_resources); diff --git a/arch/i386/video/Makefile b/arch/i386/video/Makefile new file mode 100644 index 00000000000..2c447c94adc --- /dev/null +++ b/arch/i386/video/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_FB) += fbdev.o diff --git a/arch/i386/video/fbdev.c b/arch/i386/video/fbdev.c new file mode 100644 index 00000000000..48fb38d7d2c --- /dev/null +++ b/arch/i386/video/fbdev.c @@ -0,0 +1,32 @@ +/* + * arch/i386/video/fbdev.c - i386 Framebuffer + * + * Copyright (C) 2007 Antonino Daplas <adaplas@gmail.com> + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of this archive + * for more details. + * + */ +#include <linux/fb.h> +#include <linux/pci.h> + +int fb_is_primary_device(struct fb_info *info) +{ + struct device *device = info->device; + struct pci_dev *pci_dev = NULL; + struct resource *res = NULL; + int retval = 0; + + if (device) + pci_dev = to_pci_dev(device); + + if (pci_dev) + res = &pci_dev->resource[PCI_ROM_RESOURCE]; + + if (res && res->flags & IORESOURCE_ROM_SHADOW) + retval = 1; + + return retval; +} +EXPORT_SYMBOL(fb_is_primary_device); diff --git a/arch/i386/xen/Kconfig b/arch/i386/xen/Kconfig new file mode 100644 index 00000000000..9df99e1885a --- /dev/null +++ b/arch/i386/xen/Kconfig @@ -0,0 +1,11 @@ +# +# This Kconfig describes xen options +# + +config XEN + bool "Enable support for Xen hypervisor" + depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES + help + This is the Linux Xen port. Enabling this will allow the + kernel to boot in a paravirtualized environment under the + Xen hypervisor. diff --git a/arch/i386/xen/Makefile b/arch/i386/xen/Makefile new file mode 100644 index 00000000000..343df246bd3 --- /dev/null +++ b/arch/i386/xen/Makefile @@ -0,0 +1,4 @@ +obj-y := enlighten.o setup.o features.o multicalls.o mmu.o \ + events.o time.o manage.o xen-asm.o + +obj-$(CONFIG_SMP) += smp.o diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c new file mode 100644 index 00000000000..9a8c1181c00 --- /dev/null +++ b/arch/i386/xen/enlighten.c @@ -0,0 +1,1144 @@ +/* + * Core of Xen paravirt_ops implementation. + * + * This file contains the xen_paravirt_ops structure itself, and the + * implementations for: + * - privileged instructions + * - interrupt flags + * - segment operations + * - booting and setup + * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/smp.h> +#include <linux/preempt.h> +#include <linux/hardirq.h> +#include <linux/percpu.h> +#include <linux/delay.h> +#include <linux/start_kernel.h> +#include <linux/sched.h> +#include <linux/bootmem.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/page-flags.h> +#include <linux/highmem.h> +#include <linux/smp.h> + +#include <xen/interface/xen.h> +#include <xen/interface/physdev.h> +#include <xen/interface/vcpu.h> +#include <xen/interface/sched.h> +#include <xen/features.h> +#include <xen/page.h> + +#include <asm/paravirt.h> +#include <asm/page.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> +#include <asm/fixmap.h> +#include <asm/processor.h> +#include <asm/setup.h> +#include <asm/desc.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/reboot.h> + +#include "xen-ops.h" +#include "mmu.h" +#include "multicalls.h" + +EXPORT_SYMBOL_GPL(hypercall_page); + +DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode); + +DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); +DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); +DEFINE_PER_CPU(unsigned long, xen_cr3); + +struct start_info *xen_start_info; +EXPORT_SYMBOL_GPL(xen_start_info); + +static /* __initdata */ struct shared_info dummy_shared_info; + +/* + * Point at some empty memory to start with. We map the real shared_info + * page as soon as fixmap is up and running. + */ +struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info; + +/* + * Flag to determine whether vcpu info placement is available on all + * VCPUs. We assume it is to start with, and then set it to zero on + * the first failure. This is because it can succeed on some VCPUs + * and not others, since it can involve hypervisor memory allocation, + * or because the guest failed to guarantee all the appropriate + * constraints on all VCPUs (ie buffer can't cross a page boundary). + * + * Note that any particular CPU may be using a placed vcpu structure, + * but we can only optimise if the all are. + * + * 0: not available, 1: available + */ +static int have_vcpu_info_placement = 1; + +static void __init xen_vcpu_setup(int cpu) +{ + struct vcpu_register_vcpu_info info; + int err; + struct vcpu_info *vcpup; + + per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; + + if (!have_vcpu_info_placement) + return; /* already tested, not available */ + + vcpup = &per_cpu(xen_vcpu_info, cpu); + + info.mfn = virt_to_mfn(vcpup); + info.offset = offset_in_page(vcpup); + + printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n", + cpu, vcpup, info.mfn, info.offset); + + /* Check to see if the hypervisor will put the vcpu_info + structure where we want it, which allows direct access via + a percpu-variable. */ + err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info); + + if (err) { + printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err); + have_vcpu_info_placement = 0; + } else { + /* This cpu is using the registered vcpu info, even if + later ones fail to. */ + per_cpu(xen_vcpu, cpu) = vcpup; + + printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n", + cpu, vcpup); + } +} + +static void __init xen_banner(void) +{ + printk(KERN_INFO "Booting paravirtualized kernel on %s\n", + paravirt_ops.name); + printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic); +} + +static void xen_cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + unsigned maskedx = ~0; + + /* + * Mask out inconvenient features, to try and disable as many + * unsupported kernel subsystems as possible. + */ + if (*eax == 1) + maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */ + (1 << X86_FEATURE_ACPI) | /* disable ACPI */ + (1 << X86_FEATURE_ACC)); /* thermal monitoring */ + + asm(XEN_EMULATE_PREFIX "cpuid" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx)); + *edx &= maskedx; +} + +static void xen_set_debugreg(int reg, unsigned long val) +{ + HYPERVISOR_set_debugreg(reg, val); +} + +static unsigned long xen_get_debugreg(int reg) +{ + return HYPERVISOR_get_debugreg(reg); +} + +static unsigned long xen_save_fl(void) +{ + struct vcpu_info *vcpu; + unsigned long flags; + + vcpu = x86_read_percpu(xen_vcpu); + + /* flag has opposite sense of mask */ + flags = !vcpu->evtchn_upcall_mask; + + /* convert to IF type flag + -0 -> 0x00000000 + -1 -> 0xffffffff + */ + return (-flags) & X86_EFLAGS_IF; +} + +static void xen_restore_fl(unsigned long flags) +{ + struct vcpu_info *vcpu; + + /* convert from IF type flag */ + flags = !(flags & X86_EFLAGS_IF); + + /* There's a one instruction preempt window here. We need to + make sure we're don't switch CPUs between getting the vcpu + pointer and updating the mask. */ + preempt_disable(); + vcpu = x86_read_percpu(xen_vcpu); + vcpu->evtchn_upcall_mask = flags; + preempt_enable_no_resched(); + + /* Doesn't matter if we get preempted here, because any + pending event will get dealt with anyway. */ + + if (flags == 0) { + preempt_check_resched(); + barrier(); /* unmask then check (avoid races) */ + if (unlikely(vcpu->evtchn_upcall_pending)) + force_evtchn_callback(); + } +} + +static void xen_irq_disable(void) +{ + /* There's a one instruction preempt window here. We need to + make sure we're don't switch CPUs between getting the vcpu + pointer and updating the mask. */ + preempt_disable(); + x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1; + preempt_enable_no_resched(); +} + +static void xen_irq_enable(void) +{ + struct vcpu_info *vcpu; + + /* There's a one instruction preempt window here. We need to + make sure we're don't switch CPUs between getting the vcpu + pointer and updating the mask. */ + preempt_disable(); + vcpu = x86_read_percpu(xen_vcpu); + vcpu->evtchn_upcall_mask = 0; + preempt_enable_no_resched(); + + /* Doesn't matter if we get preempted here, because any + pending event will get dealt with anyway. */ + + barrier(); /* unmask then check (avoid races) */ + if (unlikely(vcpu->evtchn_upcall_pending)) + force_evtchn_callback(); +} + +static void xen_safe_halt(void) +{ + /* Blocking includes an implicit local_irq_enable(). */ + if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0) + BUG(); +} + +static void xen_halt(void) +{ + if (irqs_disabled()) + HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); + else + xen_safe_halt(); +} + +static void xen_set_lazy_mode(enum paravirt_lazy_mode mode) +{ + BUG_ON(preemptible()); + + switch (mode) { + case PARAVIRT_LAZY_NONE: + BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE); + break; + + case PARAVIRT_LAZY_MMU: + case PARAVIRT_LAZY_CPU: + BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE); + break; + + case PARAVIRT_LAZY_FLUSH: + /* flush if necessary, but don't change state */ + if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE) + xen_mc_flush(); + return; + } + + xen_mc_flush(); + x86_write_percpu(xen_lazy_mode, mode); +} + +static unsigned long xen_store_tr(void) +{ + return 0; +} + +static void xen_set_ldt(const void *addr, unsigned entries) +{ + unsigned long linear_addr = (unsigned long)addr; + struct mmuext_op *op; + struct multicall_space mcs = xen_mc_entry(sizeof(*op)); + + op = mcs.args; + op->cmd = MMUEXT_SET_LDT; + if (linear_addr) { + /* ldt my be vmalloced, use arbitrary_virt_to_machine */ + xmaddr_t maddr; + maddr = arbitrary_virt_to_machine((unsigned long)addr); + linear_addr = (unsigned long)maddr.maddr; + } + op->arg1.linear_addr = linear_addr; + op->arg2.nr_ents = entries; + + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_CPU); +} + +static void xen_load_gdt(const struct Xgt_desc_struct *dtr) +{ + unsigned long *frames; + unsigned long va = dtr->address; + unsigned int size = dtr->size + 1; + unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE; + int f; + struct multicall_space mcs; + + /* A GDT can be up to 64k in size, which corresponds to 8192 + 8-byte entries, or 16 4k pages.. */ + + BUG_ON(size > 65536); + BUG_ON(va & ~PAGE_MASK); + + mcs = xen_mc_entry(sizeof(*frames) * pages); + frames = mcs.args; + + for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { + frames[f] = virt_to_mfn(va); + make_lowmem_page_readonly((void *)va); + } + + MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct)); + + xen_mc_issue(PARAVIRT_LAZY_CPU); +} + +static void load_TLS_descriptor(struct thread_struct *t, + unsigned int cpu, unsigned int i) +{ + struct desc_struct *gdt = get_cpu_gdt_table(cpu); + xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); + struct multicall_space mc = __xen_mc_entry(0); + + MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); +} + +static void xen_load_tls(struct thread_struct *t, unsigned int cpu) +{ + xen_mc_batch(); + + load_TLS_descriptor(t, cpu, 0); + load_TLS_descriptor(t, cpu, 1); + load_TLS_descriptor(t, cpu, 2); + + xen_mc_issue(PARAVIRT_LAZY_CPU); + + /* + * XXX sleazy hack: If we're being called in a lazy-cpu zone, + * it means we're in a context switch, and %gs has just been + * saved. This means we can zero it out to prevent faults on + * exit from the hypervisor if the next process has no %gs. + * Either way, it has been saved, and the new value will get + * loaded properly. This will go away as soon as Xen has been + * modified to not save/restore %gs for normal hypercalls. + */ + if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU) + loadsegment(gs, 0); +} + +static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, + u32 low, u32 high) +{ + unsigned long lp = (unsigned long)&dt[entrynum]; + xmaddr_t mach_lp = virt_to_machine(lp); + u64 entry = (u64)high << 32 | low; + + preempt_disable(); + + xen_mc_flush(); + if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry)) + BUG(); + + preempt_enable(); +} + +static int cvt_gate_to_trap(int vector, u32 low, u32 high, + struct trap_info *info) +{ + u8 type, dpl; + + type = (high >> 8) & 0x1f; + dpl = (high >> 13) & 3; + + if (type != 0xf && type != 0xe) + return 0; + + info->vector = vector; + info->address = (high & 0xffff0000) | (low & 0x0000ffff); + info->cs = low >> 16; + info->flags = dpl; + /* interrupt gates clear IF */ + if (type == 0xe) + info->flags |= 4; + + return 1; +} + +/* Locations of each CPU's IDT */ +static DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc); + +/* Set an IDT entry. If the entry is part of the current IDT, then + also update Xen. */ +static void xen_write_idt_entry(struct desc_struct *dt, int entrynum, + u32 low, u32 high) +{ + unsigned long p = (unsigned long)&dt[entrynum]; + unsigned long start, end; + + preempt_disable(); + + start = __get_cpu_var(idt_desc).address; + end = start + __get_cpu_var(idt_desc).size + 1; + + xen_mc_flush(); + + write_dt_entry(dt, entrynum, low, high); + + if (p >= start && (p + 8) <= end) { + struct trap_info info[2]; + + info[1].address = 0; + + if (cvt_gate_to_trap(entrynum, low, high, &info[0])) + if (HYPERVISOR_set_trap_table(info)) + BUG(); + } + + preempt_enable(); +} + +static void xen_convert_trap_info(const struct Xgt_desc_struct *desc, + struct trap_info *traps) +{ + unsigned in, out, count; + + count = (desc->size+1) / 8; + BUG_ON(count > 256); + + for (in = out = 0; in < count; in++) { + const u32 *entry = (u32 *)(desc->address + in * 8); + + if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out])) + out++; + } + traps[out].address = 0; +} + +void xen_copy_trap_info(struct trap_info *traps) +{ + const struct Xgt_desc_struct *desc = &__get_cpu_var(idt_desc); + + xen_convert_trap_info(desc, traps); +} + +/* Load a new IDT into Xen. In principle this can be per-CPU, so we + hold a spinlock to protect the static traps[] array (static because + it avoids allocation, and saves stack space). */ +static void xen_load_idt(const struct Xgt_desc_struct *desc) +{ + static DEFINE_SPINLOCK(lock); + static struct trap_info traps[257]; + + spin_lock(&lock); + + __get_cpu_var(idt_desc) = *desc; + + xen_convert_trap_info(desc, traps); + + xen_mc_flush(); + if (HYPERVISOR_set_trap_table(traps)) + BUG(); + + spin_unlock(&lock); +} + +/* Write a GDT descriptor entry. Ignore LDT descriptors, since + they're handled differently. */ +static void xen_write_gdt_entry(struct desc_struct *dt, int entry, + u32 low, u32 high) +{ + preempt_disable(); + + switch ((high >> 8) & 0xff) { + case DESCTYPE_LDT: + case DESCTYPE_TSS: + /* ignore */ + break; + + default: { + xmaddr_t maddr = virt_to_machine(&dt[entry]); + u64 desc = (u64)high << 32 | low; + + xen_mc_flush(); + if (HYPERVISOR_update_descriptor(maddr.maddr, desc)) + BUG(); + } + + } + + preempt_enable(); +} + +static void xen_load_esp0(struct tss_struct *tss, + struct thread_struct *thread) +{ + struct multicall_space mcs = xen_mc_entry(0); + MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->esp0); + xen_mc_issue(PARAVIRT_LAZY_CPU); +} + +static void xen_set_iopl_mask(unsigned mask) +{ + struct physdev_set_iopl set_iopl; + + /* Force the change at ring 0. */ + set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3; + HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); +} + +static void xen_io_delay(void) +{ +} + +#ifdef CONFIG_X86_LOCAL_APIC +static unsigned long xen_apic_read(unsigned long reg) +{ + return 0; +} + +static void xen_apic_write(unsigned long reg, unsigned long val) +{ + /* Warn to see if there's any stray references */ + WARN_ON(1); +} +#endif + +static void xen_flush_tlb(void) +{ + struct mmuext_op *op; + struct multicall_space mcs = xen_mc_entry(sizeof(*op)); + + op = mcs.args; + op->cmd = MMUEXT_TLB_FLUSH_LOCAL; + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); +} + +static void xen_flush_tlb_single(unsigned long addr) +{ + struct mmuext_op *op; + struct multicall_space mcs = xen_mc_entry(sizeof(*op)); + + op = mcs.args; + op->cmd = MMUEXT_INVLPG_LOCAL; + op->arg1.linear_addr = addr & PAGE_MASK; + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); +} + +static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm, + unsigned long va) +{ + struct { + struct mmuext_op op; + cpumask_t mask; + } *args; + cpumask_t cpumask = *cpus; + struct multicall_space mcs; + + /* + * A couple of (to be removed) sanity checks: + * + * - current CPU must not be in mask + * - mask must exist :) + */ + BUG_ON(cpus_empty(cpumask)); + BUG_ON(cpu_isset(smp_processor_id(), cpumask)); + BUG_ON(!mm); + + /* If a CPU which we ran on has gone down, OK. */ + cpus_and(cpumask, cpumask, cpu_online_map); + if (cpus_empty(cpumask)) + return; + + mcs = xen_mc_entry(sizeof(*args)); + args = mcs.args; + args->mask = cpumask; + args->op.arg2.vcpumask = &args->mask; + + if (va == TLB_FLUSH_ALL) { + args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; + } else { + args->op.cmd = MMUEXT_INVLPG_MULTI; + args->op.arg1.linear_addr = va; + } + + MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); +} + +static void xen_write_cr2(unsigned long cr2) +{ + x86_read_percpu(xen_vcpu)->arch.cr2 = cr2; +} + +static unsigned long xen_read_cr2(void) +{ + return x86_read_percpu(xen_vcpu)->arch.cr2; +} + +static unsigned long xen_read_cr2_direct(void) +{ + return x86_read_percpu(xen_vcpu_info.arch.cr2); +} + +static void xen_write_cr4(unsigned long cr4) +{ + /* never allow TSC to be disabled */ + native_write_cr4(cr4 & ~X86_CR4_TSD); +} + +static unsigned long xen_read_cr3(void) +{ + return x86_read_percpu(xen_cr3); +} + +static void xen_write_cr3(unsigned long cr3) +{ + BUG_ON(preemptible()); + + if (cr3 == x86_read_percpu(xen_cr3)) { + /* just a simple tlb flush */ + xen_flush_tlb(); + return; + } + + x86_write_percpu(xen_cr3, cr3); + + + { + struct mmuext_op *op; + struct multicall_space mcs = xen_mc_entry(sizeof(*op)); + unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); + + op = mcs.args; + op->cmd = MMUEXT_NEW_BASEPTR; + op->arg1.mfn = mfn; + + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_CPU); + } +} + +/* Early in boot, while setting up the initial pagetable, assume + everything is pinned. */ +static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn) +{ + BUG_ON(mem_map); /* should only be used early */ + make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); +} + +/* This needs to make sure the new pte page is pinned iff its being + attached to a pinned pagetable. */ +static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) +{ + struct page *page = pfn_to_page(pfn); + + if (PagePinned(virt_to_page(mm->pgd))) { + SetPagePinned(page); + + if (!PageHighMem(page)) + make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); + else + /* make sure there are no stray mappings of + this page */ + kmap_flush_unused(); + } +} + +/* This should never happen until we're OK to use struct page */ +static void xen_release_pt(u32 pfn) +{ + struct page *page = pfn_to_page(pfn); + + if (PagePinned(page)) { + if (!PageHighMem(page)) + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); + } +} + +#ifdef CONFIG_HIGHPTE +static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) +{ + pgprot_t prot = PAGE_KERNEL; + + if (PagePinned(page)) + prot = PAGE_KERNEL_RO; + + if (0 && PageHighMem(page)) + printk("mapping highpte %lx type %d prot %s\n", + page_to_pfn(page), type, + (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ"); + + return kmap_atomic_prot(page, type, prot); +} +#endif + +static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) +{ + /* If there's an existing pte, then don't allow _PAGE_RW to be set */ + if (pte_val_ma(*ptep) & _PAGE_PRESENT) + pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & + pte_val_ma(pte)); + + return pte; +} + +/* Init-time set_pte while constructing initial pagetables, which + doesn't allow RO pagetable pages to be remapped RW */ +static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) +{ + pte = mask_rw_pte(ptep, pte); + + xen_set_pte(ptep, pte); +} + +static __init void xen_pagetable_setup_start(pgd_t *base) +{ + pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; + + /* special set_pte for pagetable initialization */ + paravirt_ops.set_pte = xen_set_pte_init; + + init_mm.pgd = base; + /* + * copy top-level of Xen-supplied pagetable into place. For + * !PAE we can use this as-is, but for PAE it is a stand-in + * while we copy the pmd pages. + */ + memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t)); + + if (PTRS_PER_PMD > 1) { + int i; + /* + * For PAE, need to allocate new pmds, rather than + * share Xen's, since Xen doesn't like pmd's being + * shared between address spaces. + */ + for (i = 0; i < PTRS_PER_PGD; i++) { + if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) { + pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); + + memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), + PAGE_SIZE); + + make_lowmem_page_readonly(pmd); + + set_pgd(&base[i], __pgd(1 + __pa(pmd))); + } else + pgd_clear(&base[i]); + } + } + + /* make sure zero_page is mapped RO so we can use it in pagetables */ + make_lowmem_page_readonly(empty_zero_page); + make_lowmem_page_readonly(base); + /* + * Switch to new pagetable. This is done before + * pagetable_init has done anything so that the new pages + * added to the table can be prepared properly for Xen. + */ + xen_write_cr3(__pa(base)); +} + +static __init void xen_pagetable_setup_done(pgd_t *base) +{ + /* This will work as long as patching hasn't happened yet + (which it hasn't) */ + paravirt_ops.alloc_pt = xen_alloc_pt; + paravirt_ops.set_pte = xen_set_pte; + + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + /* + * Create a mapping for the shared info page. + * Should be set_fixmap(), but shared_info is a machine + * address with no corresponding pseudo-phys address. + */ + set_pte_mfn(fix_to_virt(FIX_PARAVIRT_BOOTMAP), + PFN_DOWN(xen_start_info->shared_info), + PAGE_KERNEL); + + HYPERVISOR_shared_info = + (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); + + } else + HYPERVISOR_shared_info = + (struct shared_info *)__va(xen_start_info->shared_info); + + /* Actually pin the pagetable down, but we can't set PG_pinned + yet because the page structures don't exist yet. */ + { + struct mmuext_op op; +#ifdef CONFIG_X86_PAE + op.cmd = MMUEXT_PIN_L3_TABLE; +#else + op.cmd = MMUEXT_PIN_L3_TABLE; +#endif + op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base))); + if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) + BUG(); + } +} + +/* This is called once we have the cpu_possible_map */ +void __init xen_setup_vcpu_info_placement(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + xen_vcpu_setup(cpu); + + /* xen_vcpu_setup managed to place the vcpu_info within the + percpu area for all cpus, so make use of it */ + if (have_vcpu_info_placement) { + printk(KERN_INFO "Xen: using vcpu_info placement\n"); + + paravirt_ops.save_fl = xen_save_fl_direct; + paravirt_ops.restore_fl = xen_restore_fl_direct; + paravirt_ops.irq_disable = xen_irq_disable_direct; + paravirt_ops.irq_enable = xen_irq_enable_direct; + paravirt_ops.read_cr2 = xen_read_cr2_direct; + paravirt_ops.iret = xen_iret_direct; + } +} + +static unsigned xen_patch(u8 type, u16 clobbers, void *insns, unsigned len) +{ + char *start, *end, *reloc; + unsigned ret; + + start = end = reloc = NULL; + +#define SITE(x) \ + case PARAVIRT_PATCH(x): \ + if (have_vcpu_info_placement) { \ + start = (char *)xen_##x##_direct; \ + end = xen_##x##_direct_end; \ + reloc = xen_##x##_direct_reloc; \ + } \ + goto patch_site + + switch (type) { + SITE(irq_enable); + SITE(irq_disable); + SITE(save_fl); + SITE(restore_fl); +#undef SITE + + patch_site: + if (start == NULL || (end-start) > len) + goto default_patch; + + ret = paravirt_patch_insns(insns, len, start, end); + + /* Note: because reloc is assigned from something that + appears to be an array, gcc assumes it's non-null, + but doesn't know its relationship with start and + end. */ + if (reloc > start && reloc < end) { + int reloc_off = reloc - start; + long *relocp = (long *)(insns + reloc_off); + long delta = start - (char *)insns; + + *relocp += delta; + } + break; + + default_patch: + default: + ret = paravirt_patch_default(type, clobbers, insns, len); + break; + } + + return ret; +} + +static const struct paravirt_ops xen_paravirt_ops __initdata = { + .paravirt_enabled = 1, + .shared_kernel_pmd = 0, + + .name = "Xen", + .banner = xen_banner, + + .patch = xen_patch, + + .memory_setup = xen_memory_setup, + .arch_setup = xen_arch_setup, + .init_IRQ = xen_init_IRQ, + .post_allocator_init = xen_mark_init_mm_pinned, + + .time_init = xen_time_init, + .set_wallclock = xen_set_wallclock, + .get_wallclock = xen_get_wallclock, + .get_cpu_khz = xen_cpu_khz, + .sched_clock = xen_sched_clock, + + .cpuid = xen_cpuid, + + .set_debugreg = xen_set_debugreg, + .get_debugreg = xen_get_debugreg, + + .clts = native_clts, + + .read_cr0 = native_read_cr0, + .write_cr0 = native_write_cr0, + + .read_cr2 = xen_read_cr2, + .write_cr2 = xen_write_cr2, + + .read_cr3 = xen_read_cr3, + .write_cr3 = xen_write_cr3, + + .read_cr4 = native_read_cr4, + .read_cr4_safe = native_read_cr4_safe, + .write_cr4 = xen_write_cr4, + + .save_fl = xen_save_fl, + .restore_fl = xen_restore_fl, + .irq_disable = xen_irq_disable, + .irq_enable = xen_irq_enable, + .safe_halt = xen_safe_halt, + .halt = xen_halt, + .wbinvd = native_wbinvd, + + .read_msr = native_read_msr_safe, + .write_msr = native_write_msr_safe, + .read_tsc = native_read_tsc, + .read_pmc = native_read_pmc, + + .iret = (void *)&hypercall_page[__HYPERVISOR_iret], + .irq_enable_sysexit = NULL, /* never called */ + + .load_tr_desc = paravirt_nop, + .set_ldt = xen_set_ldt, + .load_gdt = xen_load_gdt, + .load_idt = xen_load_idt, + .load_tls = xen_load_tls, + + .store_gdt = native_store_gdt, + .store_idt = native_store_idt, + .store_tr = xen_store_tr, + + .write_ldt_entry = xen_write_ldt_entry, + .write_gdt_entry = xen_write_gdt_entry, + .write_idt_entry = xen_write_idt_entry, + .load_esp0 = xen_load_esp0, + + .set_iopl_mask = xen_set_iopl_mask, + .io_delay = xen_io_delay, + +#ifdef CONFIG_X86_LOCAL_APIC + .apic_write = xen_apic_write, + .apic_write_atomic = xen_apic_write, + .apic_read = xen_apic_read, + .setup_boot_clock = paravirt_nop, + .setup_secondary_clock = paravirt_nop, + .startup_ipi_hook = paravirt_nop, +#endif + + .flush_tlb_user = xen_flush_tlb, + .flush_tlb_kernel = xen_flush_tlb, + .flush_tlb_single = xen_flush_tlb_single, + .flush_tlb_others = xen_flush_tlb_others, + + .pte_update = paravirt_nop, + .pte_update_defer = paravirt_nop, + + .pagetable_setup_start = xen_pagetable_setup_start, + .pagetable_setup_done = xen_pagetable_setup_done, + + .alloc_pt = xen_alloc_pt_init, + .release_pt = xen_release_pt, + .alloc_pd = paravirt_nop, + .alloc_pd_clone = paravirt_nop, + .release_pd = paravirt_nop, + +#ifdef CONFIG_HIGHPTE + .kmap_atomic_pte = xen_kmap_atomic_pte, +#endif + + .set_pte = NULL, /* see xen_pagetable_setup_* */ + .set_pte_at = xen_set_pte_at, + .set_pmd = xen_set_pmd, + + .pte_val = xen_pte_val, + .pgd_val = xen_pgd_val, + + .make_pte = xen_make_pte, + .make_pgd = xen_make_pgd, + +#ifdef CONFIG_X86_PAE + .set_pte_atomic = xen_set_pte_atomic, + .set_pte_present = xen_set_pte_at, + .set_pud = xen_set_pud, + .pte_clear = xen_pte_clear, + .pmd_clear = xen_pmd_clear, + + .make_pmd = xen_make_pmd, + .pmd_val = xen_pmd_val, +#endif /* PAE */ + + .activate_mm = xen_activate_mm, + .dup_mmap = xen_dup_mmap, + .exit_mmap = xen_exit_mmap, + + .set_lazy_mode = xen_set_lazy_mode, +}; + +#ifdef CONFIG_SMP +static const struct smp_ops xen_smp_ops __initdata = { + .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, + .smp_prepare_cpus = xen_smp_prepare_cpus, + .cpu_up = xen_cpu_up, + .smp_cpus_done = xen_smp_cpus_done, + + .smp_send_stop = xen_smp_send_stop, + .smp_send_reschedule = xen_smp_send_reschedule, + .smp_call_function_mask = xen_smp_call_function_mask, +}; +#endif /* CONFIG_SMP */ + +static void xen_reboot(int reason) +{ +#ifdef CONFIG_SMP + smp_send_stop(); +#endif + + if (HYPERVISOR_sched_op(SCHEDOP_shutdown, reason)) + BUG(); +} + +static void xen_restart(char *msg) +{ + xen_reboot(SHUTDOWN_reboot); +} + +static void xen_emergency_restart(void) +{ + xen_reboot(SHUTDOWN_reboot); +} + +static void xen_machine_halt(void) +{ + xen_reboot(SHUTDOWN_poweroff); +} + +static void xen_crash_shutdown(struct pt_regs *regs) +{ + xen_reboot(SHUTDOWN_crash); +} + +static const struct machine_ops __initdata xen_machine_ops = { + .restart = xen_restart, + .halt = xen_machine_halt, + .power_off = xen_machine_halt, + .shutdown = xen_machine_halt, + .crash_shutdown = xen_crash_shutdown, + .emergency_restart = xen_emergency_restart, +}; + + +/* First C function to be called on Xen boot */ +asmlinkage void __init xen_start_kernel(void) +{ + pgd_t *pgd; + + if (!xen_start_info) + return; + + BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0); + + /* Install Xen paravirt ops */ + paravirt_ops = xen_paravirt_ops; + machine_ops = xen_machine_ops; + +#ifdef CONFIG_SMP + smp_ops = xen_smp_ops; +#endif + + xen_setup_features(); + + /* Get mfn list */ + if (!xen_feature(XENFEAT_auto_translated_physmap)) + phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list; + + pgd = (pgd_t *)xen_start_info->pt_base; + + init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; + + init_mm.pgd = pgd; /* use the Xen pagetables to start */ + + /* keep using Xen gdt for now; no urgent need to change it */ + + x86_write_percpu(xen_cr3, __pa(pgd)); + +#ifdef CONFIG_SMP + /* Don't do the full vcpu_info placement stuff until we have a + possible map. */ + per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; +#else + /* May as well do it now, since there's no good time to call + it later on UP. */ + xen_setup_vcpu_info_placement(); +#endif + + paravirt_ops.kernel_rpl = 1; + if (xen_feature(XENFEAT_supervisor_mode_kernel)) + paravirt_ops.kernel_rpl = 0; + + /* set the limit of our address space */ + reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE); + + /* set up basic CPUID stuff */ + cpu_detect(&new_cpu_data); + new_cpu_data.hard_math = 1; + new_cpu_data.x86_capability[0] = cpuid_edx(1); + + /* Poke various useful things into boot_params */ + LOADER_TYPE = (9 << 4) | 0; + INITRD_START = xen_start_info->mod_start ? __pa(xen_start_info->mod_start) : 0; + INITRD_SIZE = xen_start_info->mod_len; + + /* Start the world */ + start_kernel(); +} diff --git a/arch/i386/xen/events.c b/arch/i386/xen/events.c new file mode 100644 index 00000000000..da1b173547a --- /dev/null +++ b/arch/i386/xen/events.c @@ -0,0 +1,591 @@ +/* + * Xen event channels + * + * Xen models interrupts with abstract event channels. Because each + * domain gets 1024 event channels, but NR_IRQ is not that large, we + * must dynamically map irqs<->event channels. The event channels + * interface with the rest of the kernel by defining a xen interrupt + * chip. When an event is recieved, it is mapped to an irq and sent + * through the normal interrupt processing path. + * + * There are four kinds of events which can be mapped to an event + * channel: + * + * 1. Inter-domain notifications. This includes all the virtual + * device events, since they're driven by front-ends in another domain + * (typically dom0). + * 2. VIRQs, typically used for timers. These are per-cpu events. + * 3. IPIs. + * 4. Hardware interrupts. Not supported at present. + * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ + +#include <linux/linkage.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/string.h> + +#include <asm/ptrace.h> +#include <asm/irq.h> +#include <asm/sync_bitops.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +#include <xen/events.h> +#include <xen/interface/xen.h> +#include <xen/interface/event_channel.h> + +#include "xen-ops.h" + +/* + * This lock protects updates to the following mapping and reference-count + * arrays. The lock does not need to be acquired to read the mapping tables. + */ +static DEFINE_SPINLOCK(irq_mapping_update_lock); + +/* IRQ <-> VIRQ mapping. */ +static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1}; + +/* IRQ <-> IPI mapping */ +static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1}; + +/* Packed IRQ information: binding type, sub-type index, and event channel. */ +struct packed_irq +{ + unsigned short evtchn; + unsigned char index; + unsigned char type; +}; + +static struct packed_irq irq_info[NR_IRQS]; + +/* Binding types. */ +enum { + IRQT_UNBOUND, + IRQT_PIRQ, + IRQT_VIRQ, + IRQT_IPI, + IRQT_EVTCHN +}; + +/* Convenient shorthand for packed representation of an unbound IRQ. */ +#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0) + +static int evtchn_to_irq[NR_EVENT_CHANNELS] = { + [0 ... NR_EVENT_CHANNELS-1] = -1 +}; +static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG]; +static u8 cpu_evtchn[NR_EVENT_CHANNELS]; + +/* Reference counts for bindings to IRQs. */ +static int irq_bindcount[NR_IRQS]; + +/* Xen will never allocate port zero for any purpose. */ +#define VALID_EVTCHN(chn) ((chn) != 0) + +/* + * Force a proper event-channel callback from Xen after clearing the + * callback mask. We do this in a very simple manner, by making a call + * down into Xen. The pending flag will be checked by Xen on return. + */ +void force_evtchn_callback(void) +{ + (void)HYPERVISOR_xen_version(0, NULL); +} +EXPORT_SYMBOL_GPL(force_evtchn_callback); + +static struct irq_chip xen_dynamic_chip; + +/* Constructor for packed IRQ information. */ +static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn) +{ + return (struct packed_irq) { evtchn, index, type }; +} + +/* + * Accessors for packed IRQ information. + */ +static inline unsigned int evtchn_from_irq(int irq) +{ + return irq_info[irq].evtchn; +} + +static inline unsigned int index_from_irq(int irq) +{ + return irq_info[irq].index; +} + +static inline unsigned int type_from_irq(int irq) +{ + return irq_info[irq].type; +} + +static inline unsigned long active_evtchns(unsigned int cpu, + struct shared_info *sh, + unsigned int idx) +{ + return (sh->evtchn_pending[idx] & + cpu_evtchn_mask[cpu][idx] & + ~sh->evtchn_mask[idx]); +} + +static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) +{ + int irq = evtchn_to_irq[chn]; + + BUG_ON(irq == -1); +#ifdef CONFIG_SMP + irq_desc[irq].affinity = cpumask_of_cpu(cpu); +#endif + + __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]); + __set_bit(chn, cpu_evtchn_mask[cpu]); + + cpu_evtchn[chn] = cpu; +} + +static void init_evtchn_cpu_bindings(void) +{ +#ifdef CONFIG_SMP + int i; + /* By default all event channels notify CPU#0. */ + for (i = 0; i < NR_IRQS; i++) + irq_desc[i].affinity = cpumask_of_cpu(0); +#endif + + memset(cpu_evtchn, 0, sizeof(cpu_evtchn)); + memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0])); +} + +static inline unsigned int cpu_from_evtchn(unsigned int evtchn) +{ + return cpu_evtchn[evtchn]; +} + +static inline void clear_evtchn(int port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_clear_bit(port, &s->evtchn_pending[0]); +} + +static inline void set_evtchn(int port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_set_bit(port, &s->evtchn_pending[0]); +} + + +/** + * notify_remote_via_irq - send event to remote end of event channel via irq + * @irq: irq of event channel to send event to + * + * Unlike notify_remote_via_evtchn(), this is safe to use across + * save/restore. Notifications on a broken connection are silently + * dropped. + */ +void notify_remote_via_irq(int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + notify_remote_via_evtchn(evtchn); +} +EXPORT_SYMBOL_GPL(notify_remote_via_irq); + +static void mask_evtchn(int port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_set_bit(port, &s->evtchn_mask[0]); +} + +static void unmask_evtchn(int port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + unsigned int cpu = get_cpu(); + + BUG_ON(!irqs_disabled()); + + /* Slow path (hypercall) if this is a non-local port. */ + if (unlikely(cpu != cpu_from_evtchn(port))) { + struct evtchn_unmask unmask = { .port = port }; + (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); + } else { + struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); + + sync_clear_bit(port, &s->evtchn_mask[0]); + + /* + * The following is basically the equivalent of + * 'hw_resend_irq'. Just like a real IO-APIC we 'lose + * the interrupt edge' if the channel is masked. + */ + if (sync_test_bit(port, &s->evtchn_pending[0]) && + !sync_test_and_set_bit(port / BITS_PER_LONG, + &vcpu_info->evtchn_pending_sel)) + vcpu_info->evtchn_upcall_pending = 1; + } + + put_cpu(); +} + +static int find_unbound_irq(void) +{ + int irq; + + /* Only allocate from dynirq range */ + for (irq = 0; irq < NR_IRQS; irq++) + if (irq_bindcount[irq] == 0) + break; + + if (irq == NR_IRQS) + panic("No available IRQ to bind to: increase NR_IRQS!\n"); + + return irq; +} + +int bind_evtchn_to_irq(unsigned int evtchn) +{ + int irq; + + spin_lock(&irq_mapping_update_lock); + + irq = evtchn_to_irq[evtchn]; + + if (irq == -1) { + irq = find_unbound_irq(); + + dynamic_irq_init(irq); + set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, + handle_level_irq, "event"); + + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn); + } + + irq_bindcount[irq]++; + + spin_unlock(&irq_mapping_update_lock); + + return irq; +} +EXPORT_SYMBOL_GPL(bind_evtchn_to_irq); + +static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) +{ + struct evtchn_bind_ipi bind_ipi; + int evtchn, irq; + + spin_lock(&irq_mapping_update_lock); + + irq = per_cpu(ipi_to_irq, cpu)[ipi]; + if (irq == -1) { + irq = find_unbound_irq(); + if (irq < 0) + goto out; + + dynamic_irq_init(irq); + set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, + handle_level_irq, "ipi"); + + bind_ipi.vcpu = cpu; + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, + &bind_ipi) != 0) + BUG(); + evtchn = bind_ipi.port; + + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn); + + per_cpu(ipi_to_irq, cpu)[ipi] = irq; + + bind_evtchn_to_cpu(evtchn, cpu); + } + + irq_bindcount[irq]++; + + out: + spin_unlock(&irq_mapping_update_lock); + return irq; +} + + +static int bind_virq_to_irq(unsigned int virq, unsigned int cpu) +{ + struct evtchn_bind_virq bind_virq; + int evtchn, irq; + + spin_lock(&irq_mapping_update_lock); + + irq = per_cpu(virq_to_irq, cpu)[virq]; + + if (irq == -1) { + bind_virq.virq = virq; + bind_virq.vcpu = cpu; + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, + &bind_virq) != 0) + BUG(); + evtchn = bind_virq.port; + + irq = find_unbound_irq(); + + dynamic_irq_init(irq); + set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, + handle_level_irq, "virq"); + + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn); + + per_cpu(virq_to_irq, cpu)[virq] = irq; + + bind_evtchn_to_cpu(evtchn, cpu); + } + + irq_bindcount[irq]++; + + spin_unlock(&irq_mapping_update_lock); + + return irq; +} + +static void unbind_from_irq(unsigned int irq) +{ + struct evtchn_close close; + int evtchn = evtchn_from_irq(irq); + + spin_lock(&irq_mapping_update_lock); + + if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) { + close.port = evtchn; + if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) + BUG(); + + switch (type_from_irq(irq)) { + case IRQT_VIRQ: + per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) + [index_from_irq(irq)] = -1; + break; + default: + break; + } + + /* Closed ports are implicitly re-bound to VCPU0. */ + bind_evtchn_to_cpu(evtchn, 0); + + evtchn_to_irq[evtchn] = -1; + irq_info[irq] = IRQ_UNBOUND; + + dynamic_irq_init(irq); + } + + spin_unlock(&irq_mapping_update_lock); +} + +int bind_evtchn_to_irqhandler(unsigned int evtchn, + irqreturn_t (*handler)(int, void *), + unsigned long irqflags, + const char *devname, void *dev_id) +{ + unsigned int irq; + int retval; + + irq = bind_evtchn_to_irq(evtchn); + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} +EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler); + +int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, + irqreturn_t (*handler)(int, void *), + unsigned long irqflags, const char *devname, void *dev_id) +{ + unsigned int irq; + int retval; + + irq = bind_virq_to_irq(virq, cpu); + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} +EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler); + +int bind_ipi_to_irqhandler(enum ipi_vector ipi, + unsigned int cpu, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, + void *dev_id) +{ + int irq, retval; + + irq = bind_ipi_to_irq(ipi, cpu); + if (irq < 0) + return irq; + + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} + +void unbind_from_irqhandler(unsigned int irq, void *dev_id) +{ + free_irq(irq, dev_id); + unbind_from_irq(irq); +} +EXPORT_SYMBOL_GPL(unbind_from_irqhandler); + +void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) +{ + int irq = per_cpu(ipi_to_irq, cpu)[vector]; + BUG_ON(irq < 0); + notify_remote_via_irq(irq); +} + + +/* + * Search the CPUs pending events bitmasks. For each one found, map + * the event number to an irq, and feed it into do_IRQ() for + * handling. + * + * Xen uses a two-level bitmap to speed searching. The first level is + * a bitset of words which contain pending event bits. The second + * level is a bitset of pending events themselves. + */ +fastcall void xen_evtchn_do_upcall(struct pt_regs *regs) +{ + int cpu = get_cpu(); + struct shared_info *s = HYPERVISOR_shared_info; + struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); + unsigned long pending_words; + + vcpu_info->evtchn_upcall_pending = 0; + + /* NB. No need for a barrier here -- XCHG is a barrier on x86. */ + pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0); + while (pending_words != 0) { + unsigned long pending_bits; + int word_idx = __ffs(pending_words); + pending_words &= ~(1UL << word_idx); + + while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) { + int bit_idx = __ffs(pending_bits); + int port = (word_idx * BITS_PER_LONG) + bit_idx; + int irq = evtchn_to_irq[port]; + + if (irq != -1) { + regs->orig_eax = ~irq; + do_IRQ(regs); + } + } + } + + put_cpu(); +} + +/* Rebind an evtchn so that it gets delivered to a specific cpu */ +static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu) +{ + struct evtchn_bind_vcpu bind_vcpu; + int evtchn = evtchn_from_irq(irq); + + if (!VALID_EVTCHN(evtchn)) + return; + + /* Send future instances of this interrupt to other vcpu. */ + bind_vcpu.port = evtchn; + bind_vcpu.vcpu = tcpu; + + /* + * If this fails, it usually just indicates that we're dealing with a + * virq or IPI channel, which don't actually need to be rebound. Ignore + * it, but don't do the xenlinux-level rebind in that case. + */ + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) + bind_evtchn_to_cpu(evtchn, tcpu); +} + + +static void set_affinity_irq(unsigned irq, cpumask_t dest) +{ + unsigned tcpu = first_cpu(dest); + rebind_irq_to_cpu(irq, tcpu); +} + +static void enable_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + unmask_evtchn(evtchn); +} + +static void disable_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + mask_evtchn(evtchn); +} + +static void ack_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + move_native_irq(irq); + + if (VALID_EVTCHN(evtchn)) + clear_evtchn(evtchn); +} + +static int retrigger_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + int ret = 0; + + if (VALID_EVTCHN(evtchn)) { + set_evtchn(evtchn); + ret = 1; + } + + return ret; +} + +static struct irq_chip xen_dynamic_chip __read_mostly = { + .name = "xen-dyn", + .mask = disable_dynirq, + .unmask = enable_dynirq, + .ack = ack_dynirq, + .set_affinity = set_affinity_irq, + .retrigger = retrigger_dynirq, +}; + +void __init xen_init_IRQ(void) +{ + int i; + + init_evtchn_cpu_bindings(); + + /* No event channels are 'live' right now. */ + for (i = 0; i < NR_EVENT_CHANNELS; i++) + mask_evtchn(i); + + /* Dynamic IRQ space is currently unbound. Zero the refcnts. */ + for (i = 0; i < NR_IRQS; i++) + irq_bindcount[i] = 0; + + irq_ctx_init(smp_processor_id()); +} diff --git a/arch/i386/xen/features.c b/arch/i386/xen/features.c new file mode 100644 index 00000000000..0707714e40d --- /dev/null +++ b/arch/i386/xen/features.c @@ -0,0 +1,29 @@ +/****************************************************************************** + * features.c + * + * Xen feature flags. + * + * Copyright (c) 2006, Ian Campbell, XenSource Inc. + */ +#include <linux/types.h> +#include <linux/cache.h> +#include <linux/module.h> +#include <asm/xen/hypervisor.h> +#include <xen/features.h> + +u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly; +EXPORT_SYMBOL_GPL(xen_features); + +void xen_setup_features(void) +{ + struct xen_feature_info fi; + int i, j; + + for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) { + fi.submap_idx = i; + if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0) + break; + for (j = 0; j < 32; j++) + xen_features[i * 32 + j] = !!(fi.submap & 1<<j); + } +} diff --git a/arch/i386/xen/manage.c b/arch/i386/xen/manage.c new file mode 100644 index 00000000000..aa7af9e6abc --- /dev/null +++ b/arch/i386/xen/manage.c @@ -0,0 +1,143 @@ +/* + * Handle extern requests for shutdown, reboot and sysrq + */ +#include <linux/kernel.h> +#include <linux/err.h> +#include <linux/reboot.h> +#include <linux/sysrq.h> + +#include <xen/xenbus.h> + +#define SHUTDOWN_INVALID -1 +#define SHUTDOWN_POWEROFF 0 +#define SHUTDOWN_SUSPEND 2 +/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only + * report a crash, not be instructed to crash! + * HALT is the same as POWEROFF, as far as we're concerned. The tools use + * the distinction when we return the reason code to them. + */ +#define SHUTDOWN_HALT 4 + +/* Ignore multiple shutdown requests. */ +static int shutting_down = SHUTDOWN_INVALID; + +static void shutdown_handler(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + char *str; + struct xenbus_transaction xbt; + int err; + + if (shutting_down != SHUTDOWN_INVALID) + return; + + again: + err = xenbus_transaction_start(&xbt); + if (err) + return; + + str = (char *)xenbus_read(xbt, "control", "shutdown", NULL); + /* Ignore read errors and empty reads. */ + if (XENBUS_IS_ERR_READ(str)) { + xenbus_transaction_end(xbt, 1); + return; + } + + xenbus_write(xbt, "control", "shutdown", ""); + + err = xenbus_transaction_end(xbt, 0); + if (err == -EAGAIN) { + kfree(str); + goto again; + } + + if (strcmp(str, "poweroff") == 0 || + strcmp(str, "halt") == 0) + orderly_poweroff(false); + else if (strcmp(str, "reboot") == 0) + ctrl_alt_del(); + else { + printk(KERN_INFO "Ignoring shutdown request: %s\n", str); + shutting_down = SHUTDOWN_INVALID; + } + + kfree(str); +} + +static void sysrq_handler(struct xenbus_watch *watch, const char **vec, + unsigned int len) +{ + char sysrq_key = '\0'; + struct xenbus_transaction xbt; + int err; + + again: + err = xenbus_transaction_start(&xbt); + if (err) + return; + if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) { + printk(KERN_ERR "Unable to read sysrq code in " + "control/sysrq\n"); + xenbus_transaction_end(xbt, 1); + return; + } + + if (sysrq_key != '\0') + xenbus_printf(xbt, "control", "sysrq", "%c", '\0'); + + err = xenbus_transaction_end(xbt, 0); + if (err == -EAGAIN) + goto again; + + if (sysrq_key != '\0') + handle_sysrq(sysrq_key, NULL); +} + +static struct xenbus_watch shutdown_watch = { + .node = "control/shutdown", + .callback = shutdown_handler +}; + +static struct xenbus_watch sysrq_watch = { + .node = "control/sysrq", + .callback = sysrq_handler +}; + +static int setup_shutdown_watcher(void) +{ + int err; + + err = register_xenbus_watch(&shutdown_watch); + if (err) { + printk(KERN_ERR "Failed to set shutdown watcher\n"); + return err; + } + + err = register_xenbus_watch(&sysrq_watch); + if (err) { + printk(KERN_ERR "Failed to set sysrq watcher\n"); + return err; + } + + return 0; +} + +static int shutdown_event(struct notifier_block *notifier, + unsigned long event, + void *data) +{ + setup_shutdown_watcher(); + return NOTIFY_DONE; +} + +static int __init setup_shutdown_event(void) +{ + static struct notifier_block xenstore_notifier = { + .notifier_call = shutdown_event + }; + register_xenstore_notifier(&xenstore_notifier); + + return 0; +} + +subsys_initcall(setup_shutdown_event); diff --git a/arch/i386/xen/mmu.c b/arch/i386/xen/mmu.c new file mode 100644 index 00000000000..4ae038aa6c2 --- /dev/null +++ b/arch/i386/xen/mmu.c @@ -0,0 +1,564 @@ +/* + * Xen mmu operations + * + * This file contains the various mmu fetch and update operations. + * The most important job they must perform is the mapping between the + * domain's pfn and the overall machine mfns. + * + * Xen allows guests to directly update the pagetable, in a controlled + * fashion. In other words, the guest modifies the same pagetable + * that the CPU actually uses, which eliminates the overhead of having + * a separate shadow pagetable. + * + * In order to allow this, it falls on the guest domain to map its + * notion of a "physical" pfn - which is just a domain-local linear + * address - into a real "machine address" which the CPU's MMU can + * use. + * + * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be + * inserted directly into the pagetable. When creating a new + * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely, + * when reading the content back with __(pgd|pmd|pte)_val, it converts + * the mfn back into a pfn. + * + * The other constraint is that all pages which make up a pagetable + * must be mapped read-only in the guest. This prevents uncontrolled + * guest updates to the pagetable. Xen strictly enforces this, and + * will disallow any pagetable update which will end up mapping a + * pagetable page RW, and will disallow using any writable page as a + * pagetable. + * + * Naively, when loading %cr3 with the base of a new pagetable, Xen + * would need to validate the whole pagetable before going on. + * Naturally, this is quite slow. The solution is to "pin" a + * pagetable, which enforces all the constraints on the pagetable even + * when it is not actively in use. This menas that Xen can be assured + * that it is still valid when you do load it into %cr3, and doesn't + * need to revalidate it. + * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ +#include <linux/sched.h> +#include <linux/highmem.h> +#include <linux/bug.h> +#include <linux/sched.h> + +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/mmu_context.h> +#include <asm/paravirt.h> + +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +#include <xen/page.h> +#include <xen/interface/xen.h> + +#include "multicalls.h" +#include "mmu.h" + +xmaddr_t arbitrary_virt_to_machine(unsigned long address) +{ + pte_t *pte = lookup_address(address); + unsigned offset = address & PAGE_MASK; + + BUG_ON(pte == NULL); + + return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset); +} + +void make_lowmem_page_readonly(void *vaddr) +{ + pte_t *pte, ptev; + unsigned long address = (unsigned long)vaddr; + + pte = lookup_address(address); + BUG_ON(pte == NULL); + + ptev = pte_wrprotect(*pte); + + if (HYPERVISOR_update_va_mapping(address, ptev, 0)) + BUG(); +} + +void make_lowmem_page_readwrite(void *vaddr) +{ + pte_t *pte, ptev; + unsigned long address = (unsigned long)vaddr; + + pte = lookup_address(address); + BUG_ON(pte == NULL); + + ptev = pte_mkwrite(*pte); + + if (HYPERVISOR_update_va_mapping(address, ptev, 0)) + BUG(); +} + + +void xen_set_pmd(pmd_t *ptr, pmd_t val) +{ + struct multicall_space mcs; + struct mmu_update *u; + + preempt_disable(); + + mcs = xen_mc_entry(sizeof(*u)); + u = mcs.args; + u->ptr = virt_to_machine(ptr).maddr; + u->val = pmd_val_ma(val); + MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); +} + +/* + * Associate a virtual page frame with a given physical page frame + * and protection flags for that frame. + */ +void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = swapper_pg_dir + pgd_index(vaddr); + if (pgd_none(*pgd)) { + BUG(); + return; + } + pud = pud_offset(pgd, vaddr); + if (pud_none(*pud)) { + BUG(); + return; + } + pmd = pmd_offset(pud, vaddr); + if (pmd_none(*pmd)) { + BUG(); + return; + } + pte = pte_offset_kernel(pmd, vaddr); + /* <mfn,flags> stored as-is, to permit clearing entries */ + xen_set_pte(pte, mfn_pte(mfn, flags)); + + /* + * It's enough to flush this one mapping. + * (PGE mappings get flushed as well) + */ + __flush_tlb_one(vaddr); +} + +void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval) +{ + if (mm == current->mm || mm == &init_mm) { + if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) { + struct multicall_space mcs; + mcs = xen_mc_entry(0); + + MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); + xen_mc_issue(PARAVIRT_LAZY_MMU); + return; + } else + if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0) + return; + } + xen_set_pte(ptep, pteval); +} + +#ifdef CONFIG_X86_PAE +void xen_set_pud(pud_t *ptr, pud_t val) +{ + struct multicall_space mcs; + struct mmu_update *u; + + preempt_disable(); + + mcs = xen_mc_entry(sizeof(*u)); + u = mcs.args; + u->ptr = virt_to_machine(ptr).maddr; + u->val = pud_val_ma(val); + MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); +} + +void xen_set_pte(pte_t *ptep, pte_t pte) +{ + ptep->pte_high = pte.pte_high; + smp_wmb(); + ptep->pte_low = pte.pte_low; +} + +void xen_set_pte_atomic(pte_t *ptep, pte_t pte) +{ + set_64bit((u64 *)ptep, pte_val_ma(pte)); +} + +void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + ptep->pte_low = 0; + smp_wmb(); /* make sure low gets written first */ + ptep->pte_high = 0; +} + +void xen_pmd_clear(pmd_t *pmdp) +{ + xen_set_pmd(pmdp, __pmd(0)); +} + +unsigned long long xen_pte_val(pte_t pte) +{ + unsigned long long ret = 0; + + if (pte.pte_low) { + ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low; + ret = machine_to_phys(XMADDR(ret)).paddr | 1; + } + + return ret; +} + +unsigned long long xen_pmd_val(pmd_t pmd) +{ + unsigned long long ret = pmd.pmd; + if (ret) + ret = machine_to_phys(XMADDR(ret)).paddr | 1; + return ret; +} + +unsigned long long xen_pgd_val(pgd_t pgd) +{ + unsigned long long ret = pgd.pgd; + if (ret) + ret = machine_to_phys(XMADDR(ret)).paddr | 1; + return ret; +} + +pte_t xen_make_pte(unsigned long long pte) +{ + if (pte & 1) + pte = phys_to_machine(XPADDR(pte)).maddr; + + return (pte_t){ pte, pte >> 32 }; +} + +pmd_t xen_make_pmd(unsigned long long pmd) +{ + if (pmd & 1) + pmd = phys_to_machine(XPADDR(pmd)).maddr; + + return (pmd_t){ pmd }; +} + +pgd_t xen_make_pgd(unsigned long long pgd) +{ + if (pgd & _PAGE_PRESENT) + pgd = phys_to_machine(XPADDR(pgd)).maddr; + + return (pgd_t){ pgd }; +} +#else /* !PAE */ +void xen_set_pte(pte_t *ptep, pte_t pte) +{ + *ptep = pte; +} + +unsigned long xen_pte_val(pte_t pte) +{ + unsigned long ret = pte.pte_low; + + if (ret & _PAGE_PRESENT) + ret = machine_to_phys(XMADDR(ret)).paddr; + + return ret; +} + +unsigned long xen_pgd_val(pgd_t pgd) +{ + unsigned long ret = pgd.pgd; + if (ret) + ret = machine_to_phys(XMADDR(ret)).paddr | 1; + return ret; +} + +pte_t xen_make_pte(unsigned long pte) +{ + if (pte & _PAGE_PRESENT) + pte = phys_to_machine(XPADDR(pte)).maddr; + + return (pte_t){ pte }; +} + +pgd_t xen_make_pgd(unsigned long pgd) +{ + if (pgd & _PAGE_PRESENT) + pgd = phys_to_machine(XPADDR(pgd)).maddr; + + return (pgd_t){ pgd }; +} +#endif /* CONFIG_X86_PAE */ + + + +/* + (Yet another) pagetable walker. This one is intended for pinning a + pagetable. This means that it walks a pagetable and calls the + callback function on each page it finds making up the page table, + at every level. It walks the entire pagetable, but it only bothers + pinning pte pages which are below pte_limit. In the normal case + this will be TASK_SIZE, but at boot we need to pin up to + FIXADDR_TOP. But the important bit is that we don't pin beyond + there, because then we start getting into Xen's ptes. +*/ +static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned), + unsigned long limit) +{ + pgd_t *pgd = pgd_base; + int flush = 0; + unsigned long addr = 0; + unsigned long pgd_next; + + BUG_ON(limit > FIXADDR_TOP); + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 0; + + for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) { + pud_t *pud; + unsigned long pud_limit, pud_next; + + pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP); + + if (!pgd_val(*pgd)) + continue; + + pud = pud_offset(pgd, 0); + + if (PTRS_PER_PUD > 1) /* not folded */ + flush |= (*func)(virt_to_page(pud), 0); + + for (; addr != pud_limit; pud++, addr = pud_next) { + pmd_t *pmd; + unsigned long pmd_limit; + + pud_next = pud_addr_end(addr, pud_limit); + + if (pud_next < limit) + pmd_limit = pud_next; + else + pmd_limit = limit; + + if (pud_none(*pud)) + continue; + + pmd = pmd_offset(pud, 0); + + if (PTRS_PER_PMD > 1) /* not folded */ + flush |= (*func)(virt_to_page(pmd), 0); + + for (; addr != pmd_limit; pmd++) { + addr += (PAGE_SIZE * PTRS_PER_PTE); + if ((pmd_limit-1) < (addr-1)) { + addr = pmd_limit; + break; + } + + if (pmd_none(*pmd)) + continue; + + flush |= (*func)(pmd_page(*pmd), 0); + } + } + } + + flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH); + + return flush; +} + +static int pin_page(struct page *page, unsigned flags) +{ + unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags); + int flush; + + if (pgfl) + flush = 0; /* already pinned */ + else if (PageHighMem(page)) + /* kmaps need flushing if we found an unpinned + highpage */ + flush = 1; + else { + void *pt = lowmem_page_address(page); + unsigned long pfn = page_to_pfn(page); + struct multicall_space mcs = __xen_mc_entry(0); + + flush = 0; + + MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, + pfn_pte(pfn, PAGE_KERNEL_RO), + flags); + } + + return flush; +} + +/* This is called just after a mm has been created, but it has not + been used yet. We need to make sure that its pagetable is all + read-only, and can be pinned. */ +void xen_pgd_pin(pgd_t *pgd) +{ + struct multicall_space mcs; + struct mmuext_op *op; + + xen_mc_batch(); + + if (pgd_walk(pgd, pin_page, TASK_SIZE)) { + /* re-enable interrupts for kmap_flush_unused */ + xen_mc_issue(0); + kmap_flush_unused(); + xen_mc_batch(); + } + + mcs = __xen_mc_entry(sizeof(*op)); + op = mcs.args; + +#ifdef CONFIG_X86_PAE + op->cmd = MMUEXT_PIN_L3_TABLE; +#else + op->cmd = MMUEXT_PIN_L2_TABLE; +#endif + op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(0); +} + +/* The init_mm pagetable is really pinned as soon as its created, but + that's before we have page structures to store the bits. So do all + the book-keeping now. */ +static __init int mark_pinned(struct page *page, unsigned flags) +{ + SetPagePinned(page); + return 0; +} + +void __init xen_mark_init_mm_pinned(void) +{ + pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); +} + +static int unpin_page(struct page *page, unsigned flags) +{ + unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags); + + if (pgfl && !PageHighMem(page)) { + void *pt = lowmem_page_address(page); + unsigned long pfn = page_to_pfn(page); + struct multicall_space mcs = __xen_mc_entry(0); + + MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, + pfn_pte(pfn, PAGE_KERNEL), + flags); + } + + return 0; /* never need to flush on unpin */ +} + +/* Release a pagetables pages back as normal RW */ +static void xen_pgd_unpin(pgd_t *pgd) +{ + struct mmuext_op *op; + struct multicall_space mcs; + + xen_mc_batch(); + + mcs = __xen_mc_entry(sizeof(*op)); + + op = mcs.args; + op->cmd = MMUEXT_UNPIN_TABLE; + op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); + + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + pgd_walk(pgd, unpin_page, TASK_SIZE); + + xen_mc_issue(0); +} + +void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) +{ + spin_lock(&next->page_table_lock); + xen_pgd_pin(next->pgd); + spin_unlock(&next->page_table_lock); +} + +void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) +{ + spin_lock(&mm->page_table_lock); + xen_pgd_pin(mm->pgd); + spin_unlock(&mm->page_table_lock); +} + + +#ifdef CONFIG_SMP +/* Another cpu may still have their %cr3 pointing at the pagetable, so + we need to repoint it somewhere else before we can unpin it. */ +static void drop_other_mm_ref(void *info) +{ + struct mm_struct *mm = info; + + if (__get_cpu_var(cpu_tlbstate).active_mm == mm) + leave_mm(smp_processor_id()); +} + +static void drop_mm_ref(struct mm_struct *mm) +{ + if (current->active_mm == mm) { + if (current->mm == mm) + load_cr3(swapper_pg_dir); + else + leave_mm(smp_processor_id()); + } + + if (!cpus_empty(mm->cpu_vm_mask)) + xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref, + mm, 1); +} +#else +static void drop_mm_ref(struct mm_struct *mm) +{ + if (current->active_mm == mm) + load_cr3(swapper_pg_dir); +} +#endif + +/* + * While a process runs, Xen pins its pagetables, which means that the + * hypervisor forces it to be read-only, and it controls all updates + * to it. This means that all pagetable updates have to go via the + * hypervisor, which is moderately expensive. + * + * Since we're pulling the pagetable down, we switch to use init_mm, + * unpin old process pagetable and mark it all read-write, which + * allows further operations on it to be simple memory accesses. + * + * The only subtle point is that another CPU may be still using the + * pagetable because of lazy tlb flushing. This means we need need to + * switch all CPUs off this pagetable before we can unpin it. + */ +void xen_exit_mmap(struct mm_struct *mm) +{ + get_cpu(); /* make sure we don't move around */ + drop_mm_ref(mm); + put_cpu(); + + spin_lock(&mm->page_table_lock); + xen_pgd_unpin(mm->pgd); + spin_unlock(&mm->page_table_lock); +} diff --git a/arch/i386/xen/mmu.h b/arch/i386/xen/mmu.h new file mode 100644 index 00000000000..c9ff27f3ac3 --- /dev/null +++ b/arch/i386/xen/mmu.h @@ -0,0 +1,60 @@ +#ifndef _XEN_MMU_H + +#include <linux/linkage.h> +#include <asm/page.h> + +/* + * Page-directory addresses above 4GB do not fit into architectural %cr3. + * When accessing %cr3, or equivalent field in vcpu_guest_context, guests + * must use the following accessor macros to pack/unpack valid MFNs. + * + * Note that Xen is using the fact that the pagetable base is always + * page-aligned, and putting the 12 MSB of the address into the 12 LSB + * of cr3. + */ +#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20)) +#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20)) + + +void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); + +void xen_set_pte(pte_t *ptep, pte_t pteval); +void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval); +void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval); + +void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next); +void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); +void xen_exit_mmap(struct mm_struct *mm); + +void xen_pgd_pin(pgd_t *pgd); +//void xen_pgd_unpin(pgd_t *pgd); + +#ifdef CONFIG_X86_PAE +unsigned long long xen_pte_val(pte_t); +unsigned long long xen_pmd_val(pmd_t); +unsigned long long xen_pgd_val(pgd_t); + +pte_t xen_make_pte(unsigned long long); +pmd_t xen_make_pmd(unsigned long long); +pgd_t xen_make_pgd(unsigned long long); + +void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval); +void xen_set_pte_atomic(pte_t *ptep, pte_t pte); +void xen_set_pud(pud_t *ptr, pud_t val); +void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +void xen_pmd_clear(pmd_t *pmdp); + + +#else +unsigned long xen_pte_val(pte_t); +unsigned long xen_pmd_val(pmd_t); +unsigned long xen_pgd_val(pgd_t); + +pte_t xen_make_pte(unsigned long); +pmd_t xen_make_pmd(unsigned long); +pgd_t xen_make_pgd(unsigned long); +#endif + +#endif /* _XEN_MMU_H */ diff --git a/arch/i386/xen/multicalls.c b/arch/i386/xen/multicalls.c new file mode 100644 index 00000000000..c837e8e463d --- /dev/null +++ b/arch/i386/xen/multicalls.c @@ -0,0 +1,90 @@ +/* + * Xen hypercall batching. + * + * Xen allows multiple hypercalls to be issued at once, using the + * multicall interface. This allows the cost of trapping into the + * hypervisor to be amortized over several calls. + * + * This file implements a simple interface for multicalls. There's a + * per-cpu buffer of outstanding multicalls. When you want to queue a + * multicall for issuing, you can allocate a multicall slot for the + * call and its arguments, along with storage for space which is + * pointed to by the arguments (for passing pointers to structures, + * etc). When the multicall is actually issued, all the space for the + * commands and allocated memory is freed for reuse. + * + * Multicalls are flushed whenever any of the buffers get full, or + * when explicitly requested. There's no way to get per-multicall + * return results back. It will BUG if any of the multicalls fail. + * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ +#include <linux/percpu.h> +#include <linux/hardirq.h> + +#include <asm/xen/hypercall.h> + +#include "multicalls.h" + +#define MC_BATCH 32 +#define MC_ARGS (MC_BATCH * 16 / sizeof(u64)) + +struct mc_buffer { + struct multicall_entry entries[MC_BATCH]; + u64 args[MC_ARGS]; + unsigned mcidx, argidx; +}; + +static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); +DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); + +void xen_mc_flush(void) +{ + struct mc_buffer *b = &__get_cpu_var(mc_buffer); + int ret = 0; + unsigned long flags; + + BUG_ON(preemptible()); + + /* Disable interrupts in case someone comes in and queues + something in the middle */ + local_irq_save(flags); + + if (b->mcidx) { + int i; + + if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0) + BUG(); + for (i = 0; i < b->mcidx; i++) + if (b->entries[i].result < 0) + ret++; + b->mcidx = 0; + b->argidx = 0; + } else + BUG_ON(b->argidx != 0); + + local_irq_restore(flags); + + BUG_ON(ret); +} + +struct multicall_space __xen_mc_entry(size_t args) +{ + struct mc_buffer *b = &__get_cpu_var(mc_buffer); + struct multicall_space ret; + unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64); + + BUG_ON(preemptible()); + BUG_ON(argspace > MC_ARGS); + + if (b->mcidx == MC_BATCH || + (b->argidx + argspace) > MC_ARGS) + xen_mc_flush(); + + ret.mc = &b->entries[b->mcidx]; + b->mcidx++; + ret.args = &b->args[b->argidx]; + b->argidx += argspace; + + return ret; +} diff --git a/arch/i386/xen/multicalls.h b/arch/i386/xen/multicalls.h new file mode 100644 index 00000000000..e6f7530b156 --- /dev/null +++ b/arch/i386/xen/multicalls.h @@ -0,0 +1,45 @@ +#ifndef _XEN_MULTICALLS_H +#define _XEN_MULTICALLS_H + +#include "xen-ops.h" + +/* Multicalls */ +struct multicall_space +{ + struct multicall_entry *mc; + void *args; +}; + +/* Allocate room for a multicall and its args */ +struct multicall_space __xen_mc_entry(size_t args); + +DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags); + +/* Call to start a batch of multiple __xen_mc_entry()s. Must be + paired with xen_mc_issue() */ +static inline void xen_mc_batch(void) +{ + /* need to disable interrupts until this entry is complete */ + local_irq_save(__get_cpu_var(xen_mc_irq_flags)); +} + +static inline struct multicall_space xen_mc_entry(size_t args) +{ + xen_mc_batch(); + return __xen_mc_entry(args); +} + +/* Flush all pending multicalls */ +void xen_mc_flush(void); + +/* Issue a multicall if we're not in a lazy mode */ +static inline void xen_mc_issue(unsigned mode) +{ + if ((xen_get_lazy_mode() & mode) == 0) + xen_mc_flush(); + + /* restore flags saved in xen_mc_batch */ + local_irq_restore(x86_read_percpu(xen_mc_irq_flags)); +} + +#endif /* _XEN_MULTICALLS_H */ diff --git a/arch/i386/xen/setup.c b/arch/i386/xen/setup.c new file mode 100644 index 00000000000..f84e7722664 --- /dev/null +++ b/arch/i386/xen/setup.c @@ -0,0 +1,111 @@ +/* + * Machine specific setup for xen + * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ + +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/pm.h> + +#include <asm/elf.h> +#include <asm/e820.h> +#include <asm/setup.h> +#include <asm/xen/hypervisor.h> +#include <asm/xen/hypercall.h> + +#include <xen/interface/physdev.h> +#include <xen/features.h> + +#include "xen-ops.h" +#include "vdso.h" + +/* These are code, but not functions. Defined in entry.S */ +extern const char xen_hypervisor_callback[]; +extern const char xen_failsafe_callback[]; + +unsigned long *phys_to_machine_mapping; +EXPORT_SYMBOL(phys_to_machine_mapping); + +/** + * machine_specific_memory_setup - Hook for machine specific memory setup. + **/ + +char * __init xen_memory_setup(void) +{ + unsigned long max_pfn = xen_start_info->nr_pages; + + e820.nr_map = 0; + add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM); + + return "Xen"; +} + +static void xen_idle(void) +{ + local_irq_disable(); + + if (need_resched()) + local_irq_enable(); + else { + current_thread_info()->status &= ~TS_POLLING; + smp_mb__after_clear_bit(); + safe_halt(); + current_thread_info()->status |= TS_POLLING; + } +} + +/* + * Set the bit indicating "nosegneg" library variants should be used. + */ +static void fiddle_vdso(void) +{ + extern u32 VDSO_NOTE_MASK; /* See ../kernel/vsyscall-note.S. */ + extern char vsyscall_int80_start; + u32 *mask = (u32 *) ((unsigned long) &VDSO_NOTE_MASK - VDSO_PRELINK + + &vsyscall_int80_start); + *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; +} + +void __init xen_arch_setup(void) +{ + struct physdev_set_iopl set_iopl; + int rc; + + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); + + if (!xen_feature(XENFEAT_auto_translated_physmap)) + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); + + HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback, + __KERNEL_CS, (unsigned long)xen_failsafe_callback); + + set_iopl.iopl = 1; + rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); + if (rc != 0) + printk(KERN_INFO "physdev_op failed %d\n", rc); + +#ifdef CONFIG_ACPI + if (!(xen_start_info->flags & SIF_INITDOMAIN)) { + printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); + disable_acpi(); + } +#endif + + memcpy(boot_command_line, xen_start_info->cmd_line, + MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ? + COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE); + + pm_idle = xen_idle; + +#ifdef CONFIG_SMP + /* fill cpus_possible with all available cpus */ + xen_fill_possible_map(); +#endif + + paravirt_disable_iospace(); + + fiddle_vdso(); +} diff --git a/arch/i386/xen/smp.c b/arch/i386/xen/smp.c new file mode 100644 index 00000000000..557b8e24706 --- /dev/null +++ b/arch/i386/xen/smp.c @@ -0,0 +1,404 @@ +/* + * Xen SMP support + * + * This file implements the Xen versions of smp_ops. SMP under Xen is + * very straightforward. Bringing a CPU up is simply a matter of + * loading its initial context and setting it running. + * + * IPIs are handled through the Xen event mechanism. + * + * Because virtual CPUs can be scheduled onto any real CPU, there's no + * useful topology information for the kernel to make use of. As a + * result, all CPUs are treated as if they're single-core and + * single-threaded. + * + * This does not handle HOTPLUG_CPU yet. + */ +#include <linux/sched.h> +#include <linux/err.h> +#include <linux/smp.h> + +#include <asm/paravirt.h> +#include <asm/desc.h> +#include <asm/pgtable.h> +#include <asm/cpu.h> + +#include <xen/interface/xen.h> +#include <xen/interface/vcpu.h> + +#include <asm/xen/interface.h> +#include <asm/xen/hypercall.h> + +#include <xen/page.h> +#include <xen/events.h> + +#include "xen-ops.h" +#include "mmu.h" + +static cpumask_t cpu_initialized_map; +static DEFINE_PER_CPU(int, resched_irq); +static DEFINE_PER_CPU(int, callfunc_irq); + +/* + * Structure and data for smp_call_function(). This is designed to minimise + * static memory requirements. It also looks cleaner. + */ +static DEFINE_SPINLOCK(call_lock); + +struct call_data_struct { + void (*func) (void *info); + void *info; + atomic_t started; + atomic_t finished; + int wait; +}; + +static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); + +static struct call_data_struct *call_data; + +/* + * Reschedule call back. Nothing to do, + * all the work is done automatically when + * we return from the interrupt. + */ +static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) +{ + return IRQ_HANDLED; +} + +static __cpuinit void cpu_bringup_and_idle(void) +{ + int cpu = smp_processor_id(); + + cpu_init(); + + preempt_disable(); + per_cpu(cpu_state, cpu) = CPU_ONLINE; + + xen_setup_cpu_clockevents(); + + /* We can take interrupts now: we're officially "up". */ + local_irq_enable(); + + wmb(); /* make sure everything is out */ + cpu_idle(); +} + +static int xen_smp_intr_init(unsigned int cpu) +{ + int rc; + const char *resched_name, *callfunc_name; + + per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1; + + resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu); + rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, + cpu, + xen_reschedule_interrupt, + IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + resched_name, + NULL); + if (rc < 0) + goto fail; + per_cpu(resched_irq, cpu) = rc; + + callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu); + rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR, + cpu, + xen_call_function_interrupt, + IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + callfunc_name, + NULL); + if (rc < 0) + goto fail; + per_cpu(callfunc_irq, cpu) = rc; + + return 0; + + fail: + if (per_cpu(resched_irq, cpu) >= 0) + unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); + if (per_cpu(callfunc_irq, cpu) >= 0) + unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); + return rc; +} + +void __init xen_fill_possible_map(void) +{ + int i, rc; + + for (i = 0; i < NR_CPUS; i++) { + rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); + if (rc >= 0) + cpu_set(i, cpu_possible_map); + } +} + +void __init xen_smp_prepare_boot_cpu(void) +{ + int cpu; + + BUG_ON(smp_processor_id() != 0); + native_smp_prepare_boot_cpu(); + + /* We've switched to the "real" per-cpu gdt, so make sure the + old memory can be recycled */ + make_lowmem_page_readwrite(&per_cpu__gdt_page); + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + cpus_clear(cpu_sibling_map[cpu]); + cpus_clear(cpu_core_map[cpu]); + } + + xen_setup_vcpu_info_placement(); +} + +void __init xen_smp_prepare_cpus(unsigned int max_cpus) +{ + unsigned cpu; + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + cpus_clear(cpu_sibling_map[cpu]); + cpus_clear(cpu_core_map[cpu]); + } + + smp_store_cpu_info(0); + set_cpu_sibling_map(0); + + if (xen_smp_intr_init(0)) + BUG(); + + cpu_initialized_map = cpumask_of_cpu(0); + + /* Restrict the possible_map according to max_cpus. */ + while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) { + for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--) + continue; + cpu_clear(cpu, cpu_possible_map); + } + + for_each_possible_cpu (cpu) { + struct task_struct *idle; + + if (cpu == 0) + continue; + + idle = fork_idle(cpu); + if (IS_ERR(idle)) + panic("failed fork for CPU %d", cpu); + + cpu_set(cpu, cpu_present_map); + } + + //init_xenbus_allowed_cpumask(); +} + +static __cpuinit int +cpu_initialize_context(unsigned int cpu, struct task_struct *idle) +{ + struct vcpu_guest_context *ctxt; + struct gdt_page *gdt = &per_cpu(gdt_page, cpu); + + if (cpu_test_and_set(cpu, cpu_initialized_map)) + return 0; + + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); + if (ctxt == NULL) + return -ENOMEM; + + ctxt->flags = VGCF_IN_KERNEL; + ctxt->user_regs.ds = __USER_DS; + ctxt->user_regs.es = __USER_DS; + ctxt->user_regs.fs = __KERNEL_PERCPU; + ctxt->user_regs.gs = 0; + ctxt->user_regs.ss = __KERNEL_DS; + ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; + ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ + + memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); + + xen_copy_trap_info(ctxt->trap_ctxt); + + ctxt->ldt_ents = 0; + + BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK); + make_lowmem_page_readonly(gdt->gdt); + + ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt); + ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt); + + ctxt->user_regs.cs = __KERNEL_CS; + ctxt->user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs); + + ctxt->kernel_ss = __KERNEL_DS; + ctxt->kernel_sp = idle->thread.esp0; + + ctxt->event_callback_cs = __KERNEL_CS; + ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback; + ctxt->failsafe_callback_cs = __KERNEL_CS; + ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; + + per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); + ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir)); + + if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt)) + BUG(); + + kfree(ctxt); + return 0; +} + +int __cpuinit xen_cpu_up(unsigned int cpu) +{ + struct task_struct *idle = idle_task(cpu); + int rc; + +#if 0 + rc = cpu_up_check(cpu); + if (rc) + return rc; +#endif + + init_gdt(cpu); + per_cpu(current_task, cpu) = idle; + irq_ctx_init(cpu); + xen_setup_timer(cpu); + + /* make sure interrupts start blocked */ + per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; + + rc = cpu_initialize_context(cpu, idle); + if (rc) + return rc; + + if (num_online_cpus() == 1) + alternatives_smp_switch(1); + + rc = xen_smp_intr_init(cpu); + if (rc) + return rc; + + smp_store_cpu_info(cpu); + set_cpu_sibling_map(cpu); + /* This must be done before setting cpu_online_map */ + wmb(); + + cpu_set(cpu, cpu_online_map); + + rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); + BUG_ON(rc); + + return 0; +} + +void xen_smp_cpus_done(unsigned int max_cpus) +{ +} + +static void stop_self(void *v) +{ + int cpu = smp_processor_id(); + + /* make sure we're not pinning something down */ + load_cr3(swapper_pg_dir); + /* should set up a minimal gdt */ + + HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL); + BUG(); +} + +void xen_smp_send_stop(void) +{ + smp_call_function(stop_self, NULL, 0, 0); +} + +void xen_smp_send_reschedule(int cpu) +{ + xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); +} + + +static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector) +{ + unsigned cpu; + + cpus_and(mask, mask, cpu_online_map); + + for_each_cpu_mask(cpu, mask) + xen_send_IPI_one(cpu, vector); +} + +static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) +{ + void (*func) (void *info) = call_data->func; + void *info = call_data->info; + int wait = call_data->wait; + + /* + * Notify initiating CPU that I've grabbed the data and am + * about to execute the function + */ + mb(); + atomic_inc(&call_data->started); + /* + * At this point the info structure may be out of scope unless wait==1 + */ + irq_enter(); + (*func)(info); + irq_exit(); + + if (wait) { + mb(); /* commit everything before setting finished */ + atomic_inc(&call_data->finished); + } + + return IRQ_HANDLED; +} + +int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), + void *info, int wait) +{ + struct call_data_struct data; + int cpus; + + /* Holding any lock stops cpus from going down. */ + spin_lock(&call_lock); + + cpu_clear(smp_processor_id(), mask); + + cpus = cpus_weight(mask); + if (!cpus) { + spin_unlock(&call_lock); + return 0; + } + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + call_data = &data; + mb(); /* write everything before IPI */ + + /* Send a message to other CPUs and wait for them to respond */ + xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); + + /* Make sure other vcpus get a chance to run. + XXX too severe? Maybe we should check the other CPU's states? */ + HYPERVISOR_sched_op(SCHEDOP_yield, 0); + + /* Wait for response */ + while (atomic_read(&data.started) != cpus || + (wait && atomic_read(&data.finished) != cpus)) + cpu_relax(); + + spin_unlock(&call_lock); + + return 0; +} diff --git a/arch/i386/xen/time.c b/arch/i386/xen/time.c new file mode 100644 index 00000000000..dfd6db69ead --- /dev/null +++ b/arch/i386/xen/time.c @@ -0,0 +1,593 @@ +/* + * Xen time implementation. + * + * This is implemented in terms of a clocksource driver which uses + * the hypervisor clock as a nanosecond timebase, and a clockevent + * driver which uses the hypervisor's timer mechanism. + * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ +#include <linux/kernel.h> +#include <linux/interrupt.h> +#include <linux/clocksource.h> +#include <linux/clockchips.h> +#include <linux/kernel_stat.h> + +#include <asm/xen/hypervisor.h> +#include <asm/xen/hypercall.h> + +#include <xen/events.h> +#include <xen/interface/xen.h> +#include <xen/interface/vcpu.h> + +#include "xen-ops.h" + +#define XEN_SHIFT 22 + +/* Xen may fire a timer up to this many ns early */ +#define TIMER_SLOP 100000 +#define NS_PER_TICK (1000000000LL / HZ) + +static cycle_t xen_clocksource_read(void); + +/* These are perodically updated in shared_info, and then copied here. */ +struct shadow_time_info { + u64 tsc_timestamp; /* TSC at last update of time vals. */ + u64 system_timestamp; /* Time, in nanosecs, since boot. */ + u32 tsc_to_nsec_mul; + int tsc_shift; + u32 version; +}; + +static DEFINE_PER_CPU(struct shadow_time_info, shadow_time); + +/* runstate info updated by Xen */ +static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); + +/* snapshots of runstate info */ +static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot); + +/* unused ns of stolen and blocked time */ +static DEFINE_PER_CPU(u64, residual_stolen); +static DEFINE_PER_CPU(u64, residual_blocked); + +/* return an consistent snapshot of 64-bit time/counter value */ +static u64 get64(const u64 *p) +{ + u64 ret; + + if (BITS_PER_LONG < 64) { + u32 *p32 = (u32 *)p; + u32 h, l; + + /* + * Read high then low, and then make sure high is + * still the same; this will only loop if low wraps + * and carries into high. + * XXX some clean way to make this endian-proof? + */ + do { + h = p32[1]; + barrier(); + l = p32[0]; + barrier(); + } while (p32[1] != h); + + ret = (((u64)h) << 32) | l; + } else + ret = *p; + + return ret; +} + +/* + * Runstate accounting + */ +static void get_runstate_snapshot(struct vcpu_runstate_info *res) +{ + u64 state_time; + struct vcpu_runstate_info *state; + + BUG_ON(preemptible()); + + state = &__get_cpu_var(runstate); + + /* + * The runstate info is always updated by the hypervisor on + * the current CPU, so there's no need to use anything + * stronger than a compiler barrier when fetching it. + */ + do { + state_time = get64(&state->state_entry_time); + barrier(); + *res = *state; + barrier(); + } while (get64(&state->state_entry_time) != state_time); +} + +static void setup_runstate_info(int cpu) +{ + struct vcpu_register_runstate_memory_area area; + + area.addr.v = &per_cpu(runstate, cpu); + + if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, + cpu, &area)) + BUG(); +} + +static void do_stolen_accounting(void) +{ + struct vcpu_runstate_info state; + struct vcpu_runstate_info *snap; + s64 blocked, runnable, offline, stolen; + cputime_t ticks; + + get_runstate_snapshot(&state); + + WARN_ON(state.state != RUNSTATE_running); + + snap = &__get_cpu_var(runstate_snapshot); + + /* work out how much time the VCPU has not been runn*ing* */ + blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked]; + runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable]; + offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline]; + + *snap = state; + + /* Add the appropriate number of ticks of stolen time, + including any left-overs from last time. Passing NULL to + account_steal_time accounts the time as stolen. */ + stolen = runnable + offline + __get_cpu_var(residual_stolen); + + if (stolen < 0) + stolen = 0; + + ticks = 0; + while (stolen >= NS_PER_TICK) { + ticks++; + stolen -= NS_PER_TICK; + } + __get_cpu_var(residual_stolen) = stolen; + account_steal_time(NULL, ticks); + + /* Add the appropriate number of ticks of blocked time, + including any left-overs from last time. Passing idle to + account_steal_time accounts the time as idle/wait. */ + blocked += __get_cpu_var(residual_blocked); + + if (blocked < 0) + blocked = 0; + + ticks = 0; + while (blocked >= NS_PER_TICK) { + ticks++; + blocked -= NS_PER_TICK; + } + __get_cpu_var(residual_blocked) = blocked; + account_steal_time(idle_task(smp_processor_id()), ticks); +} + +/* + * Xen sched_clock implementation. Returns the number of unstolen + * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED + * states. + */ +unsigned long long xen_sched_clock(void) +{ + struct vcpu_runstate_info state; + cycle_t now; + u64 ret; + s64 offset; + + /* + * Ideally sched_clock should be called on a per-cpu basis + * anyway, so preempt should already be disabled, but that's + * not current practice at the moment. + */ + preempt_disable(); + + now = xen_clocksource_read(); + + get_runstate_snapshot(&state); + + WARN_ON(state.state != RUNSTATE_running); + + offset = now - state.state_entry_time; + if (offset < 0) + offset = 0; + + ret = state.time[RUNSTATE_blocked] + + state.time[RUNSTATE_running] + + offset; + + preempt_enable(); + + return ret; +} + + +/* Get the CPU speed from Xen */ +unsigned long xen_cpu_khz(void) +{ + u64 cpu_khz = 1000000ULL << 32; + const struct vcpu_time_info *info = + &HYPERVISOR_shared_info->vcpu_info[0].time; + + do_div(cpu_khz, info->tsc_to_system_mul); + if (info->tsc_shift < 0) + cpu_khz <<= -info->tsc_shift; + else + cpu_khz >>= info->tsc_shift; + + return cpu_khz; +} + +/* + * Reads a consistent set of time-base values from Xen, into a shadow data + * area. + */ +static unsigned get_time_values_from_xen(void) +{ + struct vcpu_time_info *src; + struct shadow_time_info *dst; + + /* src is shared memory with the hypervisor, so we need to + make sure we get a consistent snapshot, even in the face of + being preempted. */ + src = &__get_cpu_var(xen_vcpu)->time; + dst = &__get_cpu_var(shadow_time); + + do { + dst->version = src->version; + rmb(); /* fetch version before data */ + dst->tsc_timestamp = src->tsc_timestamp; + dst->system_timestamp = src->system_time; + dst->tsc_to_nsec_mul = src->tsc_to_system_mul; + dst->tsc_shift = src->tsc_shift; + rmb(); /* test version after fetching data */ + } while ((src->version & 1) | (dst->version ^ src->version)); + + return dst->version; +} + +/* + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, + * yielding a 64-bit result. + */ +static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) +{ + u64 product; +#ifdef __i386__ + u32 tmp1, tmp2; +#endif + + if (shift < 0) + delta >>= -shift; + else + delta <<= shift; + +#ifdef __i386__ + __asm__ ( + "mul %5 ; " + "mov %4,%%eax ; " + "mov %%edx,%4 ; " + "mul %5 ; " + "xor %5,%5 ; " + "add %4,%%eax ; " + "adc %5,%%edx ; " + : "=A" (product), "=r" (tmp1), "=r" (tmp2) + : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); +#elif __x86_64__ + __asm__ ( + "mul %%rdx ; shrd $32,%%rdx,%%rax" + : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); +#else +#error implement me! +#endif + + return product; +} + +static u64 get_nsec_offset(struct shadow_time_info *shadow) +{ + u64 now, delta; + now = native_read_tsc(); + delta = now - shadow->tsc_timestamp; + return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); +} + +static cycle_t xen_clocksource_read(void) +{ + struct shadow_time_info *shadow = &get_cpu_var(shadow_time); + cycle_t ret; + unsigned version; + + do { + version = get_time_values_from_xen(); + barrier(); + ret = shadow->system_timestamp + get_nsec_offset(shadow); + barrier(); + } while (version != __get_cpu_var(xen_vcpu)->time.version); + + put_cpu_var(shadow_time); + + return ret; +} + +static void xen_read_wallclock(struct timespec *ts) +{ + const struct shared_info *s = HYPERVISOR_shared_info; + u32 version; + u64 delta; + struct timespec now; + + /* get wallclock at system boot */ + do { + version = s->wc_version; + rmb(); /* fetch version before time */ + now.tv_sec = s->wc_sec; + now.tv_nsec = s->wc_nsec; + rmb(); /* fetch time before checking version */ + } while ((s->wc_version & 1) | (version ^ s->wc_version)); + + delta = xen_clocksource_read(); /* time since system boot */ + delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; + + now.tv_nsec = do_div(delta, NSEC_PER_SEC); + now.tv_sec = delta; + + set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); +} + +unsigned long xen_get_wallclock(void) +{ + struct timespec ts; + + xen_read_wallclock(&ts); + + return ts.tv_sec; +} + +int xen_set_wallclock(unsigned long now) +{ + /* do nothing for domU */ + return -1; +} + +static struct clocksource xen_clocksource __read_mostly = { + .name = "xen", + .rating = 400, + .read = xen_clocksource_read, + .mask = ~0, + .mult = 1<<XEN_SHIFT, /* time directly in nanoseconds */ + .shift = XEN_SHIFT, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +/* + Xen clockevent implementation + + Xen has two clockevent implementations: + + The old timer_op one works with all released versions of Xen prior + to version 3.0.4. This version of the hypervisor provides a + single-shot timer with nanosecond resolution. However, sharing the + same event channel is a 100Hz tick which is delivered while the + vcpu is running. We don't care about or use this tick, but it will + cause the core time code to think the timer fired too soon, and + will end up resetting it each time. It could be filtered, but + doing so has complications when the ktime clocksource is not yet + the xen clocksource (ie, at boot time). + + The new vcpu_op-based timer interface allows the tick timer period + to be changed or turned off. The tick timer is not useful as a + periodic timer because events are only delivered to running vcpus. + The one-shot timer can report when a timeout is in the past, so + set_next_event is capable of returning -ETIME when appropriate. + This interface is used when available. +*/ + + +/* + Get a hypervisor absolute time. In theory we could maintain an + offset between the kernel's time and the hypervisor's time, and + apply that to a kernel's absolute timeout. Unfortunately the + hypervisor and kernel times can drift even if the kernel is using + the Xen clocksource, because ntp can warp the kernel's clocksource. +*/ +static s64 get_abs_timeout(unsigned long delta) +{ + return xen_clocksource_read() + delta; +} + +static void xen_timerop_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + /* unsupported */ + WARN_ON(1); + break; + + case CLOCK_EVT_MODE_ONESHOT: + case CLOCK_EVT_MODE_RESUME: + break; + + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + HYPERVISOR_set_timer_op(0); /* cancel timeout */ + break; + } +} + +static int xen_timerop_set_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT); + + if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0) + BUG(); + + /* We may have missed the deadline, but there's no real way of + knowing for sure. If the event was in the past, then we'll + get an immediate interrupt. */ + + return 0; +} + +static const struct clock_event_device xen_timerop_clockevent = { + .name = "xen", + .features = CLOCK_EVT_FEAT_ONESHOT, + + .max_delta_ns = 0xffffffff, + .min_delta_ns = TIMER_SLOP, + + .mult = 1, + .shift = 0, + .rating = 500, + + .set_mode = xen_timerop_set_mode, + .set_next_event = xen_timerop_set_next_event, +}; + + + +static void xen_vcpuop_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + int cpu = smp_processor_id(); + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + WARN_ON(1); /* unsupported */ + break; + + case CLOCK_EVT_MODE_ONESHOT: + if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL)) + BUG(); + break; + + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) || + HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL)) + BUG(); + break; + case CLOCK_EVT_MODE_RESUME: + break; + } +} + +static int xen_vcpuop_set_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + int cpu = smp_processor_id(); + struct vcpu_set_singleshot_timer single; + int ret; + + WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT); + + single.timeout_abs_ns = get_abs_timeout(delta); + single.flags = VCPU_SSHOTTMR_future; + + ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single); + + BUG_ON(ret != 0 && ret != -ETIME); + + return ret; +} + +static const struct clock_event_device xen_vcpuop_clockevent = { + .name = "xen", + .features = CLOCK_EVT_FEAT_ONESHOT, + + .max_delta_ns = 0xffffffff, + .min_delta_ns = TIMER_SLOP, + + .mult = 1, + .shift = 0, + .rating = 500, + + .set_mode = xen_vcpuop_set_mode, + .set_next_event = xen_vcpuop_set_next_event, +}; + +static const struct clock_event_device *xen_clockevent = + &xen_timerop_clockevent; +static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events); + +static irqreturn_t xen_timer_interrupt(int irq, void *dev_id) +{ + struct clock_event_device *evt = &__get_cpu_var(xen_clock_events); + irqreturn_t ret; + + ret = IRQ_NONE; + if (evt->event_handler) { + evt->event_handler(evt); + ret = IRQ_HANDLED; + } + + do_stolen_accounting(); + + return ret; +} + +void xen_setup_timer(int cpu) +{ + const char *name; + struct clock_event_device *evt; + int irq; + + printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu); + + name = kasprintf(GFP_KERNEL, "timer%d", cpu); + if (!name) + name = "<timer kasprintf failed>"; + + irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt, + IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + name, NULL); + + evt = &per_cpu(xen_clock_events, cpu); + memcpy(evt, xen_clockevent, sizeof(*evt)); + + evt->cpumask = cpumask_of_cpu(cpu); + evt->irq = irq; + + setup_runstate_info(cpu); +} + +void xen_setup_cpu_clockevents(void) +{ + BUG_ON(preemptible()); + + clockevents_register_device(&__get_cpu_var(xen_clock_events)); +} + +__init void xen_time_init(void) +{ + int cpu = smp_processor_id(); + + get_time_values_from_xen(); + + clocksource_register(&xen_clocksource); + + if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { + /* Successfully turned off 100Hz tick, so we have the + vcpuop-based timer interface */ + printk(KERN_DEBUG "Xen: using vcpuop timer interface\n"); + xen_clockevent = &xen_vcpuop_clockevent; + } + + /* Set initial system time with full resolution */ + xen_read_wallclock(&xtime); + set_normalized_timespec(&wall_to_monotonic, + -xtime.tv_sec, -xtime.tv_nsec); + + tsc_disable = 0; + + xen_setup_timer(cpu); + xen_setup_cpu_clockevents(); +} diff --git a/arch/i386/xen/vdso.h b/arch/i386/xen/vdso.h new file mode 100644 index 00000000000..861fedfe523 --- /dev/null +++ b/arch/i386/xen/vdso.h @@ -0,0 +1,4 @@ +/* Bit used for the pseudo-hwcap for non-negative segments. We use + bit 1 to avoid bugs in some versions of glibc when bit 0 is + used; the choice is otherwise arbitrary. */ +#define VDSO_NOTE_NONEGSEG_BIT 1 diff --git a/arch/i386/xen/xen-asm.S b/arch/i386/xen/xen-asm.S new file mode 100644 index 00000000000..1a43b60c0c6 --- /dev/null +++ b/arch/i386/xen/xen-asm.S @@ -0,0 +1,291 @@ +/* + Asm versions of Xen pv-ops, suitable for either direct use or inlining. + The inline versions are the same as the direct-use versions, with the + pre- and post-amble chopped off. + + This code is encoded for size rather than absolute efficiency, + with a view to being able to inline as much as possible. + + We only bother with direct forms (ie, vcpu in pda) of the operations + here; the indirect forms are better handled in C, since they're + generally too large to inline anyway. + */ + +#include <linux/linkage.h> + +#include <asm/asm-offsets.h> +#include <asm/thread_info.h> +#include <asm/percpu.h> +#include <asm/processor-flags.h> +#include <asm/segment.h> + +#include <xen/interface/xen.h> + +#define RELOC(x, v) .globl x##_reloc; x##_reloc=v +#define ENDPATCH(x) .globl x##_end; x##_end=. + +/* Pseudo-flag used for virtual NMI, which we don't implement yet */ +#define XEN_EFLAGS_NMI 0x80000000 + +/* + Enable events. This clears the event mask and tests the pending + event status with one and operation. If there are pending + events, then enter the hypervisor to get them handled. + */ +ENTRY(xen_irq_enable_direct) + /* Clear mask and test pending */ + andw $0x00ff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending + /* Preempt here doesn't matter because that will deal with + any pending interrupts. The pending check may end up being + run on the wrong CPU, but that doesn't hurt. */ + jz 1f +2: call check_events +1: +ENDPATCH(xen_irq_enable_direct) + ret + ENDPROC(xen_irq_enable_direct) + RELOC(xen_irq_enable_direct, 2b+1) + + +/* + Disabling events is simply a matter of making the event mask + non-zero. + */ +ENTRY(xen_irq_disable_direct) + movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask +ENDPATCH(xen_irq_disable_direct) + ret + ENDPROC(xen_irq_disable_direct) + RELOC(xen_irq_disable_direct, 0) + +/* + (xen_)save_fl is used to get the current interrupt enable status. + Callers expect the status to be in X86_EFLAGS_IF, and other bits + may be set in the return value. We take advantage of this by + making sure that X86_EFLAGS_IF has the right value (and other bits + in that byte are 0), but other bits in the return value are + undefined. We need to toggle the state of the bit, because + Xen and x86 use opposite senses (mask vs enable). + */ +ENTRY(xen_save_fl_direct) + testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask + setz %ah + addb %ah,%ah +ENDPATCH(xen_save_fl_direct) + ret + ENDPROC(xen_save_fl_direct) + RELOC(xen_save_fl_direct, 0) + + +/* + In principle the caller should be passing us a value return + from xen_save_fl_direct, but for robustness sake we test only + the X86_EFLAGS_IF flag rather than the whole byte. After + setting the interrupt mask state, it checks for unmasked + pending events and enters the hypervisor to get them delivered + if so. + */ +ENTRY(xen_restore_fl_direct) + testb $X86_EFLAGS_IF>>8, %ah + setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask + /* Preempt here doesn't matter because that will deal with + any pending interrupts. The pending check may end up being + run on the wrong CPU, but that doesn't hurt. */ + + /* check for unmasked and pending */ + cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending + jz 1f +2: call check_events +1: +ENDPATCH(xen_restore_fl_direct) + ret + ENDPROC(xen_restore_fl_direct) + RELOC(xen_restore_fl_direct, 2b+1) + +/* + This is run where a normal iret would be run, with the same stack setup: + 8: eflags + 4: cs + esp-> 0: eip + + This attempts to make sure that any pending events are dealt + with on return to usermode, but there is a small window in + which an event can happen just before entering usermode. If + the nested interrupt ends up setting one of the TIF_WORK_MASK + pending work flags, they will not be tested again before + returning to usermode. This means that a process can end up + with pending work, which will be unprocessed until the process + enters and leaves the kernel again, which could be an + unbounded amount of time. This means that a pending signal or + reschedule event could be indefinitely delayed. + + The fix is to notice a nested interrupt in the critical + window, and if one occurs, then fold the nested interrupt into + the current interrupt stack frame, and re-process it + iteratively rather than recursively. This means that it will + exit via the normal path, and all pending work will be dealt + with appropriately. + + Because the nested interrupt handler needs to deal with the + current stack state in whatever form its in, we keep things + simple by only using a single register which is pushed/popped + on the stack. + + Non-direct iret could be done in the same way, but it would + require an annoying amount of code duplication. We'll assume + that direct mode will be the common case once the hypervisor + support becomes commonplace. + */ +ENTRY(xen_iret_direct) + /* test eflags for special cases */ + testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp) + jnz hyper_iret + + push %eax + ESP_OFFSET=4 # bytes pushed onto stack + + /* Store vcpu_info pointer for easy access. Do it this + way to avoid having to reload %fs */ +#ifdef CONFIG_SMP + GET_THREAD_INFO(%eax) + movl TI_cpu(%eax),%eax + movl __per_cpu_offset(,%eax,4),%eax + lea per_cpu__xen_vcpu_info(%eax),%eax +#else + movl $per_cpu__xen_vcpu_info, %eax +#endif + + /* check IF state we're restoring */ + testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp) + + /* Maybe enable events. Once this happens we could get a + recursive event, so the critical region starts immediately + afterwards. However, if that happens we don't end up + resuming the code, so we don't have to be worried about + being preempted to another CPU. */ + setz XEN_vcpu_info_mask(%eax) +xen_iret_start_crit: + + /* check for unmasked and pending */ + cmpw $0x0001, XEN_vcpu_info_pending(%eax) + + /* If there's something pending, mask events again so we + can jump back into xen_hypervisor_callback */ + sete XEN_vcpu_info_mask(%eax) + + popl %eax + + /* From this point on the registers are restored and the stack + updated, so we don't need to worry about it if we're preempted */ +iret_restore_end: + + /* Jump to hypervisor_callback after fixing up the stack. + Events are masked, so jumping out of the critical + region is OK. */ + je xen_hypervisor_callback + + iret +xen_iret_end_crit: + +hyper_iret: + /* put this out of line since its very rarely used */ + jmp hypercall_page + __HYPERVISOR_iret * 32 + + .globl xen_iret_start_crit, xen_iret_end_crit + +/* + This is called by xen_hypervisor_callback in entry.S when it sees + that the EIP at the time of interrupt was between xen_iret_start_crit + and xen_iret_end_crit. We're passed the EIP in %eax so we can do + a more refined determination of what to do. + + The stack format at this point is: + ---------------- + ss : (ss/esp may be present if we came from usermode) + esp : + eflags } outer exception info + cs } + eip } + ---------------- <- edi (copy dest) + eax : outer eax if it hasn't been restored + ---------------- + eflags } nested exception info + cs } (no ss/esp because we're nested + eip } from the same ring) + orig_eax }<- esi (copy src) + - - - - - - - - + fs } + es } + ds } SAVE_ALL state + eax } + : : + ebx } + ---------------- + return addr <- esp + ---------------- + + In order to deliver the nested exception properly, we need to shift + everything from the return addr up to the error code so it + sits just under the outer exception info. This means that when we + handle the exception, we do it in the context of the outer exception + rather than starting a new one. + + The only caveat is that if the outer eax hasn't been + restored yet (ie, it's still on stack), we need to insert + its value into the SAVE_ALL state before going on, since + it's usermode state which we eventually need to restore. + */ +ENTRY(xen_iret_crit_fixup) + /* offsets +4 for return address */ + + /* + Paranoia: Make sure we're really coming from userspace. + One could imagine a case where userspace jumps into the + critical range address, but just before the CPU delivers a GP, + it decides to deliver an interrupt instead. Unlikely? + Definitely. Easy to avoid? Yes. The Intel documents + explicitly say that the reported EIP for a bad jump is the + jump instruction itself, not the destination, but some virtual + environments get this wrong. + */ + movl PT_CS+4(%esp), %ecx + andl $SEGMENT_RPL_MASK, %ecx + cmpl $USER_RPL, %ecx + je 2f + + lea PT_ORIG_EAX+4(%esp), %esi + lea PT_EFLAGS+4(%esp), %edi + + /* If eip is before iret_restore_end then stack + hasn't been restored yet. */ + cmp $iret_restore_end, %eax + jae 1f + + movl 0+4(%edi),%eax /* copy EAX */ + movl %eax, PT_EAX+4(%esp) + + lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */ + + /* set up the copy */ +1: std + mov $(PT_EIP+4) / 4, %ecx /* copy ret+saved regs up to orig_eax */ + rep movsl + cld + + lea 4(%edi),%esp /* point esp to new frame */ +2: ret + + +/* + Force an event check by making a hypercall, + but preserve regs before making the call. + */ +check_events: + push %eax + push %ecx + push %edx + call force_evtchn_callback + pop %edx + pop %ecx + pop %eax + ret diff --git a/arch/i386/xen/xen-head.S b/arch/i386/xen/xen-head.S new file mode 100644 index 00000000000..bc71f3bc401 --- /dev/null +++ b/arch/i386/xen/xen-head.S @@ -0,0 +1,38 @@ +/* Xen-specific pieces of head.S, intended to be included in the right + place in head.S */ + +#ifdef CONFIG_XEN + +#include <linux/elfnote.h> +#include <asm/boot.h> +#include <xen/interface/elfnote.h> + + .section .init.text +ENTRY(startup_xen) + movl %esi,xen_start_info + cld + movl $(init_thread_union+THREAD_SIZE),%esp + jmp xen_start_kernel + +.pushsection ".bss.page_aligned" + .align PAGE_SIZE_asm +ENTRY(hypercall_page) + .skip 0x1000 +.popsection + + .section .text + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") + ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET) + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen) + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") +#ifdef CONFIG_X86_PAE + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") +#else + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no") +#endif + ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") + +#endif /*CONFIG_XEN */ diff --git a/arch/i386/xen/xen-ops.h b/arch/i386/xen/xen-ops.h new file mode 100644 index 00000000000..b9aaea45f07 --- /dev/null +++ b/arch/i386/xen/xen-ops.h @@ -0,0 +1,71 @@ +#ifndef XEN_OPS_H +#define XEN_OPS_H + +#include <linux/init.h> + +/* These are code, but not functions. Defined in entry.S */ +extern const char xen_hypervisor_callback[]; +extern const char xen_failsafe_callback[]; + +void xen_copy_trap_info(struct trap_info *traps); + +DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); +DECLARE_PER_CPU(unsigned long, xen_cr3); + +extern struct start_info *xen_start_info; +extern struct shared_info *HYPERVISOR_shared_info; + +char * __init xen_memory_setup(void); +void __init xen_arch_setup(void); +void __init xen_init_IRQ(void); + +void xen_setup_timer(int cpu); +void xen_setup_cpu_clockevents(void); +unsigned long xen_cpu_khz(void); +void __init xen_time_init(void); +unsigned long xen_get_wallclock(void); +int xen_set_wallclock(unsigned long time); +unsigned long long xen_sched_clock(void); + +void xen_mark_init_mm_pinned(void); + +DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode); + +static inline unsigned xen_get_lazy_mode(void) +{ + return x86_read_percpu(xen_lazy_mode); +} + +void __init xen_fill_possible_map(void); + +void __init xen_setup_vcpu_info_placement(void); +void xen_smp_prepare_boot_cpu(void); +void xen_smp_prepare_cpus(unsigned int max_cpus); +int xen_cpu_up(unsigned int cpu); +void xen_smp_cpus_done(unsigned int max_cpus); + +void xen_smp_send_stop(void); +void xen_smp_send_reschedule(int cpu); +int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic, + int wait); +int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info, + int nonatomic, int wait); + +int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), + void *info, int wait); + + +/* Declare an asm function, along with symbols needed to make it + inlineable */ +#define DECL_ASM(ret, name, ...) \ + ret name(__VA_ARGS__); \ + extern char name##_end[]; \ + extern char name##_reloc[] \ + +DECL_ASM(void, xen_irq_enable_direct, void); +DECL_ASM(void, xen_irq_disable_direct, void); +DECL_ASM(unsigned long, xen_save_fl_direct, void); +DECL_ASM(void, xen_restore_fl_direct, unsigned long); + +void xen_iret_direct(void); +#endif /* XEN_OPS_H */ |